# 03_binning.ipynb

This notebook applies three binning techniques (equal-width, quantile-based, and KMeans) to the `TotalPrice` feature, producing discretized features intended to improve model performance.

In [None]:
import pandas as pd

# Read the cleaned dataset and derive TotalPrice as Quantity * UnitPrice
data = pd.read_csv('../data/processed/cleaned_data.csv').assign(
    TotalPrice=lambda frame: frame['Quantity'] * frame['UnitPrice']
)
data.head()

## Equal-Width Binning

In [None]:
# Split the TotalPrice range into 5 equal-width intervals and keep the integer bin codes
equal_width_codes = pd.cut(x=data['TotalPrice'], bins=5, labels=False)
data['TotalPriceBin_EqualWidth'] = equal_width_codes

# Row count per bin, ordered by bin code
data['TotalPriceBin_EqualWidth'].value_counts().sort_index()

## Quantile-Based Binning

In [None]:
# Cut TotalPrice at its quintiles and keep the integer bin codes.
# duplicates='drop' removes repeated bin edges, so fewer than 5 bins may result.
quantile_codes = pd.qcut(x=data['TotalPrice'], q=5, labels=False, duplicates='drop')
data['TotalPriceBin_Quantile'] = quantile_codes

# Row count per bin, ordered by bin code
data['TotalPriceBin_Quantile'].value_counts().sort_index()

## KMeans Binning

In [None]:
from sklearn.cluster import KMeans

# Cluster the one-dimensional TotalPrice values into 5 groups.
# n_init is set explicitly so the result does not depend on the default of the
# installed scikit-learn version; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
cluster_ids = kmeans.fit_predict(data[['TotalPrice']])

# KMeans cluster ids are arbitrary, so the raw labels are not ordered by price.
# Rank the cluster centres and remap each id to that rank, so that bin 0 is the
# cluster with the lowest centre and the highest bin is the one with the
# highest centre -- the same ordinal meaning as the other bin columns.
center_rank = (
    pd.Series(kmeans.cluster_centers_.ravel())
    .rank(method='first')  # 'first' breaks ties so every cluster gets a distinct rank
    .sub(1)
    .astype(int)
)
data['TotalPriceBin_KMeans'] = pd.Series(cluster_ids, index=data.index).map(center_rank)

# Row count per bin, ordered by bin code
data['TotalPriceBin_KMeans'].value_counts().sort_index()

## Visualization of Binning Methods

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one histogram of TotalPrice per binning method, coloured by the bin
# each row was assigned to. Panel title -> bin column to use as hue.
bin_columns = {
    'Equal-Width Binning': 'TotalPriceBin_EqualWidth',
    'Quantile Binning': 'TotalPriceBin_Quantile',
    'KMeans Binning': 'TotalPriceBin_KMeans',
}

fig, axes = plt.subplots(1, len(bin_columns), figsize=(18, 5))

for ax, (title, bin_column) in zip(axes, bin_columns.items()):
    # Use the long-form API (data= plus column names). Passing the Series
    # positionally makes seaborn treat it as wide-form data, which rejects
    # `hue` with a ValueError.
    sns.histplot(
        data=data,
        x='TotalPrice',
        hue=bin_column,
        bins=30,
        kde=False,
        palette='tab10',
        legend=False,
        ax=ax,
    )
    ax.set_title(title)

fig.tight_layout()
plt.show()

## Save Binned Data

In [None]:
# Persist the frame, including the three bin columns, without the index column
binned_csv_path = '../data/processed/binned_features.csv'
data.to_csv(binned_csv_path, index=False)
print("Binned features saved to ../data/processed/binned_features.csv")