In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import KBinsDiscretizer

In [14]:
# Load only the three columns needed for the binning demo.
# The path is a raw string so the Windows backslashes stay literal; the
# previous mix of '\\' with bare '\F' and '\1' relied on invalid escape
# sequences and triggered a SyntaxWarning.
# TODO: replace this absolute local path with one relative to the project.
df = pd.read_csv(
    r'D:\100 days of ML\Feature_Engineering\bining.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
df.head()

  df = pd.read_csv('D:\\100 days of ML\Feature_Engineering\\bining.csv',usecols=['Age','Fare','Survived'])


Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [15]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Age       714 non-null    float64
 2   Fare      891 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [16]:
# Count the missing values in each column.
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [17]:
# Drop every row with a missing value. Rebinding (instead of inplace=True)
# avoids mutating the frame that earlier cells already displayed.
# The original row labels are kept (the index is not reset).
df = df.dropna()

In [19]:
# Confirm how many rows and columns remain after dropping missing values.
df.shape

(714, 3)

In [20]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [21]:
# Select features and target by column name rather than by position, so the
# split does not silently depend on the column order of the CSV file.
X = df[['Age', 'Fare']]
Y = df['Survived']

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1
719,33.0,7.775
666,25.0,13.0


In [32]:
# Baseline: decision tree trained on the raw (un-binned) Age and Fare values.
# random_state pins the tree's internal randomness so the reported accuracy
# is the same on every Restart & Run All.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, Y_train)

# Accuracy on the held-out test split.
y_pred = clf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.6223776223776224

In [34]:
# Mean 10-fold cross-validated accuracy of the baseline tree on the raw features.
# The estimator is seeded so the score is reproducible across runs.
np.mean(
    cross_val_score(
        DecisionTreeClassifier(random_state=42),
        X,
        Y,
        cv=10,
        scoring='accuracy',
    )
)

np.float64(0.6289319248826291)

In [35]:
# Age and Fare share the same settings: 15 quantile-based bins, each encoded
# as an ordinal integer. A separate discretizer is created per feature.
binning_params = dict(n_bins=15, encode='ordinal', strategy='quantile')
K_Age = KBinsDiscretizer(**binning_params)
K_fare = KBinsDiscretizer(**binning_params)

In [36]:
# Route each feature to its own discretizer. Columns are referenced by name
# (the input is a DataFrame) so the mapping does not depend on column order.
# Output column 0 is the binned Age, output column 1 is the binned Fare.
trf = ColumnTransformer([
    ('K_Age', K_Age, ['Age']),
    ('K_fare', K_fare, ['Fare'])
])

In [38]:
# Fit the transformer on the training split only, then reuse it unchanged
# on the test split.
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

# Show the bin edges the fitted Age discretizer learned.
age_binner = trf.named_transformers_['K_Age']
age_binner.bin_edges_



array([array([ 0.42,  6.  , 16.  , 19.  , 21.  , 23.  , 25.  , 28.  , 30.  ,
              32.  , 35.  , 38.  , 42.  , 47.  , 54.  , 80.  ])             ],
      dtype=object)

In [39]:
# Show the bin edges learned by the fitted 'K_fare' discretizer.
trf.named_transformers_['K_fare'].bin_edges_


array([array([  0.    ,   7.25  ,   7.775 ,   7.8958,   8.1583,  10.5   ,
               13.    ,  14.4542,  18.75  ,  26.    ,  26.55  ,  31.275 ,
               51.4792,  76.2917, 108.9   , 512.3292])                   ],
      dtype=object)

In [43]:
# Side-by-side view of each raw training value and its bin index.
# Start from the raw columns (keeping the X_train index), then slot each
# transformed column in right after its source column.
output = X_train[['Age', 'Fare']].copy()
output.insert(1, 'Age_trf', X_train_trf[:, 0])
output.insert(3, 'Fare_trf', X_train_trf[:, 1])

In [44]:
# Show five random rows; the seed keeps the displayed sample repeatable.
output.sample(5, random_state=42)

Unnamed: 0,Age,Age_trf,Fare,Fare_trf
734,23.0,5.0,13.0,6.0
399,28.0,7.0,12.65,5.0
309,30.0,8.0,56.9292,12.0
222,51.0,13.0,8.05,3.0
515,47.0,13.0,34.0208,11.0


In [45]:
# Retrain the tree on the binned features. random_state keeps the result
# reproducible and comparable across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_trf, Y_train)
y_pred = clf.predict(X_test_trf)

In [47]:
# Accuracy on the test split for the predictions currently held in y_pred.
accuracy_score(Y_test,y_pred)

0.6363636363636364

# 📦 Binning in Machine Learning

## 📝 What is Binning (Simple Words)?
Binning = **putting numbers into buckets** instead of using the raw numbers.  

Example with ages:  
18, 21, 24, 32, 45, 67


We create **bins** (buckets):  
- **Bin 1**: 0–18 → “Child/Teen”  
- **Bin 2**: 19–40 → “Adult”  
- **Bin 3**: 41–60 → “Middle Age”  
- **Bin 4**: 61+ → “Senior”  

👉 Now each age is replaced with its bin → simpler and easier for some ML models.

---

## 🤔 Why ML Engineers Use Binning
- ✅ **Smooth noisy data** (remove small random fluctuations)  
- ✅ **Make features interpretable** (e.g., “income group” instead of exact income)  
- ✅ **Help models that work with discrete inputs** (e.g., Naïve Bayes; decision trees can split raw numbers on their own but sometimes benefit too)  
- ✅ **Highlight non-linear patterns** (like risk groups in credit scoring)  

⚠️ **Risks of Binning**
- Too few bins → lose important details  
- Too many bins → no simplification benefit  
- Wrong bin boundaries → misleading results  

---

## 🚀 Best Practices (ML Engineer Style)
1. **Decide based on data distribution** → use histograms to find natural cut points  
2. **Use quantile binning for balanced bins** → ensures each bin has a similar number of samples  
3. **Try automated binning** → tools like `pandas.cut()` or `KBinsDiscretizer` in `scikit-learn`  
4. **Validate with cross-validation** → always check if binning improves model performance  

---

## 🧠 Shortcut Definition
> “Binning = turning a continuous, messy feature into a small number of simpler categories that highlight patterns.”

