# There are two common Techniques for Numerical Encoding

* ## 1 Discretization / Binning
* ## 2 Binarization

## In applied mathematics, discretization is the process of transferring continuous functions, models, variables, and equations into discrete counterparts.

# Use of Discretization

* ## To Handle Outliers
* ## To Improve the value spread

# Types of Discretization
* # Unsupervised
    * ## Three Types Of Unsupervised Discretization
        * ### Equal Width / Uniform
        * ### Equal Frequency / Quantile
        * ### K-Means
        <br>
* # Supervised
    * ## Only One Type of Supervised Discretization
        * ### Decision Tree 
* # Custom
<br>

# Unsupervised Discretization

## Equal Width / Uniform

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import KBinsDiscretizer

In [52]:
# Load only the columns this notebook works with. The resulting frame follows
# the CSV's own column order (Survived, Age, Fare), not the order listed here.
titanic_cols = ["Age", "Fare", "Survived"]
data = pd.read_csv("train.csv", usecols=titanic_cols)
data.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


# Drop Missing Values

In [53]:
# Row/column count before rows with missing values are removed.
data.shape

(891, 3)

In [54]:
# Remove every row that has a missing value in any of the loaded columns.
# Rebinding the name (rather than inplace=True) makes the state change explicit
# in the cell and avoids silently mutating a frame an earlier cell displayed.
data = data.dropna()

In [55]:
# Row/column count after dropping rows with missing values.
data.shape

(714, 3)

In [56]:
# Pick features and target by column label instead of position, so the split
# does not silently depend on the column order of the CSV file.
# Feature order (Age, Fare) is kept because later cells address the columns
# of `x` by position (0 = Age, 1 = Fare).
x = data[["Age", "Fare"]]
y = data["Survived"]

In [57]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
# NOTE: `ytarin` (sic) is the training target; the spelling is kept because
# the model-fitting cell refers to it by this name.
xtrain, xtest, ytarin, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit Model

In [58]:
# Baseline model on the raw (un-binned) Age and Fare features.
# A fixed random_state makes the fitted tree, and therefore the accuracy
# reported below, reproducible across kernel restarts.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(xtrain, ytarin)

# Prediction

In [91]:
# Predict labels for the held-out test features with the fitted baseline tree.
ypred = clf.predict(xtest)

# Accuracy

In [92]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=ytest, y_pred=ypred)

0.6433566433566433

# Cross-Validation

In [93]:
# 10-fold cross-validated accuracy of the baseline tree on the full cleaned data.
# The classifier is seeded so the reported mean does not change from run to run.
cv_scores = cross_val_score(
    DecisionTreeClassifier(random_state=42),
    x,
    y,
    scoring="accuracy",
    cv=10,
)
np.mean(cv_scores)

0.624706572769953

# Apply Equal Width / Uniform

In [110]:
# Both features get the same equal-width setup: 15 bins, encoded as ordinal
# bin indices. One discretizer object is created per feature.
uniform_kwargs = dict(n_bins=15, strategy="uniform", encode="ordinal")
kbin_age = KBinsDiscretizer(**uniform_kwargs)
kbin_fare = KBinsDiscretizer(**uniform_kwargs)

In [111]:
# Apply a separate discretizer to each feature column of `x`.
# Column 0 is Age and column 1 is Fare.
# Fix: "f2" previously reused kbin_age, leaving kbin_fare unused; each feature
# now gets the discretizer that was created for it.
trf = ColumnTransformer(
    [
        ("f1", kbin_age, [0]),   # Age
        ("f2", kbin_fare, [1]),  # Fare
    ]
)

In [112]:
# Learn the bin edges from the training features only, then reuse those same
# edges to encode the test features.
XTR = trf.fit_transform(X=xtrain)
XTS = trf.transform(X=xtest)

# Check Transform

In [113]:
# Fitted transformers, keyed by the names given to the ColumnTransformer.
trf.named_transformers_

{'f1': KBinsDiscretizer(encode='ordinal', n_bins=15, strategy='uniform'),
 'f2': KBinsDiscretizer(encode='ordinal', n_bins=15, strategy='uniform')}

# Check Bins

In [114]:
# Number of bins used by transformer "f1" (the Age column).
trf.named_transformers_["f1"].n_bins_

array([15])

# Check Bin Edges

In [115]:
# Bin boundaries learned by transformer "f1" (the Age column).
trf.named_transformers_["f1"].bin_edges_

array([array([ 0.42      ,  5.72533333, 11.03066667, 16.336     , 21.64133333,
              26.94666667, 32.252     , 37.55733333, 42.86266667, 48.168     ,
              53.47333333, 58.77866667, 64.084     , 69.38933333, 74.69466667,
              80.        ])                                                   ],
      dtype=object)

In [116]:
# Bin boundaries learned by transformer "f2" (the Fare column).
trf.named_transformers_["f2"].bin_edges_

array([array([  0.     ,  34.15528,  68.31056, 102.46584, 136.62112, 170.7764 ,
              204.93168, 239.08696, 273.24224, 307.39752, 341.5528 , 375.70808,
              409.86336, 444.01864, 478.17392, 512.3292 ])                     ],
      dtype=object)

In [117]:
# Put each original training value next to its bin index so the effect of the
# discretization can be checked by eye.
binned_view = {
    "age": xtrain["Age"],
    "age_trf": XTR[:, 0],
    "fare": xtrain["Fare"],
    "fare_trf": XTR[:, 1],
}
pd.DataFrame(binned_view)

Unnamed: 0,age,age_trf,fare,fare_trf
328,31.0,5.0,20.5250,0.0
73,26.0,4.0,14.4542,0.0
253,30.0,5.0,16.1000,0.0
719,33.0,6.0,7.7750,0.0
666,25.0,4.0,13.0000,0.0
...,...,...,...,...
92,46.0,8.0,61.1750,1.0
134,25.0,4.0,13.0000,0.0
337,41.0,7.0,134.5000,3.0
548,33.0,6.0,20.5250,0.0
