## Binarization

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [16]:
# Load the binarization example table and preview its first five rows.
data_set = pd.read_csv(filepath_or_buffer='binarizationDataset.csv')
data_set.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,0
1,Spain,27,48000,1
2,Germany,30,54000,0
3,Spain,38,61000,0
4,Germany,40,1000,1


In [17]:
# Pull the columns at positions 1 (age) and 2 (salary) out as NumPy arrays.
age, salary = (data_set.iloc[:, position].values for position in (1, 2))

# Echo the untouched values so they can be compared with the binarized ones.
print("\nOriginal age data values : \n", age)
print("\nOriginal salary data values : \n", salary)


Original age data values : 
 [44 27 30 38 40 35 78 48 50 37]

Original salary data values : 
 [72000 48000 54000 61000  1000 58000 52000 79000 83000 67000]


In [18]:
from sklearn.preprocessing import Binarizer
 
# Binarizer needs 2-D input: lay each 1-D feature out as a single row
# of shape (1, n_values).
x = age.reshape(1, -1)
y = salary.reshape(1, -1)

In [19]:
# One binarizer per feature. Values strictly above the threshold become 1,
# everything else (including a value equal to the threshold) becomes 0.
binarizer_1 = Binarizer(threshold=35)      # age cut-off
binarizer_2 = Binarizer(threshold=61000)   # salary cut-off

# Transformed features
binarized_age = binarizer_1.fit_transform(x)
binarized_salary = binarizer_2.fit_transform(y)

print("\nBinarized age : \n", binarized_age)
print("\nBinarized salary : \n", binarized_salary)


Binarized age : 
 [[1 0 0 1 1 0 1 1 1 1]]

Binarized salary : 
 [[1 0 0 0 0 0 0 1 1 1]]


## Binning

In [9]:
import numpy as np
import pandas as pd

In [10]:
# Draw from a seeded generator so that "Restart Kernel -> Run All" always
# produces the same sample. The previous unseeded np.random.randint call gave
# different numbers on every run, so any output recorded from such a run
# cannot be reproduced.
rng = np.random.default_rng(42)
small_counts = rng.integers(0, 100, 20)   # 20 integers in the half-open range [0, 100)
print(small_counts)

# Fixed-width binning: integer division by 10 maps each count to its
# decade bin (0 for 0-9, 1 for 10-19, ..., 9 for 90-99).
print(np.floor_divide(small_counts, 10))

[68 72 48 56 47 77 58 98 73 34 93 34 13 11 57 10 96 81 97 26]
[6 7 4 5 4 7 5 9 7 3 9 3 1 1 5 1 9 8 9 2]


In [11]:
# Counts spanning several orders of magnitude, where fixed-width bins
# would leave most bins empty.
large_counts = [
    296, 8286, 64011, 80, 3, 725, 867, 2215,
    7689, 11495, 91897, 44, 28, 7971, 926, 12,
]
large_counts_series = pd.Series(large_counts)

# Quantile binning: label every count with the index (0-3) of its quartile.
print(pd.qcut(large_counts, q=4, labels=False))

# The three quartile boundaries that separate those four bins.
print(large_counts_series.quantile(q=[0.25, 0.5, 0.75]))

[1 3 3 1 0 1 1 2 2 3 3 0 0 2 2 0]
0.25      71.00
0.50     896.50
0.75    8049.75
dtype: float64


## Scaling

In [12]:
import pandas as pd
from sklearn import preprocessing


In [13]:
# Read the scaling example table and echo it in full as plain text.
data = pd.read_csv(filepath_or_buffer="bin.csv")
print(data)

   country  age  salary purchased
0   france   44   72000        no
1    spain   27   48000       yes
2  germany   30   54000        no
3    spain   38   61000        no
4  germany   40    1000       yes


In [14]:
# Keep only the columns at positions 1 and 2 (age and salary in the table
# printed above) as a 2-D NumPy array: one row per record.
x = data.iloc[:, [1, 2]].values
print(x)

[[   44 72000]
 [   27 48000]
 [   30 54000]
 [   38 61000]
 [   40  1000]]


In [17]:
# Min-max scaling: learn each column's minimum and maximum from x, then
# rescale that column linearly into the requested range [0, 1].
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
x_after_min_max_scaler = min_max_scaler.fit(x).transform(x)
print(x_after_min_max_scaler)

[[1.         1.        ]
 [0.         0.66197183]
 [0.17647059 0.74647887]
 [0.64705882 0.84507042]
 [0.76470588 0.        ]]


In [16]:
# Standardization: StandardScaler with its default settings, fitted on x and
# then applied to the same array column by column.
stand = preprocessing.StandardScaler()
x_after_Standardisation = stand.fit(x).transform(x)
print(x_after_Standardisation)

[[ 1.29394853  1.01469054]
 [-1.38862769  0.03273195]
 [-0.91523188  0.2782216 ]
 [ 0.34715692  0.56462619]
 [ 0.66275412 -1.89027028]]


## Log Transformation

In [16]:
import pandas as pd
import numpy as np


In [17]:
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# Naive log(x + 1): the negative entries (-23 and -12) stay negative after
# adding 1, so their logarithm is undefined and appears as NaN.
data['log+1'] = np.log(data['value'] + 1)
print(data)

# Shift every value so the column minimum maps to 1 before taking the log;
# the smallest entry then becomes log(1) = 0 and no NaN is produced.
data['log'] = np.log(data['value'] + (1 - data['value'].min()))
print(data)

   value     log+1
0      2  1.098612
1     45  3.828641
2    -23       NaN
3     85  4.454347
4     28  3.367296
5      2  1.098612
6     35  3.583519
7    -12       NaN
   value     log+1       log
0      2  1.098612  3.258097
1     45  3.828641  4.234107
2    -23       NaN  0.000000
3     85  4.454347  4.691348
4     28  3.367296  3.951244
5      2  1.098612  3.258097
6     35  3.583519  4.077537
7    -12       NaN  2.484907
