### scikit-learn Preprocessing Overview

#### Scaling the data & splitting it into train/test sets

In [20]:
import numpy as np

In [21]:
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Seed NumPy's global generator so the random demo data is identical on every run.
np.random.seed(42)
# 10 rows x 2 columns of random integers in [0, 100) (`high` is exclusive).
data = np.random.randint(0, 100, (10, 2))
data

array([[80,  0],
       [95, 25],
       [43, 90],
       [83, 97],
       [70, 82],
       [72,  1],
       [83,  6],
       [47, 50],
       [76, 24],
       [43, 81]])

In [23]:
# Create the scaler; feature_range=(0, 1) is the default, spelled out here for clarity.
scaler_model = MinMaxScaler(feature_range=(0, 1))

In [24]:
# Confirm the object is a MinMaxScaler instance.
type(scaler_model)

sklearn.preprocessing.data.MinMaxScaler

Fit the scaler to our data (this lets it learn the minimum and maximum values of each column):

In [25]:
# Learn each column's minimum and maximum from `data`; nothing is transformed yet.
scaler_model.fit(data)



MinMaxScaler(copy=True, feature_range=(0, 1))

Transform the data (normalization): within each column, the minimum becomes 0 and the maximum becomes 1.

In [26]:
# Rescale `data` using the per-column min/max learned by the earlier fit() call.
scaler_model.transform(data)

array([[ 0.71153846,  0.        ],
       [ 1.        ,  0.25773196],
       [ 0.        ,  0.92783505],
       [ 0.76923077,  1.        ],
       [ 0.51923077,  0.84536082],
       [ 0.55769231,  0.01030928],
       [ 0.76923077,  0.06185567],
       [ 0.07692308,  0.51546392],
       [ 0.63461538,  0.24742268],
       [ 0.        ,  0.83505155]])

Fit and transform in a single step with `fit_transform`:

In [27]:
# fit() and transform() in one call: re-fits the scaler on `data` and returns the scaled array.
scaler_model.fit_transform(data)



array([[ 0.71153846,  0.        ],
       [ 1.        ,  0.25773196],
       [ 0.        ,  0.92783505],
       [ 0.76923077,  1.        ],
       [ 0.51923077,  0.84536082],
       [ 0.55769231,  0.01030928],
       [ 0.76923077,  0.06185567],
       [ 0.07692308,  0.51546392],
       [ 0.63461538,  0.24742268],
       [ 0.        ,  0.83505155]])

Train/test split: separate the features from the label, then hold out part of the rows for testing.

In [37]:
import pandas as pd

In [38]:
# 50 rows x 4 columns of random integers in [0, 100] (`high` is exclusive).
# Note: this rebinds `data`, replacing the 10x2 array used for the scaler demo.
data = np.random.randint(low=0, high=101, size=(50, 4))

In [41]:
# Wrap the random array in a DataFrame: three feature columns plus a label column.
df = pd.DataFrame(
    data,
    columns=["f1", "f2", "f3", "label"],
)

In [42]:
# Preview the first 10 rows instead of rendering all 50.
df.head(10)

Unnamed: 0,f1,f2,f3,label
0,46,26,73,49
1,76,81,51,10
2,2,14,57,14
3,76,0,93,82
4,97,87,61,35
5,78,89,79,70
6,28,16,91,56
7,29,93,95,29
8,25,49,77,2
9,46,73,79,16


In [44]:
# Feature matrix: every row, feature columns only (the label is excluded).
X = df.loc[:, ["f1", "f2", "f3"]]

In [54]:
# Target vector: the label column as a Series.
y = df.loc[:, "label"]

In [51]:
# Preview the first 10 feature rows instead of rendering all 50.
X.head(10)

Unnamed: 0,f1,f2,f3
0,46,26,73
1,76,81,51
2,2,14,57
3,76,0,93
4,97,87,61
5,78,89,79
6,28,16,91
7,29,93,95
8,25,49,77
9,46,73,79


In [55]:
# Display the target Series (the `label` column of `df`).
y

0      49
1      10
2      14
3      82
4      35
5      70
6      56
7      29
8       2
9      16
10     33
11     23
12     31
13     59
14     20
15     69
16     80
17     16
18     38
19     28
20     89
21     18
22     55
23      0
24     68
25     36
26     24
27      7
28     10
29     30
30     28
31    100
32     99
33     72
34     34
35     37
36      9
37     18
38     18
39     64
40     67
41     25
42     71
43     30
44     45
45      6
46     44
47     46
48      9
49     27
Name: label, dtype: int64

In [56]:
from sklearn.model_selection import train_test_split

In [58]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [59]:
# (rows, columns) of the training features.
X_train.shape

(35, 3)

In [60]:
# (rows, columns) of the held-out test features.
X_test.shape

(15, 3)