### Scikit-learn Preprocessing Overview

#### Scaling the data & splitting the data into train/test sets

In [61]:
import numpy as np

In [62]:
from sklearn.preprocessing import MinMaxScaler

In [63]:
# Use a seeded, cell-local generator so Restart & Run All reproduces the same
# sample (re-run the notebook to refresh the saved outputs).
scaling_rng = np.random.RandomState(42)
# 10 rows x 2 columns of integers drawn from [0, 100); the upper bound is exclusive.
data = scaling_rng.randint(0, 100, size=(10, 2))
data

array([[54, 58],
       [13, 26],
       [33, 15],
       [45, 77],
       [11, 30],
       [31, 77],
       [58, 44],
       [60, 84],
       [35, 40],
       [95, 77]])

In [64]:
# Min-max scaler with its default target range spelled out explicitly.
scaler_model = MinMaxScaler(feature_range=(0, 1))

In [65]:
# Show which class the scaler object is an instance of.
type(scaler_model)

sklearn.preprocessing.data.MinMaxScaler

`Fit to our data (allows the model to learn what the minimum/maximum values are for each column)`

In [None]:
# Fit the scaler on `data` so it records each column's minimum and maximum.
scaler_model.fit(data)

`Transform the data (normalizing it): the minimum becomes 0 and the maximum becomes 1.`

In [67]:
# Apply the already-fitted scaler to `data`; the scaled array is the cell output.
scaler_model.transform(data)

array([[ 0.51190476,  0.62318841],
       [ 0.02380952,  0.15942029],
       [ 0.26190476,  0.        ],
       [ 0.4047619 ,  0.89855072],
       [ 0.        ,  0.2173913 ],
       [ 0.23809524,  0.89855072],
       [ 0.55952381,  0.42028986],
       [ 0.58333333,  1.        ],
       [ 0.28571429,  0.36231884],
       [ 1.        ,  0.89855072]])

`Do it all at once:`

In [None]:
# Fit on `data` and return the scaled result in a single call.
scaler_model.fit_transform(data)

`Train/Test split:`

In [69]:
import pandas as pd

In [70]:
# Use a seeded, cell-local generator so the synthetic dataset is identical on every
# run (re-run the notebook to refresh the saved outputs).
split_rng = np.random.RandomState(42)
# 50 rows x 4 columns of integers drawn from [0, 101), i.e. 0 through 100 inclusive.
data = split_rng.randint(0, 101, size=(50, 4))

In [71]:
# Wrap the random array in a DataFrame: three feature columns plus a label column.
column_names = ['f1', 'f2', 'f3', 'label']
df = pd.DataFrame(data, columns=column_names)

In [72]:
# Preview only the first 10 rows instead of rendering the whole 50-row frame.
df.head(10)

Unnamed: 0,f1,f2,f3,label
0,30,37,33,74
1,31,69,11,32
2,45,72,60,46
3,39,7,97,92
4,42,25,15,32
5,44,59,1,80
6,54,33,82,13
7,1,10,4,95
8,38,94,80,74
9,63,47,61,13


In [73]:
# Feature matrix: the three feature columns, kept as a DataFrame.
feature_columns = ['f1', 'f2', 'f3']
X = df[feature_columns]

In [74]:
# Target vector: the label column, selected as a Series.
target_column = 'label'
y = df[target_column]

In [75]:
# Preview only the first 10 feature rows instead of rendering the whole frame.
X.head(10)

Unnamed: 0,f1,f2,f3
0,30,37,33
1,31,69,11
2,45,72,60
3,39,7,97
4,42,25,15
5,44,59,1
6,54,33,82
7,1,10,4
8,38,94,80
9,63,47,61


In [76]:
# Display the label Series selected from df['label'].
y

0      74
1      32
2      46
3      92
4      32
5      80
6      13
7      95
8      74
9      13
10     96
11     37
12     17
13     98
14     12
15     54
16     56
17     56
18     34
19     71
20     85
21      6
22     63
23     32
24     59
25     48
26     78
27     15
28     98
29     69
30     55
31     18
32     77
33     99
34     38
35      9
36     18
37     47
38    100
39     35
40     82
41     96
42      2
43     55
44      7
45     74
46     32
47     57
48      6
49      0
Name: label, dtype: int64

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
# Hold out 30% of the rows as the test set; random_state is pinned to 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [79]:
# (rows, columns) of the training feature set.
X_train.shape

(35, 3)

In [80]:
# (rows, columns) of the held-out test feature set.
X_test.shape

(15, 3)