In [186]:
# Importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [187]:
# Importing the dataset
datasets = pd.read_csv('Data.csv')

In [188]:
# Given dataset contains:
datasets.head(1)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No


In [189]:
X = datasets.iloc[:, :-1]

In [190]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [191]:
X = datasets.iloc[:, :-1].values

In [192]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [193]:
Y = datasets.iloc[:, 3].values

In [194]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

# Missing Data
Complete missing data - [ scikit-learn - docs - Imputer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html)

In [173]:
# Take care of missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [174]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encoding categorical Data

In [175]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [176]:
labelencoder_x = LabelEncoder()
X[:, 0] =  labelencoder_x.fit_transform(X[:, 0])
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [177]:
oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()

# Train Dataset & Test Dataset

In [195]:
from sklearn.model_selection import train_test_split

In [196]:
print("\nDataset 1:\n", X)
print("\nDataset 2:\n", Y)


Dataset 1:
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

Dataset 2:
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [197]:
# Arrays as first parameters
# test_size - how much data it's going to testset and trainset
# test_size - in most cases 0.2 || 0.25 is the best

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [198]:
print("\nDataset x_train:\n", x_train)
print("\nDataset x_test:\n", x_test)

print("\nDataset y_train:\n", y_train)
print("\nDataset y_test:\n", y_test)


Dataset x_train:
 [['Germany' 40.0 nan]
 ['France' 37.0 67000.0]
 ['Spain' 27.0 48000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Spain' 38.0 61000.0]
 ['France' 44.0 72000.0]
 ['France' 35.0 58000.0]]

Dataset x_test:
 [['Germany' 30.0 54000.0]
 ['Germany' 50.0 83000.0]]

Dataset y_train:
 ['Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes']

Dataset y_test:
 ['No' 'No']


# Feature Scaling
Means that we have to put same scale for two features. <br>
> age: 1-100 and salary: 1000-999999 <br>
* In this example values are <b>dominated by salary</b> because square of salary is much <b>bigger than age square</b>...

#### There is few ways to scale your data:
- Standardisation
- Normalisation

![_auto_0](attachment:_auto_0)

In [182]:
from sklearn.preprocessing import StandardScaler

In [183]:
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)

In [184]:
x_train

array([[-1. ,  2.6, -0.8,  0.3,  0.1],
       [ 1. , -0.4, -0.8, -0.3,  0.5],
       [-1. , -0.4,  1.3, -2. , -1.5],
       [-1. , -0.4,  1.3,  0.1, -1.1],
       [ 1. , -0.4, -0.8,  1.6,  1.7],
       [-1. , -0.4,  1.3, -0.1, -0.2],
       [ 1. , -0.4, -0.8,  1. ,  1. ],
       [ 1. , -0.4, -0.8, -0.6, -0.5]])

In [185]:
x_test

array([[-1. ,  2.6, -0.8, -1.5, -0.9],
       [-1. ,  2.6, -0.8,  2. ,  2.1]])

In [202]:
# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

'from sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train = sc_y.fit_transform(y_train)'

In [203]:
x_train 

array([['Germany', 40.0, nan],
       ['France', 37.0, 67000.0],
       ['Spain', 27.0, 48000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Spain', 38.0, 61000.0],
       ['France', 44.0, 72000.0],
       ['France', 35.0, 58000.0]], dtype=object)