## Data Preprocessing Learning

### Import Libraries

In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Get Data from Local / Read Data

In [126]:
# Load the dataset from the CSV file that sits next to this notebook.
csv_path = './Data.csv'
data = pd.read_csv(csv_path)

# Show the whole table as plain text so the missing (NaN) entries are visible.
print(data)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


## Taking care of missing data
#### Remove Invalid rows (NaN rows)

In [127]:
# Drop every row that contains at least one missing (NaN) value.
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back; a bare `data.dropna()` call is discarded.
data = data.dropna()

# Sanity check: no missing values may remain after the cleanup.
assert not data.isna().any().any(), "NaN values remain after dropna()"

# Display the cleaned data (the original row labels are kept, so they have gaps).
data


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Splitting dataset into the Training set and Test set

### Separate data into X & Y
where 
X = input data (every column except the last) and 
Y = output result (the last column)


In [128]:
# Features: every column except the last one, as a NumPy array.
X = data.iloc[:, :-1].to_numpy()
# Target: the last column only.
Y = data.iloc[:, -1].to_numpy()

In [129]:
# Inspect the feature matrix
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [130]:
# Inspect the target values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes'], dtype=object)

### Splitting into Training and Test Data Sets

In [118]:
# import train_test_split from sklearn's model_selection module
from sklearn.model_selection import train_test_split 

In [132]:
# Hold out 20% of the rows for testing and keep the rest for training;
# the fixed random_state makes the shuffle reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [133]:
# Display the training features
X_train
# Swap in one of the names below to inspect the other splits instead:
# X_test
# Y_train
# Y_test

array([['Spain', 27.0, 48000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 44.0, 72000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 61000.0],
       ['France', 48.0, 79000.0]], dtype=object)

## Feature Scaling

In [134]:
# import StandardScaler from sklearn's preprocessing module
from sklearn.preprocessing import StandardScaler

# Create the scaler instance; it is not fitted to any data yet
sc = StandardScaler()

In [138]:
# Column 0 holds the country name (a string), so only the columns from index 1
# onward are scaled.
numeric_cols = slice(1, None)

# Fit the scaler on the training rows and scale them in place.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
# Scale the test rows with the statistics fitted on the training rows (no refit).
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

# NOTE(review): this cell overwrites X_train/X_test, so re-running it refits the
# scaler on already-scaled values. The saved X_test output shows exactly +/-1,
# which looks like the result of fitting on the two test rows alone in an
# earlier run -- restart the kernel and run all cells to confirm.

In [139]:
X_train

array([['Spain', -1.6813254068367944, -1.53532041839857],
       ['Germany', 1.2189609199566758, 1.3179299166784184],
       ['France', 0.46236448688011844, 0.42119409708279354],
       ['France', -0.6725301627347177, -0.7201060369480019],
       ['Spain', -0.29423194619643905, -0.4755417225128314],
       ['France', 0.9667621089311568, 0.9918441640981912]], dtype=object)

In [140]:
X_test

array([['France', 1.0000000000000002, 0.9999999999999996],
       ['Germany', -0.9999999999999997, -1.0000000000000004]],
      dtype=object)

In [None]:
## Data preprocessing is complete, and so is the hands-on exercise