# DATA PRE-PROCESSING

## Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the Dataset

In [2]:
# Load the raw table from Data.csv (path is relative to the working directory)
# and echo it so the missing entries are visible before any cleaning.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


#### Creating the matrix of features and the dependent variable vector

In [3]:
# Feature matrix: every column except the last (Country, Age, Salary).
# Target vector: the last column (Purchased).
# .copy() detaches both from `dataset`: the imputation cell further down
# assigns into X in place, and writing into a bare .iloc slice of the parent
# frame can raise SettingWithCopyWarning (or alias `dataset`), depending on
# the pandas version.
X = dataset.iloc[:, :-1].copy()
Y = dataset.iloc[:, -1].copy()

In [4]:
# Show the feature matrix (all columns of the dataset except the last).
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [5]:
# Show the target vector (the last column of the dataset).
print(Y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Taking care of missing data

In [6]:
from sklearn.impute import SimpleImputer

# Fill each NaN in the numeric columns (positions 1-2: Age, Salary) with that
# column's mean. fit_transform learns the means and applies them in one call.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in this notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X.iloc[:, 1:3] = imputer.fit_transform(X.iloc[:, 1:3])

In [7]:
# Show the features after imputation: the NaN entries in Age and Salary
# should now hold the respective column means.
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


## Encoding categorical data

#### Encoding the independent variable

In [8]:
# One-hot (dummy) encode the Country column: mapping the countries to integer
# codes would suggest an ordering such as Spain > Germany > France that does
# not exist and could mislead a learning algorithm.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 0 (Country) is replaced by its indicator columns, which come first in
# the output; the remaining columns are passed through unchanged after them.
# sparse_threshold=0 forces a dense result: without it ColumnTransformer may
# return a scipy sparse matrix when the stacked output is sparse enough, and
# np.array() on a sparse matrix yields a useless 0-d object array.
# Note: X changes from a DataFrame to a NumPy array here, so re-running this
# cell on its own would encode the already-encoded array again.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [9]:
# Switch off scientific notation so the large salary values print as plain
# decimals, then show the encoded feature array.
np.set_printoptions(suppress=True)
print(X)

[[    1.             0.             0.            44.
  72000.        ]
 [    0.             0.             1.            27.
  48000.        ]
 [    0.             1.             0.            30.
  54000.        ]
 [    0.             0.             1.            38.
  61000.        ]
 [    0.             1.             0.            40.
  63777.77777778]
 [    1.             0.             0.            35.
  58000.        ]
 [    0.             0.             1.            38.77777778
  52000.        ]
 [    1.             0.             0.            48.
  79000.        ]
 [    0.             1.             0.            50.
  83000.        ]
 [    1.             0.             0.            37.
  67000.        ]]


#### Encoding the dependent variable

In [10]:
from sklearn.preprocessing import LabelEncoder

# Turn the Purchased labels into integer codes. The fitted encoder stays
# available as `le` in case the codes need to be mapped back to labels.
le = LabelEncoder()
Y = le.fit(Y).transform(Y)

In [11]:
# Show the integer-encoded target labels.
print(Y)


[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state pins the shuffle so
# the same rows land in each partition on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [13]:
# Show the training rows of the feature array (not yet scaled at this point).
print(X_train)

[[    0.             0.             1.            38.77777778
  52000.        ]
 [    0.             1.             0.            40.
  63777.77777778]
 [    1.             0.             0.            44.
  72000.        ]
 [    0.             0.             1.            38.
  61000.        ]
 [    0.             0.             1.            27.
  48000.        ]
 [    1.             0.             0.            48.
  79000.        ]
 [    0.             1.             0.            50.
  83000.        ]
 [    1.             0.             0.            35.
  58000.        ]]


In [14]:
# Show the held-out test rows of the feature array (not yet scaled at this point).
print(X_test)

[[    0.     1.     0.    30. 54000.]
 [    1.     0.     0.    37. 67000.]]


In [15]:
# Show the encoded targets that belong to the training rows.
print(Y_train)

[0 1 0 0 1 1 0 1]


In [16]:
# Show the encoded targets that belong to the test rows.
print(Y_test)

[0 1]


## Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Only columns 3 onward (Age, Salary) are standardised; columns 0-2 are the
# one-hot country indicators and are left as they are.
# The scaler is fitted on the training rows only, and the same fitted scaler
# is then applied to both partitions.
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [18]:
# Show the training features after scaling; columns 0-2 (the one-hot
# indicators) are unchanged, columns 3 onward are standardised.
print(X_train)

[[ 0.          0.          1.         -0.19159184 -1.07812594]
 [ 0.          1.          0.         -0.01411729 -0.07013168]
 [ 1.          0.          0.          0.56670851  0.63356243]
 [ 0.          0.          1.         -0.30453019 -0.30786617]
 [ 0.          0.          1.         -1.90180114 -1.42046362]
 [ 1.          0.          0.          1.14753431  1.23265336]
 [ 0.          1.          0.          1.43794721  1.57499104]
 [ 1.          0.          0.         -0.74014954 -0.56461943]]


In [19]:
# Show the test features after applying the scaler fitted on the training set.
print(X_test)

[[ 0.          1.          0.         -1.46618179 -0.9069571 ]
 [ 1.          0.          0.         -0.44973664  0.20564034]]
