# Data Preprocessing Tools

## --> Importing Libraries

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

## --> Importing the Dataset

In [17]:
# Read the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="Data.csv")
# Leave the frame as the last expression so the notebook renders it.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#### Here the independent variables, also called features (Country, Age, Salary), are used to predict the dependent variable (Purchased).

In [18]:
# Feature matrix (independent variables): every column except the last one.
feature_columns = data.iloc[:, :-1]
x = feature_columns.values

# Target vector (dependent variable): the last column only.
target_column = data.iloc[:, -1]
y = target_column.values

## --> Taking Care of Missing Data

In [19]:
from sklearn.impute import SimpleImputer

In [21]:
# Fill every NaN in the numeric columns with the mean of that column.
# Column 0 is skipped by the 1: slice; only columns 1 onward are imputed.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
x[:, 1:] = imputer.fit_transform(x[:, 1:])

In [22]:
# Display the feature matrix to check the result of the imputation step.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## --> Encoding Independent Variable

In [24]:
# One hot encoding the country column
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# This cell overwrites x with its encoded version, so running it a second time
# would one-hot encode the first indicator column again and add spurious
# columns. Only encode while column 0 still holds the raw country strings,
# which makes the cell safe to re-run.
if isinstance(x[0, 0], str):
    # Replace column 0 with one indicator column per category; the remaining
    # columns are passed through unchanged and placed after the indicators.
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
    x = np.array(ct.fit_transform(x))

In [25]:
# Display the feature matrix to check the result of the one-hot encoding step.
x

array([[0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## --> Encoding Dependent Variable

In [26]:
from sklearn.preprocessing import LabelEncoder

# Convert the class labels in y into integer codes.
le = LabelEncoder()
y = le.fit(y).transform(y)

In [27]:
# Display the encoded target vector.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## --> Splitting train and test set

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [32]:
# Display the training features produced by the split.
X_train

array([[1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

## --> Feature Scaling


####    Note: Feature Scaling is normally already implemented in some algorithms and may not always be required.
#### *** Feature scaling is always applied after splitting into train and test sets: the scaler is fitted on the training set only and then reused on the test set, so no information from the test set leaks into the model

####    Two types of feature Scaling:
  ####    1. Standardisation (Values lie between -3 <-> +3)
   ####  2. Normalisation ( values lie between 0 <-> 1)
        
   #### * Standardisation works in all cases, whereas normalisation is recommended only when the data follows a normal distribution
    

In [38]:
from sklearn.preprocessing import StandardScaler

In [41]:
# Standardise the columns from index 3 onward.
# NOTE(review): assumes the country column was one-hot encoded exactly once
# (three indicator columns), so that columns 3+ are Age and Salary - confirm.
sc = StandardScaler()
# Learn the mean and standard deviation from the training rows only.
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
# Reuse the training statistics for the test rows. Re-fitting on the test set
# would put train and test on different scales and leak test-set information.
X_test[:,3:] = sc.transform(X_test[:,3:])


In [42]:
# Display the training features to check the result of the scaling step.
X_train

array([[1.0, 0.0, 0.0, 1.2909944487358056, -0.19159184384578554,
        -1.0781259408412427],
       [1.0, 0.0, 1.0, -0.7745966692414833, -0.014117293757057846,
        -0.07013167641635404],
       [0.0, 1.0, 0.0, -0.7745966692414833, 0.5667085065333239,
        0.6335624327104546],
       [1.0, 0.0, 0.0, 1.2909944487358056, -0.3045301939022487,
        -0.307866172742979],
       [1.0, 0.0, 0.0, 1.2909944487358056, -1.901801144700799,
        -1.4204636155515822],
       [0.0, 1.0, 0.0, -0.7745966692414833, 1.1475343068237056,
        1.2326533634535488],
       [1.0, 0.0, 1.0, -0.7745966692414833, 1.4379472069688966,
        1.5749910381638883],
       [0.0, 1.0, 0.0, -0.7745966692414833, -0.7401495441200352,
        -0.5646194287757336]], dtype=object)