# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

## Importing the dataset

In [4]:
# Load the raw table from disk.
dataset = pd.read_csv("Data.csv")

# Split the table into the features (x) and the dependent variable (y).
# .iloc indexes by position and accepts an integer, a slice of integers, or an
# array of integers: `:` keeps every row, `:-1` keeps every column except the
# last one, and `-1` keeps only the last column.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [5]:
# Inspect the feature matrix as loaded, before any preprocessing.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
# Inspect the dependent variable as loaded.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [7]:
from sklearn.impute import SimpleImputer

# Fill each missing entry (np.nan) with the mean of its column; the median is
# another common choice of replacement.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 are numeric (column 0 holds text such as 'France'), so
# the imputer learns the column means from that slice and the filled values are
# written back into x in a single fit-and-transform step.
# NOTE(review): the means are learned from all rows, before the train/test
# split, so rows that later land in the test set influence them -- consider
# fitting the imputer on the training rows only.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [8]:
# Confirm that the missing numeric entries have been filled in.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [None]:
# One-hot encoding: replace a single categorical column with several binary (0/1) columns,
# one new column for each category level :)

### Encoding the Independent Variable

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ColumnTransformer takes a list of (name, transformer, columns) tuples. Here a
# OneHotEncoder is applied to column 0 only; remainder='passthrough' keeps the
# other columns unchanged instead of dropping them.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

# fit_transform learns the categories and returns the encoded data in one call;
# the result is wrapped in a NumPy array and replaces x.
x = np.array(ct.fit_transform(x))

In [10]:
# Inspect x after one-hot encoding: the encoded columns come first, followed by the passthrough columns.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [12]:
from sklearn.preprocessing import LabelEncoder

# y holds two text labels ('No' / 'Yes'); LabelEncoder maps each distinct label
# to an integer. fit() learns the labels and returns the encoder itself, then
# transform() replaces every label with its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [13]:
# Inspect the integer-encoded dependent variable.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split

# train_test_split returns four arrays in this order:
# training features, test features, training target, test target.
# test_size=0.2 holds out 20% of the rows for testing; random_state=1 seeds the
# shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [17]:
# Inspect the training features produced by the split.
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [18]:
# Inspect the held-out test features produced by the split.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [19]:
# Inspect the training targets produced by the split.
print(y_train)

[0 1 0 0 1 1 0 1]


In [20]:
# Inspect the held-out test targets produced by the split.
print(y_test)

[0 1]


## Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

# Feature scaling is applied after the split so that the test rows do not leak
# information into the statistics learned by the scaler.
#   Standardization: x_stand = (x - mean) / std
#       values mostly fall between -3 and 3; works well in general
#   Normalization:   x_norm = (x - min) / (max - min)
#       values fall between 0 and 1; recommended for normally distributed features
#
# Only columns 3 onward are scaled; columns 0-2 are the one-hot columns.

# fit() only computes the statistics (here from the training rows) and returns
# the scaler; transform() applies them and the scaled values overwrite x_train.
sc = StandardScaler().fit(x_train[:, 3:])
x_train[:, 3:] = sc.transform(x_train[:, 3:])

# Do not fit a new scaler on the test set: it must be scaled with the same
# statistics that were learned from the training set.
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [25]:
# Inspect the training features after scaling columns 3 onward.
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578554 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057846 -0.07013167641635404]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022487 -0.307866172742979]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [26]:
# Inspect the test features, scaled with the scaler fitted on the training set.
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830127 -0.9069571034860731]
 [1.0 0.0 0.0 -0.4497366439748442 0.20564033932253026]]
