# Data Preprocessing Tools

## Importing the libraries

In [1]:
#numpy will allow us to work with arrays
import numpy as np #the `as` will allow us to set a shortcut so we do not need to call the full `numpy` and can instead call `np`

#matplotlib.pyplot will allow us to create nice visualizations
import matplotlib.pyplot as plt #the .pyplot will allow us to pick the specific module from matplotlib 

#pandas will allow us to import data and create matrices of features and dependent variable vector
import pandas as pd 

## Importing the dataset

In [2]:
#load the CSV file with the `pandas` library - the result is a dataframe
dataset = pd.read_csv('Data.csv')

#matrix of features: `iloc` locates values by index position (rows, columns)
#the first ':' keeps every row; ':-1' keeps every column from the first one (0) up to, but not including, the last one (the DV)
#a range in python includes the lower bound and excludes the upper bound, which is why the last column is left out
X = dataset.iloc[:, :-1].to_numpy() #`.to_numpy()` hands back the selection as a numpy array

#dependent variable vector: again every row (':'), but only the last column (-1)
#notice that -1 is a single index here, not a range
y = dataset.iloc[:, -1].to_numpy()

In [3]:
#printing the matrix of features (predictors) created above
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
#printing the dependent variable vector created above
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [10]:
#`.head(n)` shows the first n observations of the dataset (here n = 2)
dataset.head(2)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes


In [11]:
#run this in order to get the structure of the dataframe - similar to str() in `R`
#it lists each column with its non-null count and dtype, which is a quick way to spot columns with missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [19]:
#`.iloc` can also restrict the view to certain columns: all rows (':') and the columns at positions 0 and 1 (':2')
dataset.iloc[:, :2]

Unnamed: 0,Country,Age
0,France,44.0
1,Spain,27.0
2,Germany,30.0
3,Spain,38.0
4,Germany,40.0
5,France,35.0
6,Spain,
7,France,48.0
8,Germany,50.0
9,France,37.0


In [22]:
#you can also select a column just by passing its name as a string
dataset["Age"]

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
6     NaN
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64

In [16]:
#`.iloc` can also restrict the view to certain rows: rows at positions 1 through 4 ('1:5') and every column (':')
dataset.iloc[1:5, :]

Unnamed: 0,Country,Age,Salary,Purchased
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Taking care of missing data

In [24]:
#there are many strategies and techniques for handling missing data, so this is only one of the available options
#`sklearn` is a big data science package that is used very frequently - here we use it to handle the missing data
from sklearn.impute import SimpleImputer

#instead of deleting the rows with missing data, we replace each missing value with the mean of its column - this is a classic method
#although this method is simple, it should not be used in practice; more sophisticated methods are available
#`missing_values=` says which values count as missing and `strategy=` says how they are replaced
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

#NOTE(review): the column means are computed over all rows, including the ones that later end up in the test set
#this looks inconsistent with the information-leakage reasoning given in the splitting section - confirm whether the imputer should be fit on the training rows only

#`.fit_transform` does two things in one call:
#  1. fit - connects the imputer to the matrix of features and computes the average of each selected column
#  2. transform - does the actual replacement of the missing values
#we only pass the age and salary columns (1:3) because they are numeric - the country column would not work since it contains characters
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [25]:
#now we will inspect the matrix of features to see what the missing values have been replaced with
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [26]:
#we need to encode categorical data because the strings have to be converted into numbers before the computer can work with them
#we could encode the countries (France, Germany, Spain) as (0,1,2), but the machine may then think the categories are ordinal
#instead we will do one-hot encoding (dummy coding), which turns every category into its own binary column

#for encoding we are again going to use the `sklearn` library
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#the first step is to create an object of the `ColumnTransformer` class
ct = ColumnTransformer(
    #each entry of `transformers=` is (name of the step, transformer object, indices of the columns it is applied to)
    transformers=[('encoder', OneHotEncoder(), [0])],
    #'passthrough' means we want to keep the columns that are not being transformed
    remainder='passthrough',
)

#fitting and transforming all at once: the dummy variables are added to the matrix of features and the original country column is removed
#the result is wrapped in `np.array` so that X is handed on as a numpy array
X = np.array(ct.fit_transform(X))

In [27]:
#now we will inspect the matrix of features to see the result of the encoding
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [28]:
#we are now going to encode the dependent variable, which has 2 categories ('No'/'Yes'), into [0,1]
#again, we are going to use the `sklearn` library
from sklearn.preprocessing import LabelEncoder

#creating an object of class `LabelEncoder` - we do not have to put anything in the parentheses because it works on a single vector
le = LabelEncoder()

#we are now fitting and transforming all at once - y is overwritten with its encoded version
y = le.fit_transform(y)

In [29]:
#now we will inspect the encoded dependent variable vector
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [30]:
#again, we are going to use the `sklearn` library
#one major question in data science is whether to scale the data before or after splitting it
#according to this instructor, we should split the data first and then scale it
#feature scaling consists of scaling all the variables so that they are all on the same scale
#the reason that you scale after the split is that the training and testing sets are supposed to be completely separate - the test set should be a brand new set
#scaling uses the mean and SD of the feature, so applying it before the split would leak information from the test set
from sklearn.model_selection import train_test_split

#we are doing a split of 80% training, 20% testing
#there will be 4 sets in total (2 for the predictor matrix and 2 for the dependent variable vector)
X_train, X_test, y_train, y_test = train_test_split(
    X,               #first argument = predictor matrix
    y,               #second argument = dependent variable vector
    test_size=0.2,   #how big the test set is (20% in this case)
    random_state=1,  #setting the seed for reproducibility
)

In [31]:
#checking the results: the training part of the matrix of features
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [32]:
#checking the results: the test part of the matrix of features
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [33]:
#checking the results
print(y_train)

[0 1 0 0 1 1 0 1]


In [34]:
#checking the results: the test part of the dependent variable vector
print(y_test)

[0 1]


## Feature Scaling

In [36]:
#feature scaling allows us to put all of the features on the same scale
#for some machine learning models, we do not want some features to dominate the others, which would lead the model to effectively ignore the dominated features
#not all machine learning models need scaled features

#standardization consists of subtracting the mean of the feature and dividing by the SD
#normalization is subtracting the minimum and dividing by the (max - min)
#rule of thumb given in the course: normalization is recommended when you have a normal distribution in your features
#rule of thumb given in the course: standardization always performs well, so we generally prefer it

#when scaling the training and test sets, you should fit the scaler on the training data only
#the test set is then scaled as well, but using the mean and variance of the training explanatory variables

#again, we are using the `sklearn` library 
from sklearn.preprocessing import StandardScaler

#we are going to create an object of the `StandardScaler` class
sc = StandardScaler()

#you do not need to scale the dummy variables, so we are only doing it to the numeric variables (columns 3 onward, i.e. age and salary)
#`.fit_transform` computes the mean and SD on the training set and applies the scaling to it
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

#the same columns of the test set are scaled with `.transform` only, so the mean and SD fitted on the training set are reused
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [37]:
#checking the results: the training features after scaling (the dummy columns are unchanged)
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [38]:
#checking the results: the test features after scaling (the dummy columns are unchanged)
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830127 -0.9069571034860731]
 [1.0 0.0 0.0 -0.44973664397484425 0.20564033932253029]]
