# Data preprocessing template

## Libraries and dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.impute import SimpleImputer

## Data Import

In [2]:
# Read the raw dataset; the CSV is looked up inside the local `data` folder.
filename = 'data_preprocessing.csv'
path = os.path.join('data', filename)

raw_data = pd.read_csv(filepath_or_buffer=path)


In [3]:
# Inspect the loaded table. A bare last expression gets the notebook's rich
# DataFrame rendering, matching how feature_matrix is displayed after imputation.
raw_data

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [4]:
# Split the table into model inputs and the target.
# feature_matrix -> often called X: every column except the last
# dependant_var  -> often called y: the last column
#
# .copy() detaches both objects from raw_data. feature_matrix is later modified
# in place (mean imputation via .iloc assignment); without the copy that write
# targets a slice of raw_data, which can raise SettingWithCopyWarning and makes
# the result depend on pandas' view-versus-copy behaviour.
feature_matrix = raw_data.iloc[:, :-1].copy()
dependant_var = raw_data.iloc[:, -1].copy()


In [11]:
# Inspect the feature columns. A bare last expression gets the notebook's rich
# DataFrame rendering, matching how feature_matrix is displayed after imputation.
feature_matrix

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [12]:
# Inspect the target column (the last column of raw_data).
print(dependant_var)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Replacing missing data with the column average


In [25]:
# Peek at the column at position 0; the imputation below only touches positions 1 and 2.
feature_matrix.iloc[:, 0]


0     France
1      Spain
2    Germany
3      Spain
4    Germany
5     France
6      Spain
7     France
8    Germany
9     France
Name: Country, dtype: object

In [27]:
# Fill NaN entries in the columns at positions 1 and 2 with that column's mean.
# fit_transform learns the per-column means and applies them in a single call.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
feature_matrix.iloc[:, 1:3] = imputer.fit_transform(feature_matrix.iloc[:, 1:3])

In [28]:
# Display the features after imputation to check that the NaN entries were filled.
feature_matrix

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## Encode categorical data

In [32]:
# Independent variables: one-hot encode the categorical column at position 0
# and pass every other column through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# sparse_threshold=0 forces a dense result. With the default (0.3), a categorical
# column with many categories makes fit_transform return a SciPy sparse matrix,
# and np.array() on that yields a 0-d object array instead of a 2-D numeric one.
column_transform = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Change the dataframe to a np array.
# NOTE: this rebinds feature_matrix from a DataFrame to an ndarray, so re-running
# this cell on its own would one-hot encode column 0 of the already-encoded array.
# Re-run from the cell that builds feature_matrix instead.
feature_matrix = np.array(column_transform.fit_transform(feature_matrix))


In [37]:
# Show the encoded feature array: the one-hot columns come first, then the passthrough columns.
print(feature_matrix)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [34]:
# Dependent variable
# Encode the class labels as integers (binary here, since there are two classes)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dependant_var = le.fit_transform(dependant_var)

In [35]:
# Show the integer-encoded target produced by the LabelEncoder.
print(dependant_var)

[0 1 0 0 1 1 0 1 0 1]


## Obtaining test and training set

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    dependant_var,
    test_size=0.2,
    random_state=1,
)

In [41]:
# Training features returned by train_test_split (not yet scaled).
print(X_train)

[[0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]]


In [42]:
# Test features returned by train_test_split (not yet scaled).
print(X_test)

[[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]


In [43]:
# Encoded labels for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [44]:
# Encoded labels for the test rows.
print(y_test)

[0 1]


## Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler

# The one-hot columns come first in the encoded matrix (the 'encoder' transformer
# is listed before the passthrough remainder), so the numeric columns start right
# after them. Derive that offset from the fitted encoder instead of hard-coding 3,
# which is only correct when the categorical column has exactly three categories.
n_dummy_cols = len(column_transform.named_transformers_['encoder'].categories_[0])

sc = StandardScaler()

# Fit on the training rows only, then reuse those statistics for the test rows so
# no information from the test set leaks into the scaling.
X_train[:, n_dummy_cols:] = sc.fit_transform(X_train[:, n_dummy_cols:])
X_test[:, n_dummy_cols:] = sc.transform(X_test[:, n_dummy_cols:])


In [47]:
# Training features after scaling: the leading one-hot columns are untouched, the rest are standardized.
print(X_train)

[[ 0.          0.          1.         -0.19159184 -1.07812594]
 [ 0.          1.          0.         -0.01411729 -0.07013168]
 [ 1.          0.          0.          0.56670851  0.63356243]
 [ 0.          0.          1.         -0.30453019 -0.30786617]
 [ 0.          0.          1.         -1.90180114 -1.42046362]
 [ 1.          0.          0.          1.14753431  1.23265336]
 [ 0.          1.          0.          1.43794721  1.57499104]
 [ 1.          0.          0.         -0.74014954 -0.56461943]]


In [48]:
# Test features after scaling with the statistics fitted on the training rows.
# NOTE(review): the trailing columns should no longer hold raw magnitudes here;
# if the output still shows the unscaled values, the scaling cell was not re-run
# after the X_test line was added. Restart the kernel and run all cells to refresh it.
print(X_test)

[[0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]
