# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load dataset

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='datasets/Data.csv')

In [3]:
# Preview the first five rows to check the columns and spot missing values.
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
# Features: every column except the last one.
X = dataset.iloc[:, :-1].to_numpy()
# Target: the last column only.
y = dataset.iloc[:, -1].to_numpy()

In [5]:
# Show the first five feature rows before any cleaning.
print(X[0:5])

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]]


# Clean data: impute missing values

In [6]:
from sklearn.impute import SimpleImputer

# Replace NaNs in the numeric columns (indices 1 and 2) with the column mean.
# NOTE(review): the imputer is fit on every row here, before the later
# train/test split, so held-out rows contribute to the imputed mean.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imp_mean.fit_transform(X[:, 1:3])
print(X[0:5])

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]]


# Encode categorical data

## Encoding independent variable

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [8]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
# The encoded columns are placed first in the output, ahead of the
# passthrough columns.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [9]:
# Show the output column names of the fitted ColumnTransformer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back to the legacy method
# on older installs. `.tolist()` keeps the printed value a plain list of str.
if hasattr(ct, 'get_feature_names_out'):
    feature_names = ct.get_feature_names_out().tolist()
else:
    feature_names = ct.get_feature_names()
print('feature_columns', feature_names)
print(X[:5])

feature_columns ['encoder__x0_France', 'encoder__x0_Germany', 'encoder__x0_Spain', 'x1', 'x2']
[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]]


## Encoding dependent variable

In [10]:
# Learn the class labels present in y, then map each label to its integer code.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [11]:
# Show the first five encoded target values.
print(y[0:5])

[0 1 0 0 1]


# Split data into train and test sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

# Feature scaling

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns from index 3 onward; columns 0-2 are the
# one-hot indicators and are left untouched.
scaler = StandardScaler()

# Learn mean/std from the training rows only, then apply the same
# transformation to both splits.
scaler.fit(X_train[:, 3:])
X_train[:, 3:] = scaler.transform(X_train[:, 3:])
X_test[:, 3:] = scaler.transform(X_test[:, 3:])

In [14]:
# Preview up to five rows of each split after scaling.
print('X_test', X_test[0:5])
print('X_train', X_train[0:5])

X_test [[0.0 1.0 0.0 -1.303027190298516 -1.0461917895282287]
 [1.0 0.0 0.0 -0.4203313517091989 0.013586906357509865]
 [0.0 0.0 1.0 -0.1961546307976262 -1.2092346658183424]
 [0.0 1.0 0.0 -0.04203313517092016 -0.24909328322100627]]
X_train [[1.0 0.0 0.0 0.46236448688011816 0.42119409708279393]
 [0.0 0.0 1.0 -0.29423194619643933 -0.475541722512831]
 [0.0 0.0 1.0 -1.6813254068367947 -1.5353204183985696]
 [1.0 0.0 0.0 0.9667621089311564 0.9918441640981916]
 [0.0 1.0 0.0 1.2189609199566755 1.3179299166784189]]


# Done