In [183]:
# import the required libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [184]:
# Load the dataset from the CSV file (path is relative to the working directory).
DATASET_PATH = "dataset.csv"
dataset = pd.read_csv(DATASET_PATH)

In [185]:
# Build the feature matrix (independent variables): every column except the
# last one, converted to a NumPy array.
feature_frame = dataset.iloc[:, :-1]
x = feature_frame.values
x

array([['India', 38.0, 68000.0],
       ['France', 43.0, 45000.0],
       ['Germany', 30.0, 54000.0],
       ['France', 48.0, 65000.0],
       ['Germany', 40.0, nan],
       ['India', 35.0, 58000.0],
       ['Germany', nan, 53000.0],
       ['France', 49.0, 79000.0],
       ['India', 50.0, 88000.0],
       ['France', 37.0, 77000.0]], dtype=object)

In [186]:
# Extract the dependent variable (target): the last column of the dataset.
# Selecting by position -1 mirrors the `:-1` slice used for the features, so
# the feature/target split stays complementary if columns are ever added
# (a hard-coded index would then silently pick the wrong column).
y = dataset.iloc[:, -1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [195]:
# Handle missing data: replace NaNs in columns 1 and 2 of x with the mean of
# the respective column.
# NOTE: the slice is positional, so this cell must run on the freshly
# extracted x, before the encoding cells below change the column layout.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
numeric_columns = slice(1, 3)
# Learn the column means and fill the gaps in one step.
x[:, numeric_columns] = imputer.fit_transform(x[:, numeric_columns])
x

array([[0.0, 0.0, 1.0, 38.0, 68000.0],
       [1.0, 0.0, 0.0, 43.0, 45000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 48.0, 65000.0],
       [0.0, 1.0, 0.0, 40.0, 65222.22222222222],
       [0.0, 0.0, 1.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 41.111111111111114, 53000.0],
       [1.0, 0.0, 0.0, 49.0, 79000.0],
       [0.0, 0.0, 1.0, 50.0, 88000.0],
       [1.0, 0.0, 0.0, 37.0, 77000.0]], dtype=object)

In [188]:
# Encode the categorical country variable (column 0 of x) as integer labels.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder_x = LabelEncoder()
country_codes = label_encoder_x.fit_transform(x[:, 0])
# Write the integer codes back over the original string column.
x[:, 0] = country_codes
x

array([[2, 38.0, 68000.0],
       [0, 43.0, 45000.0],
       [1, 30.0, 54000.0],
       [0, 48.0, 65000.0],
       [1, 40.0, 65222.22222222222],
       [2, 35.0, 58000.0],
       [1, 41.111111111111114, 53000.0],
       [0, 49.0, 79000.0],
       [2, 50.0, 88000.0],
       [0, 37.0, 77000.0]], dtype=object)

In [189]:
# One-hot encode column 0 (country) into dummy variables; the remaining
# columns are passed through unchanged and placed after the dummy columns.
# sparse_threshold=0 forces a dense ndarray result: with the default (0.3) the
# transformer switches to a SciPy sparse matrix once the one-hot block makes
# the overall density drop below the threshold (e.g. many more categories),
# which would break the default StandardScaler() applied later in the notebook.
onehot_encoder = ColumnTransformer(
    [("Country", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
x = onehot_encoder.fit_transform(x)
x

array([[0.0, 0.0, 1.0, 38.0, 68000.0],
       [1.0, 0.0, 0.0, 43.0, 45000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 48.0, 65000.0],
       [0.0, 1.0, 0.0, 40.0, 65222.22222222222],
       [0.0, 0.0, 1.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 41.111111111111114, 53000.0],
       [1.0, 0.0, 0.0, 49.0, 79000.0],
       [0.0, 0.0, 1.0, 50.0, 88000.0],
       [1.0, 0.0, 0.0, 37.0, 77000.0]], dtype=object)

In [190]:
# Encode the purchased (target) variable as integer class labels.
# The fitted encoder is kept so the labels can be mapped back later.
label_encoder_y = LabelEncoder().fit(y)
y = label_encoder_y.transform(y)

In [196]:
# Split features and target into train and test sets: 20% of the rows are
# held out for testing, and random_state pins the shuffle so the split is
# reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

In [197]:
# Feature scaling: standardize the features. The scaler is fitted on the
# training set only and the same fitted scaler is then applied to both sets,
# so no statistics from the test set are used.
from sklearn.preprocessing import StandardScaler

st_x = StandardScaler().fit(x_train)
x_train = st_x.transform(x_train)
x_test = st_x.transform(x_test)

In [198]:
# Display the scaled training features produced by the feature-scaling cell.
x_train

array([[-1.        ,  1.73205081, -0.57735027, -0.29460737,  0.1339619 ],
       [ 1.        , -0.57735027, -0.57735027, -0.93095928,  1.22626663],
       [ 1.        , -0.57735027, -0.57735027,  0.34174455, -1.74150472],
       [-1.        ,  1.73205081, -0.57735027, -0.05892147, -0.99956188],
       [ 1.        , -0.57735027, -0.57735027,  1.61444837,  1.41175234],
       [ 1.        , -0.57735027, -0.57735027,  1.40233107,  0.11335238],
       [-1.        , -0.57735027,  1.73205081, -0.71884198,  0.39158094],
       [-1.        , -0.57735027,  1.73205081, -1.35519389, -0.5358476 ]])

In [199]:
# Display the test features after applying the scaler fitted on the training set.
x_test

array([[-1.        ,  1.73205081, -0.57735027, -2.41578041, -0.90681902],
       [-1.        , -0.57735027,  1.73205081,  1.82656568,  2.24643804]])