In [None]:
# Dataset used throughout this notebook: Data.csv

**Step 1: Importing the libraries**

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

**Step 2: Importing the dataset**

In [31]:
# Load the raw data; the relative path is resolved against the working directory.
dataset = pd.read_csv('Data.csv')

# Independent variables: every column except the last one (Country, Age, Salary).
X = dataset.iloc[:, :-1].values
# Dependent variable: the last column (Purchased). Selecting it with -1 rather
# than a hard-coded 3 keeps X and Y complementary if the column count changes.
Y = dataset.iloc[:, -1].values

dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Display the feature matrix and the target vector together.
X, Y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

**Step 3: Handling the missing data**

In [33]:
# Replace every NaN in the numeric columns with the mean of its column.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Columns 1 and 2 are Age and Salary; column 0 (Country) is text and is left alone.
# fit_transform learns the column means and fills the gaps in a single call.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

**Step 4: Encoding categorical data**

In [29]:
from sklearn.preprocessing import LabelEncoder

# Independent variable: map each country name in column 0 to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values; integer codes on a feature also imply an ordering between countries.
# Confirm this is intended before using X for modelling.
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

# Dependent variable: map the class labels in Y to integer codes.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(Y)
Y = labelencoder_y.transform(Y)

X, Y

(array([[0, 44.0, 72000.0],
        [2, 27.0, 48000.0],
        [1, 30.0, 54000.0],
        [2, 38.0, 61000.0],
        [1, 40.0, 63777.77777777778],
        [0, 35.0, 58000.0],
        [2, 38.77777777777778, 52000.0],
        [0, 48.0, 79000.0],
        [1, 50.0, 83000.0],
        [0, 37.0, 67000.0]], dtype=object),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1]))

**Step 5: Creating a dummy variable**

In [50]:
# Read the raw file again so this step works on an untouched DataFrame.
dataset = pd.read_csv('Data.csv')

# Feature columns selected by name, and the target column.
X2 = dataset.loc[:, ["Country", "Age", "Salary"]]
Y2 = dataset["Purchased"]

dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [51]:
# Build one indicator column per country (France, Germany, Spain).
# dtype=int pins the indicators to 0/1; without it, pandas >= 2.0 yields True/False.
make_dummy = pd.get_dummies(dataset["Country"], dtype=int)

# Assemble X2 from the numeric columns plus the indicators, leaving out "Spain"
# to avoid the dummy variable trap (the two remaining columns already determine it).
# Building from `dataset` instead of from the previous X2 keeps this cell
# re-runnable: reassigning X2 from itself and then dropping "Country" raises
# KeyError on a second run, because "Country" is already gone.
X2 = pd.concat(
    [dataset[["Age", "Salary"]], make_dummy.drop(columns="Spain")],
    axis="columns",
)
X2

Unnamed: 0,Age,Salary,France,Germany
0,44.0,72000.0,1,0
1,27.0,48000.0,0,0
2,30.0,54000.0,0,1
3,38.0,61000.0,0,0
4,40.0,,0,1
5,35.0,58000.0,1,0
6,,52000.0,0,0
7,48.0,79000.0,1,0
8,50.0,83000.0,0,1
9,37.0,67000.0,1,0


**Step 6: Splitting the dataset into a training set and a test set**

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

**Step 7: Feature Scaling**

In [27]:
# Standardise the features with statistics learned from the training set.
from sklearn.preprocessing import StandardScaler

# Fit on the training rows only, then apply the same fitted scaler to both
# splits so the test rows do not influence the scaling statistics.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [52]:
# Display the scaled training and test feature matrices.
X_train, X_test

(array([[ 0.13483997,  0.26306757,  0.12381479],
        [-0.94387981, -0.25350148,  0.46175632],
        [ 1.21355975, -1.97539832, -1.53093341],
        [ 1.21355975,  0.05261351, -1.11141978],
        [-0.94387981,  1.64058505,  1.7202972 ],
        [ 1.21355975, -0.0813118 , -0.16751412],
        [-0.94387981,  0.95182631,  0.98614835],
        [-0.94387981, -0.59788085, -0.48214934]]),
 array([[ 0.13483997, -1.45882927, -0.90166297],
        [ 0.13483997,  1.98496442,  2.13981082]]))