In [77]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Step 2: Importing the dataset**

In [79]:
# Read the raw CSV into a DataFrame.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) —
# adjust it when running the notebook anywhere else.
df = pd.read_csv(filepath_or_buffer='/content/Data.csv')

In [80]:
# Show the data exactly as it was loaded from the CSV.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [81]:
# Summarize column dtypes and non-null counts to spot the columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


**Step 3: Handling the missing data**

In [82]:
# Replace the missing entries in each numeric column with that column's mean
# (computed over the available values), rounded to the nearest whole number.
# NOTE(review): the mean is taken over every row, before the train/test split in
# Step 6, so rows that later land in the test set influence the imputed values.
for column in ('Age', 'Salary'):
    df[column] = df[column].fillna(round(np.mean(df[column])))

In [83]:
# Inspect the frame after the missing Age and Salary values have been filled.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [84]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them, mapping an already-encoded column would find no matching keys and
# silently turn every value into NaN.
df['Purchased'] = df['Purchased'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [85]:
# Inspect the frame after Purchased has been encoded as 1/0.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63778.0,1
5,France,35.0,58000.0,1
6,Spain,39.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating dummy variables**

In [86]:
# One-hot encode Country into one indicator column per country
# (Country_France, Country_Germany, Country_Spain); the original column is dropped.
# dtype=int keeps the indicators as 0/1 integers on every pandas version —
# pandas 2.0+ would otherwise produce True/False booleans.
df = pd.get_dummies(df, columns=['Country'], dtype=int)

In [87]:
# Inspect the frame after Country has been expanded into dummy columns.
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63778.0,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,39.0,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [88]:
# Feature matrix: the two numeric columns plus the three country indicators.
x = df.loc[:, ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']]
# Target vector: the encoded purchase flag.
y = df.loc[:, 'Purchased']

In [89]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [90]:
# Confirm the (rows, columns) of the training and test feature sets.
(x_train.shape, x_test.shape)

((8, 5), (2, 5))

**Step 7: Feature Scaling**

In [91]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both sets so the test rows never influence the fit.
# NOTE(review): every feature column is standardized here, including the
# Country_* indicator columns — confirm that is intended.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [92]:
# Show the standardized training features.
x_train

array([[-0.81176183, -0.72132045,  1.        , -0.57735027, -0.57735027],
       [ 0.61377114,  0.58171004,  1.        , -0.57735027, -0.57735027],
       [ 1.24734135,  1.23322529,  1.        , -0.57735027, -0.57735027],
       [-1.60372459, -1.09361488, -1.        ,  1.73205081, -0.57735027],
       [-0.33658417, -0.44209963, -1.        , -0.57735027,  1.73205081],
       [-0.17819162, -1.27976209, -1.        , -0.57735027,  1.73205081],
       [-0.49497673,  0.11634201,  1.        , -0.57735027, -0.57735027],
       [ 1.56412646,  1.60551972, -1.        ,  1.73205081, -0.57735027]])

In [93]:
# Show the test features after applying the scaler fitted on the training set.
x_test

array([[-0.01979907, -0.18354115, -1.        ,  1.73205081, -0.57735027],
       [-2.07890225, -1.65205652, -1.        , -0.57735027,  1.73205081]])