In [None]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

**Step 2: Importing the dataset**

In [2]:
# Read the raw CSV into a DataFrame, report its (rows, columns), then preview it.
data_path = "Data.csv"
df = pd.read_csv(data_path)
print(df.shape)
df

(10, 4)


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


**Step 3: Handling the missing data**

In [4]:
# Number of missing entries in each column
missing_mask = df.isnull()
missing_mask.sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
# Summary statistics of the numeric columns; the means shown here are the
# values used to fill the missing entries in the next cell.
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [6]:
# Fill the missing Age and Salary entries with the mean of their own column.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split in Step 6, so test rows contribute to the imputed values.
for column in ('Age', 'Salary'):
    df[column] = df[column].fillna(df[column].mean())
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [7]:
# Encode the binary target as integers: 'Yes' -> 1, 'No' -> 0.
purchased_codes = {'Yes': 1, 'No': 0}
df['Purchased'] = df['Purchased'].map(purchased_codes)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating dummy variables**

In [8]:
# One-hot encode Country into one indicator column per country.
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 defaults to
# bool, which would display True/False instead of 0/1.
df = pd.get_dummies(df, columns=['Country'], dtype=int)
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [9]:
# Separate the feature matrix from the target.
# dtype=float guarantees a numeric float array even when the dummy columns
# are stored as bool or small integers.
X = df.drop(columns='Purchased').to_numpy(dtype=float)
y = df['Purchased'].to_numpy()

# Hold out 20% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20)

In [10]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((8, 5), (8,))

In [11]:
# Shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((2, 5), (2,))

**Step 7: Feature Scaling**

In [12]:
X_train # raw training features, shown for comparison with the scaled versions below

array([[5.00000000e+01, 8.30000000e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.50000000e+01, 5.80000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [4.40000000e+01, 7.20000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.00000000e+01, 5.40000000e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.87777778e+01, 5.20000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [3.70000000e+01, 6.70000000e+04, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [4.00000000e+01, 6.37777778e+04, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [3.80000000e+01, 6.10000000e+04, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00]])

In [13]:
# Standard scaling: rescale every feature column to zero mean and unit variance.
scaler = StandardScaler()

# Learn the per-column mean and standard deviation from the training rows only.
scaled_X_train = scaler.fit_transform(X_train)
# Reuse those training statistics on the held-out rows (transform, not fit),
# so the test set never influences the scaling parameters.
scaled_X_test = scaler.transform(X_test)

print("Features after Standard scaling")
scaled_X_train

Features after Standard scaling


array([[ 1.95436358,  2.01487853, -0.77459667,  1.29099445, -0.57735027],
       [-0.73444236, -0.6151297 ,  1.29099445, -0.77459667, -0.57735027],
       [ 0.8788412 ,  0.85767491,  1.29099445, -0.77459667, -0.57735027],
       [-1.63071101, -1.03593102, -0.77459667,  1.29099445, -0.57735027],
       [-0.05726161, -1.24633168, -0.77459667, -0.77459667,  1.73205081],
       [-0.3759349 ,  0.33167326,  1.29099445, -0.77459667, -0.57735027],
       [ 0.16182628, -0.00730558, -0.77459667,  1.29099445, -0.57735027],
       [-0.19668118, -0.29952872, -0.77459667, -0.77459667,  1.73205081]])

In [14]:
# Min-max scaling: map every feature column onto the [0, 1] range.
min_max_scaler = MinMaxScaler()

# Learn the per-column minimum and maximum from the training rows only.
minmax_scaled_X_train = min_max_scaler.fit_transform(X_train)
# Apply the training min/max to the held-out rows; values outside the
# training range can therefore fall slightly outside [0, 1].
minmax_scaled_X_test = min_max_scaler.transform(X_test)

print('Features after min max scaling')
minmax_scaled_X_train

Features after min max scaling


array([[1.        , 1.        , 0.        , 1.        , 0.        ],
       [0.25      , 0.19354839, 1.        , 0.        , 0.        ],
       [0.7       , 0.64516129, 1.        , 0.        , 0.        ],
       [0.        , 0.06451613, 0.        , 1.        , 0.        ],
       [0.43888889, 0.        , 0.        , 0.        , 1.        ],
       [0.35      , 0.48387097, 1.        , 0.        , 0.        ],
       [0.5       , 0.37992832, 0.        , 1.        , 0.        ],
       [0.4       , 0.29032258, 0.        , 0.        , 1.        ]])

In [15]:
# Normalization: Normalizer works per row, not per column. It rescales each
# sample to unit L2 norm. Salary is orders of magnitude larger than the other
# features, so it ends up close to 1 in every row of the output.
normalizer = Normalizer()
normalized_X_train = normalizer.fit_transform(X_train)

print("Features after normalization")
normalized_X_train

Features after normalization


array([[6.02409529e-04, 9.99999818e-01, 0.00000000e+00, 1.20481906e-05,
        0.00000000e+00],
       [6.03448166e-04, 9.99999818e-01, 1.72413762e-05, 0.00000000e+00,
        0.00000000e+00],
       [6.11110997e-04, 9.99999813e-01, 1.38888863e-05, 0.00000000e+00,
        0.00000000e+00],
       [5.55555470e-04, 9.99999846e-01, 0.00000000e+00, 1.85185157e-05,
        0.00000000e+00],
       [7.45726288e-04, 9.99999722e-01, 0.00000000e+00, 0.00000000e+00,
        1.92307639e-05],
       [5.52238722e-04, 9.99999847e-01, 1.49253709e-05, 0.00000000e+00,
        0.00000000e+00],
       [6.27177577e-04, 9.99999803e-01, 0.00000000e+00, 1.56794394e-05,
        0.00000000e+00],
       [6.22950699e-04, 9.99999806e-01, 0.00000000e+00, 0.00000000e+00,
        1.63934394e-05]])