In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [1]:
import pandas as pd
import numpy as np
import sklearn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


**Step 2: Importing dataset**

In [2]:
df=pd.read_csv("/content/Data.csv")
df.head

<bound method NDFrame.head of    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes>

**Step 3: Handling the missing data**

In [3]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [7]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
df2=df.mask(df == ' ')
print(df2)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [9]:
df2["Age"].fillna("39.0", inplace = True)
df2["Salary"].fillna("64000.0",inplace = True)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [10]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df2['Purchased']= label_encoder.fit_transform(df['Purchased'])
df2['Purchased'].unique()
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,64000.0,1
5,France,35.0,58000.0,1
6,Spain,39.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating a dummy variable**

In [11]:
df2["Country"].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [12]:
df_dc = pd.get_dummies(df2,columns=["Country"],
                       prefix='',prefix_sep='')
df_dc.head

<bound method NDFrame.head of     Age   Salary  Purchased  France  Germany  Spain
0  44.0  72000.0          0       1        0      0
1  27.0  48000.0          1       0        0      1
2  30.0  54000.0          0       0        1      0
3  38.0  61000.0          0       0        0      1
4  40.0  64000.0          1       0        1      0
5  35.0  58000.0          1       1        0      0
6  39.0  52000.0          0       0        0      1
7  48.0  79000.0          1       1        0      0
8  50.0  83000.0          0       0        1      0
9  37.0  67000.0          1       1        0      0>

**Step 6: Splitting the datasets into training sets and Test sets**

In [13]:
X = df_dc.iloc[:, :-1].values # select all rows and select all columns except the last column as my feature
y = df_dc.iloc[:, 1].values # target as arrays
X_train, X_test, y_train, y_test = train_test_split(
              X, y, test_size=0.33, random_state=42)

In [14]:
y_test


array([83000.0, 48000.0, 58000.0, 72000.0], dtype=object)

In [15]:
X_train.shape, X_test.shape

((6, 5), (4, 5))

**Step 7: Feature Scaling**

In [16]:
#MIN MAX SCALER
  
min_max_scaler = preprocessing.MinMaxScaler(feature_range =(0, 1))
  
# Scaled feature
X_after_min_max_scaler = min_max_scaler.fit_transform(X)
  
print ("\nAfter min max Scaling : \n", X_after_min_max_scaler)
  
  
#Standardisation
  
Standardisation = preprocessing.StandardScaler()
  
# Scaled feature
X_after_Standardisation = Standardisation.fit_transform(X)
  
print ("\nAfter Standardisation : \n", X_after_Standardisation)


After min max Scaling : 
 [[0.73913043 0.68571429 0.         1.         0.        ]
 [0.         0.         1.         0.         0.        ]
 [0.13043478 0.17142857 0.         0.         1.        ]
 [0.47826087 0.37142857 0.         0.         0.        ]
 [0.56521739 0.45714286 1.         0.         1.        ]
 [0.34782609 0.28571429 1.         1.         0.        ]
 [0.52173913 0.11428571 0.         0.         0.        ]
 [0.91304348 0.88571429 1.         1.         0.        ]
 [1.         1.         0.         0.         1.        ]
 [0.43478261 0.54285714 1.         1.         0.        ]]

After Standardisation : 
 [[ 0.75560965  0.74743385 -1.          1.22474487 -0.65465367]
 [-1.71465267 -1.44017742  1.         -0.81649658 -0.65465367]
 [-1.27872403 -0.8932746  -1.         -0.81649658  1.52752523]
 [-0.11624764 -0.25522131 -1.         -0.81649658 -0.65465367]
 [ 0.17437146  0.01823009  1.         -0.81649658  1.52752523]
 [-0.55217628 -0.52867272  1.          1.22474487 