# Data Preprocessing

## Step 1: Importing the libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Step 2: Importing the dataset

In [5]:
# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where Data.csv exists at exactly this location.
df = pd.read_csv(filepath_or_buffer="R:\\GUVI\\Tasks\\Task-7\\DataPreprocessing\\Data.csv")

In [6]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Step 3: Handling the missing data

In [7]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# Fill the missing Salary entries with the mean of the observed salaries
# (Series.mean skips NaN by default).
df["Salary"] = df["Salary"].fillna(df["Salary"].mean())

In [9]:
# Fill the missing Age entries with the mean of the observed ages
# (Series.mean skips NaN by default).
df["Age"] = df["Age"].fillna(df["Age"].mean())

## Step 4: Encoding categorical data

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column as integers. In the displayed result "No" becomes 0
# and "Yes" becomes 1.
LE = LabelEncoder()
LE.fit(df["Purchased"])
df["Purchased"] = LE.transform(df["Purchased"])

In [11]:
# Show the whole frame to check the mean-imputed Age/Salary values and the
# integer-encoded Purchased column.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


## Step 5: Creating a dummy variable

In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Country with its own encoder. OneHotEncoder accepts string
# categories directly, so no label-encoding pass is needed, and `LE` is left
# fitted on Purchased instead of being refit on Country.
ohe = OneHotEncoder()
x = df.loc[:, ["Country"]]  # 2-D selection: the encoder expects a (rows, features) input
# Build the dummy frame on df's own index so the later column-wise concat
# lines rows up by label rather than relying on a fresh 0..n-1 index.
df1 = pd.DataFrame(ohe.fit_transform(x).toarray(), index=df.index)
# Drop the first category's column (dummy-variable trap); the remaining
# columns keep their integer labels 1 and 2, which later cells select by.
df1 = df1.loc[:, 1:]

In [13]:
# Append the dummy columns side by side, then remove the original Country column.
df = pd.concat([df, df1], axis=1).drop(columns=["Country"])


In [14]:
# Show the whole frame: Country is gone and the dummy columns labelled 1 and 2
# have been appended.
df

Unnamed: 0,Age,Salary,Purchased,1,2
0,44.0,72000.0,0,0.0,0.0
1,27.0,48000.0,1,0.0,1.0
2,30.0,54000.0,0,1.0,0.0
3,38.0,61000.0,0,0.0,1.0
4,40.0,63777.777778,1,1.0,0.0
5,35.0,58000.0,1,0.0,0.0
6,38.777778,52000.0,0,0.0,1.0
7,48.0,79000.0,1,0.0,0.0
8,50.0,83000.0,0,1.0,0.0
9,37.0,67000.0,1,0.0,0.0


## Step 6: Splitting the dataset into training and test sets

In [15]:
# Feature matrix: the two numeric columns plus the two dummy columns (labelled 1 and 2).
x = df[["Age", "Salary", 1, 2]].to_numpy()
# Target vector: the integer-encoded Purchased column.
y = df["Purchased"].to_numpy()

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The shuffle is seeded so that
# Restart & Run All reproduces the same split (and therefore the same scaled
# arrays below); without random_state every run yields different rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [17]:
# Display the unscaled training and test feature arrays as a tuple.
x_train, x_test

(array([[3.70000000e+01, 6.70000000e+04, 0.00000000e+00, 0.00000000e+00],
        [3.80000000e+01, 6.10000000e+04, 0.00000000e+00, 1.00000000e+00],
        [3.87777778e+01, 5.20000000e+04, 0.00000000e+00, 1.00000000e+00],
        [4.40000000e+01, 7.20000000e+04, 0.00000000e+00, 0.00000000e+00],
        [3.50000000e+01, 5.80000000e+04, 0.00000000e+00, 0.00000000e+00],
        [2.70000000e+01, 4.80000000e+04, 0.00000000e+00, 1.00000000e+00],
        [4.80000000e+01, 7.90000000e+04, 0.00000000e+00, 0.00000000e+00]]),
 array([[5.00000000e+01, 8.30000000e+04, 1.00000000e+00, 0.00000000e+00],
        [3.00000000e+01, 5.40000000e+04, 1.00000000e+00, 0.00000000e+00],
        [4.00000000e+01, 6.37777778e+04, 1.00000000e+00, 0.00000000e+00]]))

## Step 7: Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows do not influence the statistics.
# NOTE(review): the dummy columns are standardized along with Age and Salary.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [19]:
# Display the standardized training and test feature arrays as a tuple.
x_train, x_test

(array([[-0.2029809 ,  0.44897083,  0.        , -0.8660254 ],
        [-0.04111006, -0.14030338,  0.        ,  1.15470054],
        [ 0.08478949, -1.0242147 ,  0.        ,  1.15470054],
        [ 0.93011502,  0.94003267,  0.        , -0.8660254 ],
        [-0.52672259, -0.43494049,  0.        , -0.8660254 ],
        [-1.82168936, -1.41706417,  0.        ,  1.15470054],
        [ 1.5775984 ,  1.62751925,  0.        , -0.8660254 ]]),
 array([[ 1.90134009,  2.02036872,  1.        , -0.8660254 ],
        [-1.33607682, -0.82778996,  1.        , -0.8660254 ],
        [ 0.28263164,  0.13250875,  1.        , -0.8660254 ]]))