In [None]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline 

**Step 2: Importing dataset**

In [29]:
# Read the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/Data.csv")

In [30]:
# Display the loaded frame to inspect its contents.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [31]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10, 4)

In [32]:
# Summary statistics of the numeric columns. In the output, the min and max of Age and
# Salary sit close to the quartiles, which suggests there are no extreme outliers.
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


**Step 3: Handling the missing data**

In [33]:
# Number of missing entries per column.
# Age has one missing value and Salary has one missing value.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [34]:
# Missing values can either be dropped or imputed.
# Only one value is missing in each of Age and Salary and the summary statistics show
# no obvious outliers, so mean imputation is a reasonable choice here.
def mean_impute(df, column, mean_val=None):
  """Fill the missing entries of ``df[column]`` in place.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to modify; ``column`` is reassigned on this object.
  column : str
      Name of the column to impute.
  mean_val : scalar, optional
      Replacement for the missing entries. When omitted, the mean of the
      column (computed over its non-missing entries) is used.

  Returns
  -------
  None
      The frame is modified in place.
  """
  if mean_val is None:
    # Series.mean() skips missing entries by default.
    mean_val = df[column].mean()
  df[column] = df[column].fillna(mean_val)

In [35]:
# Impute the Age column with its mean, rounded to the nearest whole number.
mean_val = round(df["Age"].mean())
print(mean_val)
mean_impute(df, "Age", mean_val)

39


In [36]:
# Impute the Salary column with its mean, rounded to the nearest whole number.
mean_val = round(df["Salary"].mean())
print(mean_val)
mean_impute(df, "Salary", mean_val)

63778


In [37]:
# After mean imputation the previously missing Age and Salary entries are filled in.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63778.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [38]:
# Country and Purchased are object-typed, i.e. the categorical columns that need encoding.
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [40]:
# Distinct categories present in the Country column.
df["Country"].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

**Step 5: Creating dummy variables**

In [41]:
# Country is a multi-class, non-ordinal column, so it is one-hot encoded.
# dtype=int keeps the indicator columns as 0/1; pandas >= 2.0 would otherwise
# return booleans (True/False) by default.
new_df = pd.get_dummies(df["Country"], dtype=int)

In [42]:
# Preview the one-hot indicator columns.
new_df.head()

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0


In [43]:
# Put the indicator columns in front of the original columns (index-aligned join),
# then remove the raw Country column they replace.
df = (
    new_df
    .join(df)
    .drop(columns=["Country"])
)

In [44]:
# Preview the frame after Country has been replaced by its indicator columns.
df.head()

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,63778.0,Yes


In [45]:
# Distinct labels present in the Purchased column.
df["Purchased"].unique()

array(['No', 'Yes'], dtype=object)

In [46]:
# Purchased is binary, so a simple label encoding is enough: No -> 0, Yes -> 1.
df["Purchased"] = df["Purchased"].map(
    {
        "No": 0,
        "Yes": 1,
    }
)

In [47]:
# Display the frame after encoding; every column is now numeric.
df

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63778.0,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,39.0,52000.0,0
7,1,0,0,48.0,79000.0,1
8,0,1,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1


**Step 6: Splitting the dataset into a training set and a test set**

In [48]:
# Features: every column except the last one (Purchased).
X = df.iloc[:, :-1]
# Target: the Purchased column as a NumPy array.
y = df["Purchased"].to_numpy()
X.head()

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1,0,0,44.0,72000.0
1,0,0,1,27.0,48000.0
2,0,1,0,30.0,54000.0
3,0,0,1,38.0,61000.0
4,0,1,0,40.0,63778.0


In [49]:
# Split the dataset: 70% of the rows for training, 30% for testing.
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

**Step 7: Feature Scaling**

In [50]:
from sklearn.preprocessing import StandardScaler
# Standardize the features using statistics learned from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)                   # learn the per-column mean and standard deviation
X_train = scaler.transform(X_train)   # scale the training features
X_test = scaler.transform(X_test)     # reuse the training statistics for the test features