In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model


**Step 2: Importing dataset**

In [17]:
df = pd.read_csv("Data.csv")

In [18]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [6]:
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [7]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


#### Step 3: Handling the missing data

In [19]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [20]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [25]:
#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()"""

In [21]:
df['Purchased'] = df['Purchased'].map({'Yes': 1, 'No': 0})         

In [29]:
#df['Purchased'] = le.fit_transform(df['Purchased'])

In [26]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.777778,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [25]:
dummy_df = pd.get_dummies(df.Purchased)
dummy_df

Unnamed: 0,0,1
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1
5,0,1
6,1,0
7,0,1
8,1,0
9,0,1


**Step 5: Creating a dummy variable**

In [27]:
df = pd.get_dummies(df, columns=['Purchased'])

In [28]:
df

Unnamed: 0,Country,Age,Salary,Purchased_0,Purchased_1
0,France,44.0,72000.0,1,0
1,Spain,27.0,48000.0,0,1
2,Germany,30.0,54000.0,1,0
3,Spain,38.0,61000.0,1,0
4,Germany,40.0,63777.777778,0,1
5,France,35.0,58000.0,0,1
6,Spain,38.777778,52000.0,1,0
7,France,48.0,79000.0,0,1
8,Germany,50.0,83000.0,1,0
9,France,37.0,67000.0,0,1


**Step 6: Splitting the datasets into training sets and Test sets**

In [30]:
from sklearn.model_selection import train_test_split        

X = df.loc[:,["Age", "Salary"]].values
y = df.loc[:,"Purchased_1"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state = 12)

In [31]:
X_train

array([[4.40000000e+01, 7.20000000e+04],
       [4.00000000e+01, 6.37777778e+04],
       [3.70000000e+01, 6.70000000e+04],
       [3.80000000e+01, 6.10000000e+04],
       [3.00000000e+01, 5.40000000e+04],
       [2.70000000e+01, 4.80000000e+04],
       [3.87777778e+01, 5.20000000e+04]])

In [32]:
X_test

array([[3.5e+01, 5.8e+04],
       [5.0e+01, 8.3e+04],
       [4.8e+01, 7.9e+04]])

In [41]:
df.corr()

Unnamed: 0,Age,Salary,Purchased_0,Purchased_1
Age,1.0,0.912577,0.200214,-0.200214
Salary,0.912577,1.0,0.056717,-0.056717
Purchased_0,0.200214,0.056717,1.0,-1.0
Purchased_1,-0.200214,-0.056717,-1.0,1.0


**Step 7: Feature Scaling**

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)


In [35]:

X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

In [36]:
X_test

array([[-0.25587018, -0.20948535],
       [ 2.49182662,  2.90315072],
       [ 2.12546704,  2.40512895]])

In [37]:
X_train

array([[-6.41203944, -7.43061008],
       [-6.54625877, -7.43073753],
       [-6.64692328, -7.43068758],
       [-6.61336844, -7.43078059],
       [-6.88180712, -7.43088911],
       [-6.98247162, -7.43098212],
       [-6.58727024, -7.43092011]])