In [1]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd

**Step 2: Importing the dataset**

In [3]:
# Read Data.csv (path relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('Data.csv')
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [4]:
# Summary statistics for the numeric columns; `count` only includes
# non-missing values, so it reveals columns that contain NaN.
data.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
# Outlier fences for Age using the 1.5 * IQR rule:
# lower = Q1 - 1.5 * IQR, upper = Q3 + 1.5 * IQR.
age_q1 = data["Age"].quantile(0.25)
age_q3 = data["Age"].quantile(0.75)
iqr = age_q3 - age_q1
upper_threshold = age_q3 + 1.5 * iqr
lower_threshold = age_q1 - 1.5 * iqr
upper_threshold, lower_threshold

(57.5, 21.5)

In [6]:
# Outlier fences for Salary using the 1.5 * IQR rule.
# Note: this reuses (and overwrites) the upper_threshold / lower_threshold
# names that the Age cell assigned.
salary_q1 = data["Salary"].quantile(0.25)
salary_q3 = data["Salary"].quantile(0.75)
iqr1 = salary_q3 - salary_q1
upper_threshold = salary_q3 + 1.5 * iqr1
lower_threshold = salary_q1 - 1.5 * iqr1
upper_threshold, lower_threshold

(99000.0, 27000.0)

In [7]:
# Number of missing values in each column before any imputation.
data.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(10, 4)

In [9]:
# Replace the missing Age and Salary values with 0.
#
# The fill is performed on the DataFrame and assigned back. The previous form,
# data["Age"].fillna(0, inplace=True), is an in-place call on a selected
# column (chained assignment): recent pandas versions warn about it, and with
# Copy-on-Write enabled it no longer updates `data`, leaving the NaNs in place.
#
# NOTE(review): 0 lies below the IQR lower bounds computed earlier in this
# notebook (21.5 for Age, 27000 for Salary), so the filled rows look like
# outliers -- consider imputing with the column median or mean instead.
data = data.fillna({"Age": 0, "Salary": 0})

In [10]:
# Re-count missing values per column to confirm the fill left none behind.
data.isna().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

In [11]:
# Column dtypes: `object` columns hold text and still need encoding before modelling.
data.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [12]:
# drop_duplicates() returns a new frame and leaves `data` untouched, so the
# result must be assigned back; otherwise any duplicate rows would still be
# present in the data used by the later cells.
data = data.drop_duplicates()
# Bare last expression so the notebook renders the de-duplicated frame.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,0.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,0.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [13]:
# Feature matrix: the two numeric columns as a NumPy array.
X = data[['Age', 'Salary']].to_numpy()
# Target: binary-encode the Purchased labels (Yes -> 1, No -> 0).
y = data['Purchased'].map({'Yes': 1, 'No': 0})

**Step 5: Creating dummy variables**

In [14]:
# One-hot encode the Purchased column into Purchased_No / Purchased_Yes
# indicator columns; all other columns are carried over unchanged.
# NOTE(review): this encodes the target rather than the Country feature, and
# one_hot_encoded_data is not used by any later cell shown here -- confirm intent.
one_hot_encoded_data = pd.get_dummies(data=data, columns=['Purchased'])
print(one_hot_encoded_data)

   Country   Age   Salary  Purchased_No  Purchased_Yes
0   France  44.0  72000.0             1              0
1    Spain  27.0  48000.0             0              1
2  Germany  30.0  54000.0             1              0
3    Spain  38.0  61000.0             1              0
4  Germany  40.0      0.0             0              1
5   France  35.0  58000.0             0              1
6    Spain   0.0  52000.0             1              0
7   France  48.0  79000.0             0              1
8  Germany  50.0  83000.0             1              0
9   France  37.0  67000.0             0              1


**Step 6: Splitting the dataset into training and test sets**

In [15]:
from sklearn.model_selection import train_test_split

# Feature matrix: the two numeric columns as a NumPy array.
X = data[['Age','Salary']].values
# Target: encode Purchased as 1/0 here. Rebuilding y from the raw column
# would otherwise discard the Yes/No -> 1/0 mapping applied in Step 4 and
# hand string labels to the split.
y = data['Purchased'].map({'Yes': 1, 'No': 0}).values
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 7)

**Step 7: Feature Scaling**

In [16]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test rows do not influence the
# scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)