In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw records; the relative path is resolved against the current working directory.
dataset = pd.read_csv('mini_data.csv')

In [3]:
# Show the full frame. Acceptable here because it has only 10 rows;
# for anything larger prefer dataset.head().
print(dataset)

  Profession   Age   Salary Purchased
0     Doctor  44.0  72000.0        No
1     Lawyer  27.0  48000.0       Yes
2     Banker  30.0  54000.0        No
3     Lawyer  38.0  61000.0        No
4     Banker  40.0      NaN       Yes
5     Doctor  35.0  58000.0       Yes
6     Lawyer   NaN  52000.0        No
7     Doctor  48.0  79000.0       Yes
8     Banker  50.0  83000.0        No
9     Doctor  37.0  67000.0       Yes


In [4]:
# Feature matrix: all rows, every column except the last one (the target).
X = dataset.iloc[:, :-1].values

In [5]:
# Inspect the extracted features (Profession, Age, Salary); the nan entries are still present here.
print(X)

[['Doctor' 44.0 72000.0]
 ['Lawyer' 27.0 48000.0]
 ['Banker' 30.0 54000.0]
 ['Lawyer' 38.0 61000.0]
 ['Banker' 40.0 nan]
 ['Doctor' 35.0 58000.0]
 ['Lawyer' nan 52000.0]
 ['Doctor' 48.0 79000.0]
 ['Banker' 50.0 83000.0]
 ['Doctor' 37.0 67000.0]]


In [6]:
# Target vector: the last column of the frame (Purchased).
# Selected with -1 instead of a hard-coded position so that it is always exactly
# the column the feature slice (iloc[:, 0:-1]) leaves out, even if columns are
# added to the CSV later.
Y = dataset.iloc[:, -1].values

In [7]:
# Inspect the raw target labels (still the strings 'Yes' / 'No' at this point).
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Imputer that replaces every np.nan with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
# Learn the column means of the numeric features (columns 1 onward: Age, Salary).
# Column 0 (Profession) is text and is excluded.
# NOTE(review): the means are computed over all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the imputed values.
fitting = imputer.fit(X[:, 1:])

In [11]:
# Produce a copy of the numeric columns with each nan replaced by the learned column mean.
transformation = fitting.transform(X[:, 1:])

In [12]:
# Write the imputed numeric columns back into X in place; column 0 (Profession) is untouched.
X[:, 1:] = transformation

In [13]:
# Confirm the missing Age and Salary entries now hold the column means.
print(X)

[['Doctor' 44.0 72000.0]
 ['Lawyer' 27.0 48000.0]
 ['Banker' 30.0 54000.0]
 ['Lawyer' 38.0 61000.0]
 ['Banker' 40.0 63777.77777777778]
 ['Doctor' 35.0 58000.0]
 ['Lawyer' 38.77777777777778 52000.0]
 ['Doctor' 48.0 79000.0]
 ['Banker' 50.0 83000.0]
 ['Doctor' 37.0 67000.0]]


# Encoding categorical variables

## Encoding independent variable

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# One-hot encode column 0 (Profession); every other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [17]:
# Fit the transformer and replace X with the encoded matrix: the one-hot columns come
# first, followed by the passed-through Age and Salary columns.
# NOTE(review): X is overwritten with a function of itself, so this cell is not safe to
# re-run on its own -- a second run would encode column 0 of the already-encoded matrix.
X = np.array(ct.fit_transform(X))

In [18]:
# Inspect the encoded features: three indicator columns followed by Age and Salary.
print(X)

[[0.0 1.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 37.0 67000.0]]


## Encoding dependent variable

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
# Encoder used to turn the string target labels into integer codes.
le = LabelEncoder()

In [21]:
# Replace the string labels with integer codes; on this data 'No' becomes 0 and 'Yes' becomes 1.
Y = le.fit_transform(Y)

In [22]:
# Inspect the integer-encoded target.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the dataset into a training set and a test set

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows (2 of the 10) as a test set; the fixed random_state
# makes the shuffle, and therefore the split, reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [25]:
# Training features (8 rows).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 35.0 58000.0]]


In [26]:
# Held-out test features (2 rows).
print(X_test)

[[1.0 0.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 37.0 67000.0]]


In [27]:
# Training labels, in the same row order as X_train.
print(Y_train)

[0 1 0 0 1 1 0 1]


In [28]:
# Held-out test labels, in the same row order as X_test.
print(Y_test)

[0 1]
