In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [134]:
path = './data/Data.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


# Data Imputation (Missing Data Replacement)

In [136]:
def find_percent_missing_data(dataframe):
    for col in df.columns:
        missing_data = dataframe[col].isna().sum()
        missing_percent = (missing_data  / len(dataframe))* 100
        print(f"{col}: {missing_percent}% ")

In [137]:
find_missing_data(df)

Country: 0.0% 
Age: 10.0% 
Salary: 10.0% 
Purchased: 0.0% 


In [138]:
# Input 
# " : " mean get all row
x = df.iloc[ : , : -1]
type(x)

pandas.core.frame.DataFrame

In [139]:
# Convert DataFrame to np array (x usually in np.array type)
x = x.values
type(x)

numpy.ndarray

In [140]:
# Output or prediction
last_column = -1
y = df.iloc[ : , last_column]
# Similar to x, y also usually in np.array type
y = y.values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [141]:
from sklearn.impute import SimpleImputer
# Create an instance of class SimpleImputer: np.nan  is the empty value in the dataset
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# 1:3 column have index 1 to 2
# fit() calculate strategy mean and replace nan with mean of column 1 and 2 
imputer.fit(x[: , 1:3])
# Update x
x[:, 1:3] = imputer.transform(x[:, 1:3])
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encode Categorical Data
## 1: Encode  Independent variable (X)

In [142]:
from sklearn.compose import  ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

column_index = 0
city = ColumnTransformer(transformers= [('encoder', OneHotEncoder(), [column_index] )], remainder = "passthrough")
x  = city.fit_transform(x)

In [155]:
x #         France, Spain,  Ger,     Age,        Salary

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## 2: Encode Dependent Variable (y)

In [144]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [145]:
from sklearn.preprocessing import LabelEncoder

In [146]:
y = LabelEncoder().fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset (x =data, y = output) into the Training set and Test set

In [157]:
from sklearn.model_selection import train_test_split
np.random.seed(42)
# 80 % train and 20 % test
x_train, x_test, y_train, y_test =train_test_split(x, y, test_size = 0.2 )

In [158]:
x_train

array([[1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

In [159]:
x_test

array([[0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0]], dtype=object)

In [160]:
y_train

array([1, 0, 1, 0, 1, 1, 0, 0])

In [161]:
y_test

array([0, 1])

## Feature Scaling

In [162]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[ :, 3:])
x_train

array([[1.0, 0.0, 0.0, -0.7529426005471072, -0.6260377781240918],
       [1.0, 0.0, 0.0, 1.008453807952985, 1.0130429500553495],
       [1.0, 0.0, 0.0, 1.7912966561752484, 1.8325833141450703],
       [0.0, 1.0, 0.0, -1.7314961608249362, -1.0943465576039322],
       [1.0, 0.0, 0.0, -0.3615211764359756, 0.42765697570554906],
       [0.0, 1.0, 0.0, 0.22561095973072184, 0.05040823668012247],
       [0.0, 0.0, 1.0, -0.16581046438040975, -0.27480619351421154],
       [0.0, 0.0, 1.0, -0.013591021670525094, -1.3285009473438525]],
      dtype=object)

In [163]:
x_test[: , 3:] = sc.fit_transform(x_test[ :, 3:])

In [154]:
x_test

array([[ 1.,  1.],
       [-1., -1.]])