In [304]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [305]:
# load dataset
df = pd.read_csv('Data (1).csv')
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


# Data imputation

In [307]:
for col in df.columns:
    missing_value = df[col].isna().sum()
    missing_percent = missing_value/len(df)*100
    print(f'Column {col} has {missing_percent}% missing value')

Column Country has 0.0% missing value
Column Age has 10.0% missing value
Column Salary has 10.0% missing value
Column Purchased has 0.0% missing value


In [308]:
from sklearn.impute import SimpleImputer
# Create an instance
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

#split dataset 
x = df.iloc[:, :-1]
y = df.iloc[:, -1]



In [309]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [310]:
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [311]:
# fit on the dataset to caculate the statistic for each column
imputer.fit(x[['Age', 'Salary']])

In [312]:
# transform missing data
x[['Age', 'Salary']] = imputer.transform(x[['Age', 'Salary']])

In [313]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


# ENCODE DEPENDENT VARIABLE

## encode independent variable

In [314]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# remainder = 'passthrough', to keep the cols which not be transformd, otherwise, the remaining will  be
# create an instance
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')


In [315]:
# convert x form dataset to matrix
x = x.values
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [316]:
# convert x from matrix to np.array
x = np.array(ct.fit_transform(x))
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

## encode dependent variable

In [317]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
y = lb.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# split the dataset

In [318]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)

In [319]:
x_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

# FEATURE SCALING

In [320]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.fit_transform(x_test[:, 3:])
x_test

array([[0.0, 1.0, 0.0, -1.3880272079128577, -0.5513801778287937],
       [1.0, 0.0, 0.0, 0.4594174561401711, 1.40351317992784],
       [0.0, 0.0, 1.0, 0.9286097517726866, -0.8521330020990451]],
      dtype=object)

In [321]:
x_train

array([[0.0, 1.0, 0.0, -0.038910211282047996, -0.22960023388015188],
       [1.0, 0.0, 0.0, 0.5058327466666259, 0.49120534884662787],
       [0.0, 0.0, 1.0, -0.3112816902563849, -0.4731156334500103],
       [0.0, 0.0, 1.0, -1.809324824615238, -1.6127677034369463],
       [1.0, 0.0, 0.0, 1.0505757046152997, 1.1048641557626704],
       [0.0, 1.0, 0.0, 1.3229471835896367, 1.455526331143266],
       [1.0, 0.0, 0.0, -0.7198389087178904, -0.736112264985457]],
      dtype=object)

# TRAINING MACHINE LEARNING MODEL

In [322]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x, y)

# evaluate model

In [323]:
# evalute on training set
lr.score(x_train, y_train)

0.7142857142857143

In [324]:
# evalute on test set
lr.score(x_test, y_test)


0.0

In [325]:
y_pred = lr.predict(x_test)

In [326]:
pd.DataFrame({'y_test': y_test, 'y_pred' : y_pred})

Unnamed: 0,y_test,y_pred
0,0,1
1,1,0
2,0,1


In [None]:
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor
dt_model.fit(x_train, y_train)
y_preds = dt_model