In [1]:
# For data wrangling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# For modelling
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# For model evaluation
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'

dataset_df = pd.read_csv(url)

In [3]:
dataset_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [5]:
# Checking for missing values
dataset_df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [6]:
# Checking for the summary statistics of the dataset
dataset_df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


### Data preprocessing

In [7]:
# Checking for imbalance in the target variable

dataset_df.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

The distribution above is imbalanced. To be on a common ground for the quiz, the imbalance will not be handled. The imbalance will be handled using undersample method.

In [8]:
# Splitting the dataset into target and features

features = dataset_df.drop(columns=['stab', 'stabf'])
target = dataset_df['stabf']

In [9]:
# Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

In [10]:
x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156
3671,7.498402,6.697603,8.798626,2.126236,3.134585,-1.581906,-0.589386,-0.963293,0.260826,0.899003,0.964752,0.600598
7427,7.074006,1.337511,6.100756,7.759156,2.526922,-0.92254,-0.6326,-0.971782,0.98458,0.716082,0.836928,0.165162


In [11]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [12]:
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

There is still imbalance in the distribution. This can be handled by using SMOTE to balance the training data.

### Scaling the Features

In [13]:
# Instantiating the Standard Scaler

scaler = StandardScaler()

In [14]:
# Applying Standard Scaler on the training set

x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)

In [15]:
x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [16]:
# Applying Standard Scaler on the testing set

x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

In [17]:
x_test_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.50733,1.084726
1,0.20219,0.374416,-0.1888,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.39566,1.414651,1.226011
2,-1.079044,-0.313745,-0.884634,0.01708,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168
3,-0.08312,-1.107327,0.372805,-1.708152,0.75399,-1.637972,0.403805,-0.088036,0.083322,-1.672322,-0.357714,1.055865
4,0.873921,1.438466,0.086662,1.715037,-0.15388,-0.007015,-0.197053,0.472315,0.136549,-1.469731,0.956396,-0.819727


In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
encoder = LabelEncoder()

In [20]:
y_train = encoder.fit_transform(y_train)

y_test = encoder.fit_transform(y_test)

## QUIZ QUESTIONS

### Question 1

What is the F1 score of the case classifier?

In [21]:
"""
Information given in the confusion matrix:
TP = 355
FP = 1480
FN = 45
TN = 120
"""

# Precision formula: Precision = TP/(TP + FP)

precision = 355/(355 + 1480)

# Recall formula: Recall = TP/(TP + FN)

recall = 355/(355 + 45)

# Compute F1 score
F1_score = 2*(precision*recall)/(precision + recall)

F1_score = round(F1_score, 4)

print(f'F1_score = {F1_score}')

F1_score = 0.3177


### Question 2
Which method can be used as best fit in Logistic Regression?

### Answer 2
Maximum Likehood

### Question 3

Why do we use weak learners in boosting?

### Answer 3
To prevent overfitting

### Question 4

Which confusion matrix represents the model that satisfies the stated requirements?

### Answer 4

In [53]:
# Recall = TP/(TP + FN)
# False Positive Rate (FPR) = FP/(FP + TN)
# Cost = 5*FP + FN

def value_calc(TN,FP,FN,TP):
    recall = TP/(TP + FN)
    fpr = FP/(FP + TN)
    cost = 5*FP + FN
    print(f'Recall = {recall}\n')
    print(f'False Positive Rate = {fpr}\n')
    print(f'Cost = {cost}\n')

In [54]:
# For option A:
# TN = 98, FP = 2, TP = 18, TP = 82

value_calc(98,2,18,82)

Recall = 0.82

False Positive Rate = 0.02

Cost = 28



In [55]:
# For option B:
# TN = 99, FP = 1, FN = 21, TP = 79
value_calc(99,1,21,79)

Recall = 0.79

False Positive Rate = 0.01

Cost = 26



In [56]:
# For option C:
# TN = 96, FP = 4, FN = 10, TP = 90
value_calc(96,4,10,90)

Recall = 0.9

False Positive Rate = 0.04

Cost = 30



In [58]:
# For option D:
# TN = 91, FP = 9, FN = 22, TP = 78
value_calc(91,9,22,78)

Recall = 0.78

False Positive Rate = 0.09

Cost = 67



In [60]:
answers_dict = {'Option A':[0.82,0.02,28], 'Option B':[0.79,0.01,26], 'Option C':[0.9,0.04,30], 'Option D': [0.78,0.09,67]}

answers_df = pd.DataFrame(answers_dict, index=['Recall', 'False_Positive_Rate', 'Cost'])

answers_df

Unnamed: 0,Option A,Option B,Option C,Option D
Recall,0.82,0.79,0.9,0.78
False_Positive_Rate,0.02,0.01,0.04,0.09
Cost,28.0,26.0,30.0,67.0


From the above, option A satistied the stated conditions.

### Question 5
Which would you use to imporove the performance of a classifier?

### Answer 5
Bagging

### Question 6
Which of the following is not an Ensemble model?

### Answer 6
Decision Tree

### Question 7
Which metric should we use to evaluate a classifier that predicts if insurance claims are fraudulent or not?

### Answer 7
Recall

### Question 8
The ROC curve was generated from a classification algorithm. What can we say about the classifier?

### Answer 8
The model has no discrimination capacity to distinguish between the positive and negative class.

### Question 9
Based on the given matrix, which number was predicted with the least accuracy?

### Answer 9
8

### Question 10
A model has 90% accuracy and a poor recall with a training data containing 900 negative and 100 positive instances. What steps can be used to improve the model's performance?

### Answer 10
use bagging method

generate synthetic samples/data using SMOTE

### Question 11
How should you pre-process a label data for developing a machine learning classification algorithm?

### Answer 11
OneHotEncoding

### Question 12
What is the entropy of the target variable if its actual values are given as:
[1,0,1,1,0,1,0]?

### Answer 12

total = 7
number of ones = 4
number of zeros = 3

entropy = -(3/7log(3/7) + 4/7log(4/7))

entropy = -3/7log(3/7) - 4/7log(4/7)

### Question 13
Which of these is not a good metric for evaluating classificatio algorithms for data with imbalanced class problems?

### Answer 13
Accuracy

### Question 14
What is the accuracy on the test set using the random forest classifier? In 4 decimal places

### Solution 14

In [28]:
rand_forest = RandomForestClassifier(random_state = 1)

In [29]:
rand_forest.fit(x_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [30]:
rand_forest_pred = rand_forest.predict(x_test_scaled)

In [63]:
accuracy = round(accuracy_score(y_test, rand_forest_pred), 4)

In [64]:
accuracy

0.929

### Question 15
What is the accuracy on the test set using the xgboost classifier? In 4 decimal places

### Answer 15

In [33]:
xgb_cl = XGBClassifier(random_state=1)

In [34]:
xgb_cl.fit(x_train_scaled, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [35]:
xgb_pred = xgb_cl.predict(x_test_scaled)

xgb_pred

array([1, 1, 0, ..., 0, 1, 1])

In [36]:
acc_score = accuracy_score(y_test, xgb_pred)

In [37]:
acc_score

0.9195

### Question 16
What is the accuracy on the test set using the LGBM classifier? In 4 decimal places

### Answer 16

In [38]:
lgbm_cl = LGBMClassifier(random_state=1)

In [39]:
lgbm_cl.fit(x_train_scaled, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [40]:
lgbm_pred = lgbm_cl.predict(x_test_scaled)

In [41]:
acc_score = accuracy_score(y_test, lgbm_pred)

In [42]:
acc_score

0.9375

### Answer 17

In [43]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

In [44]:
hyperparam_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features':max_features}

In [45]:
ext_trees_cl = ExtraTreesClassifier(random_state=1)

In [65]:
rand_grid_search = RandomizedSearchCV(ext_trees_cl, param_distributions=hyperparam_grid,
                                      random_state=1,
                                      scoring='accuracy',
                                      n_iter=10,
                                      cv=5,
                                      n_jobs=-1,
                                      verbose=1)

In [66]:
rand_grid_search.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  ccp_alpha=0.0,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                        

In [67]:
rand_grid_search.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

### Question 18

Train a new ExtraTreesClassifier with the new hyperparameters and compare the accuracy with the former accuracy.

In [68]:
# For ExtraTreesClassifier with no hyperparameter tuning

ext_trees_cl.fit(x_train_scaled, y_train)

ext_trees_pred = ext_trees_cl.predict(x_test_scaled)

untuned_acc_score = round(accuracy_score(y_test, ext_trees_pred), 4)

untuned_acc_score

0.928

In [69]:
# For ExtraTreesClassifier with hyperparameter tuning

ext_trees_tuned = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2,
                                       min_samples_leaf=8,
                                       max_features=None, random_state=1)

ext_trees_tuned.fit(x_train_scaled, y_train)

new_pred = ext_trees_tuned.predict(x_test_scaled)

tuned_acc_score = accuracy_score(y_test, ext_trees_pred)

tuned_acc_score

0.928

### Question 19
What other hyperparameter optimization methods can be used apart from random search?

### Answer 19
All of the Methods given in the question.

### Question 20
Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

### Answer
tau2, p1

In [51]:
feature_imp = ext_trees_tuned.feature_importances_

feature_imp

array([0.13729124, 0.14081904, 0.13413116, 0.13539377, 0.00368958,
       0.00538171, 0.00538868, 0.00504104, 0.10295388, 0.10854578,
       0.11218334, 0.10918078])

In [52]:
sorted(zip(feature_imp, features), reverse=True)

[(0.14081904109390708, 'tau2'),
 (0.13729123866104237, 'tau1'),
 (0.13539376858074692, 'tau4'),
 (0.13413115854843838, 'tau3'),
 (0.11218334415531182, 'g3'),
 (0.10918078049059556, 'g4'),
 (0.1085457837402933, 'g2'),
 (0.10295387795203104, 'g1'),
 (0.005388675009440006, 'p3'),
 (0.005381711298752081, 'p2'),
 (0.005041044094188226, 'p4'),
 (0.003689576375253394, 'p1')]

From the above, the feature with the highest feature importance is 'tau2' while the feature with the lowest feature importance is 'p1'