### Importing the libraries needed for preprocessing, model building and evaluation

In [36]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score

In [39]:
# reading the dataset
df = pd.read_csv("Data_for_UCI_named.csv")

In [40]:
#taking a look at our dataset
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [41]:
# checking some statistical overview of the dataset
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [42]:
# checking for missing values
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [43]:
# dropping the stab column as instructed
df = df.drop("stab", axis=1)

In [44]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [45]:
# creating our dependent and independent variable
x = df.drop("stabf", axis=1)
y = df["stabf"]

In [46]:
# converting the categorical column stabf to numerical
y = pd.get_dummies(y)
y = y.drop("unstable", axis=1)

In [60]:
y.shape

(10000, 1)

In [47]:
# spliting the dataset into train and test split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state = 1)

In [48]:
# Standardizing the dataset using standardScaler
sc = StandardScaler()

scaled_x_train = pd.DataFrame(sc.fit_transform(x_train))
scaled_x_train.columns = x_train.columns

scaled_x_test = pd.DataFrame(sc.transform(x_test))
scaled_x_test.columns = x_test.columns

In [49]:
# Creating an instance for the models
rf = RandomForestClassifier(random_state=1)
etc = ExtraTreesClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
lgbm = LGBMClassifier(random_state=1)

### Question 1
### What is the F1 score of this classifier?

In [58]:
precision = (355) / (355+1480)
recall = (355) / (355+45)
f1_score = (2 * precision  *recall) / (precision + recall)
print(round(f1_score, 4))

0.3177


### Question 14
### What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [51]:
rf.fit(scaled_x_train, y_train)
rf_preds = rf.predict(scaled_x_test)
rf_score = accuracy_score(y_test, rf_preds)
print(round(rf_score,4))

  rf.fit(scaled_x_train, y_train)


0.9295


### Question 15
### What is the accuracy on the test set using the xgboost classifier? In 4 decimal places.

In [52]:
xgb.fit(scaled_x_train, y_train)
xgb_preds = xgb.predict(scaled_x_test)
xgb_score = accuracy_score(y_test, xgb_preds)
print(round(xgb_score,4))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9455


### Question 16
### What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [53]:
lgbm.fit(scaled_x_train, y_train)
lgbm_preds = lgbm.predict(scaled_x_test)
lgbm_score = accuracy_score(y_test, lgbm_preds)
print(round(lgbm_score,4))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9395


### Question 17
### Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

Options

In [54]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,

'min_samples_leaf': min_samples_leaf,

'min_samples_split': min_samples_split,

'max_features': max_features}

In [55]:
 rdmsearch = RandomizedSearchCV(etc, param_distributions=hyperparameter_grid,n_jobs=1, n_iter=10, cv=5, verbose=1,scoring="accuracy", random_state=1)

rdmsearch.fit(x_train,y_train)
best_params = rdmsearch.best_params_
best_score = rdmsearch.best_score_

print(best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


### Question 18
### Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [56]:
# training ExtraTreesClassifier without hyperparameter
etc.fit(scaled_x_train, y_train)
etc_preds = etc.predict(scaled_x_test)
etc_score = accuracy_score(y_test, etc_preds)
print(round(etc_score, 4))

  etc.fit(scaled_x_train, y_train)


0.9285


In [57]:
#Training ExtraTreesClassifier with the best parameters
best_param_etc = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=8,max_features=None, random_state=1)
best_param_etc.fit(scaled_x_train, y_train)
best_param_etc_preds = best_param_etc.predict(scaled_x_test)
best_param_etc_score = accuracy_score(y_test, best_param_etc_preds)
print(round(best_param_etc_score, 4))

  best_param_etc.fit(scaled_x_train, y_train)


0.927


### The accuracy is lower

### Question 20
### Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [70]:
features = pd.Series(best_param_etc.feature_importances_, index = x.columns)
features

tau1    0.137240
tau2    0.140508
tau3    0.134680
tau4    0.135417
p1      0.003683
p2      0.005337
p3      0.005429
p4      0.004962
g1      0.102562
g2      0.107578
g3      0.113063
g4      0.109541
dtype: float64

In [71]:
print(features.max(), features.min())

0.14050750384993677 0.003683422151688322


### The max is tau2, and the min is p1