<a href="https://colab.research.google.com/github/Moshood-O/Hamoye-Internship/blob/master/StageC_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hamoye Internship (Data Science Track)
# Stage C Code Solutions

## Import Necessary Dependencies

In [1]:
import pandas as pd # data manipulations

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # bagging ensemble models

# boosting ensemble models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# for splitting datasets into training and test sets
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from sklearn.preprocessing import StandardScaler # scaling

## Mount the drive

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read the dataset into a `DataFrame`

In [3]:
grid_df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Data_for_UCI_named.csv")
grid_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


## Get a summary of `grid_df`

In [4]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


## Drop the `stab` column

In [5]:
grid_df.drop(["stab"], axis=1, inplace=True)
grid_df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


## Split the data into training and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = grid_df.drop(["stabf"], axis=1) # define predictors
y = grid_df["stabf"] # define target

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print("X_train", X_train.shape)
print("X_test", X_test.shape)

X_train (8000, 12)
X_test (2000, 12)


## Feature Scaling

In [8]:
scaler = StandardScaler() # instantiate a StandardScaler object

# fit the scaler object to X_train, transform X_train and then store the results in a dataframe
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train, y_train), columns=X_train.columns)

# transform X_test using the fitted scaler object and then store the results in a dataframe
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Question 14

## Random Forest

**What is the accuracy on the test set using the random forest classifier? In 4 decimal places**

## Answer: `0.9295`

## Solution:

### Train a Random Forest model

In [9]:
forest_model = RandomForestClassifier(random_state=1) # create a random forest model

forest_model.fit(X_train_scaled, y_train) # train the model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

### Make predictions with the model on the test set

In [10]:
y_pred_forest = forest_model.predict(X_test_scaled)

### Calculate the accuracy of the model

In [11]:
round(accuracy_score(y_test, y_pred_forest), 4)

0.929

## Question 15

## XGBClassifier

**What is the accuracy on the test set using the xgboost classifier? In 4 decimal places**

## Answer: `0.9195`

## Solution:

### Train an extreme gradient boosting (xgboost) model

In [12]:
xgboost_model = XGBClassifier(random_state=1) # Create an xgboost model

xgboost_model.fit(X_train_scaled, y_train) # Train the model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

### Make predictions with the model

In [13]:
y_pred_xgb = xgboost_model.predict(X_test_scaled)

### Calculate the accuracy of the model

In [14]:
round(accuracy_score(y_test, y_pred_xgb), 4)

0.9195

## Question 16

## LGBMClassifier

**What is the accuracy on the test set using the LGBM classifier? In 4 decimal places**

## Answer: `0.9375`

## Solution:

### Train a light gradient boosting (LGBM) classifier

In [15]:
lgbm_model = LGBMClassifier(random_state=1) # create an lgbm model

lgbm_model.fit(X_train_scaled, y_train) # train the model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### Make predictions with the model

In [16]:
y_pred_lgbm = lgbm_model.predict(X_test_scaled)

### Calculate the accuracy of the model

In [17]:
round(accuracy_score(y_test, y_pred_lgbm))

1.0

## Question 17

## Extra Trees Classifier

**To improve the Extra Trees Classifier, you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV).** 

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

**Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?**

## Answer: `N_estimators = 1000, min_samples_split = 2, min_samples_leaf = 8, `                  `max_features = None`

## Solution:

### Train an Extra Trees model without hyperparameter optimization

In [18]:
extra_trees = ExtraTreesClassifier(random_state=1) # create an extra trees model

extra_trees.fit(X_train_scaled, y_train) # train the model

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

### Make predictions with the model

In [19]:
y_pred_extra_trees = extra_trees.predict(X_test_scaled)

### Calculate the accuracy of the model

In [20]:
extra_trees_accuracy = accuracy_score(y_test, y_pred_extra_trees)
extra_trees_accuracy

0.928

### Perform hyperparameter tuning using RandomizedSearchCV

In [21]:
# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [22]:
new_extra_trees = ExtraTreesClassifier(random_state=1) # create a new extra trees model

# instantiate a RandomizedSearchCV object and conduct a search of the grid
random_search = RandomizedSearchCV(new_extra_trees, hyperparameter_grid, cv=5, n_iter=10, scoring="accuracy",\
                                   verbose=1, n_jobs=-1, random_state=1)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  ccp_alpha=0.0,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                        

### Get the best values of the hyperparameters as found by the random search

In [23]:
optimal_parameters = random_search.best_params_
optimal_parameters

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

## Question 18

**Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?**

## Answer: `lower`

## Solution:

### Train an Extra Trees model based on the optimal hyperparameters as found by the random search

In [24]:
optimal_extra_trees = ExtraTreesClassifier(**optimal_parameters, random_state=1) # create an extra trees model

optimal_extra_trees.fit(X_train_scaled, y_train) # train the model

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features=None,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=8, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=1, verbose=0,
                     warm_start=False)

### Make predictions with the model

In [25]:
y_pred_optimal_extra = optimal_extra_trees.predict(X_test_scaled)

### Calculate the acuracy of the optimal model

In [26]:
accuracy_score(y_test, y_pred_optimal_extra)

0.927

**The initial extra trees model gave an accuracy of 0.928 while the optimized model gave an accuracy of 0.927**

## Question 20

**Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?**

## Answer: `tau2`, `p1`

## Solution:

### Get the importance of each feature using the optimal ExtraTreesClassifier model

In [27]:
feature_importance = optimal_extra_trees.feature_importances_

### Store the results in a `Series` and sort it in descending order

In [28]:
feature_importance = pd.Series(feature_importance, index=X_train_scaled.columns).sort_values(ascending=False)
feature_importance

tau2    0.140508
tau1    0.137240
tau4    0.135417
tau3    0.134680
g3      0.113063
g4      0.109541
g2      0.107578
g1      0.102562
p3      0.005429
p2      0.005337
p4      0.004962
p1      0.003683
dtype: float64

### Get the most and least important features

In [29]:
feature_importance[[0, -1]].index.to_list()

['tau2', 'p1']

# Notebook by: OLALEKE, MOSHOOD ADEGBOYEGA