### Import Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb

### Load Dataset

In [2]:
df = pd.read_csv("./Data_for_UCI_named.csv")

In [3]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
# Drop the 'stab' column
df.drop("stab", axis=1, inplace=True)

In [5]:
# Encode the target variable
df['stabf'] = df['stabf'].map({'stable': 1, 'unstable': 0})

In [6]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0


In [7]:
# Split the data into train and test sets
X = df.drop('stabf', axis=1)
y = df['stabf']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
# Split the data using StandardScaler()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Train a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.9295


In [10]:
# Train an Extra Trees Classifier
et_classifier = ExtraTreesClassifier(random_state=1)
et_classifier.fit(X_train, y_train)
et_predictions = et_classifier.predict(X_test)
et_accuracy = accuracy_score(y_test, et_predictions)
print("Extra Trees Accuracy:", et_accuracy)

Extra Trees Accuracy: 0.9285


In [11]:
# Train an XGBoost model
xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print("XGBoost Accuracy:", xgb_accuracy)

XGBoost Accuracy: 0.9455


In [12]:
# Train a LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, lgb_predictions)
print("LightGBM Accuracy:", lgb_accuracy)

LightGBM Accuracy: 0.9395


### Q17

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [13]:
# Define the parameter grid for the randomized search
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the ExtraTreesClassifier estimator
estimator = ExtraTreesClassifier(random_state=1)

# Perform the randomized search
random_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_grid,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit the randomized search to your data
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best hyperparameters:", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}


### Q18

Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [15]:
# Initialize the new ExtraTreesClassifier model with the best hyperparameters from RandomizedSearchCV
new_model = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=None, 
    min_samples_split=2,
    min_samples_leaf=1, 
    max_features='auto',
    bootstrap=True,
    random_state=1
)

# Train the new model on the training data
new_model.fit(X_train, y_train)

# Make predictions with the new model
new_predictions = new_model.predict(X_test)

# Compute the accuracy of the new model
new_accuracy = accuracy_score(y_test, new_predictions)

In [16]:
print("Best hyperparameters Score:", new_accuracy)

Best hyperparameters Score: 0.9225


### Q20

Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [22]:
# Get the feature importances
feature_importances = new_model.feature_importances_
# Create a DataFrame to join the feature importances with column names
feature_importance_df = pd.DataFrame({'Feature': range(X_train.shape[1]), 'Importance': feature_importances})

print(feature_importance_df)

    Feature  Importance
0         0    0.115475
1         1    0.116290
2         2    0.112337
3         3    0.112816
4         4    0.041764
5         5    0.043297
6         6    0.042655
7         7    0.042842
8         8    0.088458
9         9    0.094496
10       10    0.096088
11       11    0.093482


In [24]:
feature_importances

array([0.11547547, 0.11628977, 0.11233702, 0.11281598, 0.04176358,
       0.04329658, 0.04265472, 0.04284235, 0.08845826, 0.094496  ,
       0.09608829, 0.09348199])