In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [8]:
data = pd.read_csv('Data_for_UCI_named.csv')
data.drop(columns='stab', inplace=True)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [9]:
from sklearn.model_selection import train_test_split
X = data.drop(columns='stabf')
y = data['stabf']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_test_transformed = scaler.fit_transform(X_test)

In [11]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
rfc = RandomForestClassifier(random_state=1)
etc = ExtraTreesClassifier(random_state=1)
xgbc = XGBClassifier(random_state=1)
lgbmc = LGBMClassifier(random_state=1)

## RandomForestClassifier

In [14]:
rfc.fit(X_train_transformed, y_train)

# predict
rfc_pred = rfc.predict(X_test_transformed)

# Accuracy score
rfc_accuracy = accuracy_score(y_true=y_test, y_pred=rfc_pred)
print(f'RandomForestClassifier Accuracy score: {round(rfc_accuracy, 4)}')

RandomForestClassifier Accuracy score: 0.928


## XGBClassifier

In [21]:
from sklearn.preprocessing import LabelEncoder
train_target =LabelEncoder().fit_transform(y_train)
test_target = LabelEncoder().fit_transform(y_test)

In [25]:
xgbc.fit(X_train_transformed, train_target)

xgb_pred = xgbc.predict(X_test_transformed)

xgb_accuracy = accuracy_score(test_target, xgb_pred)
print(f'XGBRegressor accuracy: {round(xgb_accuracy,4)}')

XGBRegressor accuracy: 0.946


## LGBMClassifier

In [26]:
lgbmc.fit(X_train_transformed, y_train)

In [29]:
lgbm_pred = lgbmc.predict(X_test_transformed)

lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print(f'LGBMClassifier accuracy: {round(lgbm_accuracy,4)}')

LGBMClassifier accuracy: 0.9365


## ExtraTreesClassifier

In [55]:
etc = ExtraTreesClassifier(random_state=1)


etc.fit(X_train_transformed, train_target)

# predict
etc_pred = etc.predict(X_test_transformed)

# Accuracy score
etc_accuracy = accuracy_score(y_true=test_target, y_pred=etc_pred)
print(f'ExtraTreesClassifier Accuracy score: {round(etc_accuracy, 4)}')

ExtraTreesClassifier Accuracy score: 0.932


In [54]:
from sklearn.model_selection import RandomizedSearchCV
# using the parameters in the options
n_estimators = [100, 500, 1000,300]
min_samples_split = [7, 2, 2,5]
min_samples_leaf = [4, 8, 6, 8]
max_features = [None,'log2', 'auto', None]

parameters= {'n_estimators': n_estimators, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'max_features': max_features}


randomcv = RandomizedSearchCV(etc,parameters,cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)


rns_search = randomcv.fit(X_train_transformed, train_target)

# best hyperparameters from the randomized search CV
rns_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 7,
 'min_samples_leaf': 6,
 'max_features': None}

In [56]:
etc_rnd = ExtraTreesClassifier(max_features=None,min_samples_leaf=6, min_samples_split=7, n_estimators=1000, random_state=1)

# fit to data
etc_rnd.fit(X_train_transformed, train_target)

# predict
etc_rnd_pred = etc_rnd.predict(X_test_transformed)

# Accuracy score
etcc_accuracy = accuracy_score(y_true=test_target, y_pred=etc_rnd_pred)
print(f'ExtraTreesClassifier Accuracy score: {etcc_accuracy}')

ExtraTreesClassifier Model Accuracy score: 0.926


## Features Importance

In [62]:
features_importance = etc_rnd.feature_importances_
features_weight = np.c_[X.columns, features_importance]

In [63]:
features_weight

array([['tau1', 0.13546158300736952],
       ['tau2', 0.13842145507674694],
       ['tau3', 0.1331275135198437],
       ['tau4', 0.13396645233095442],
       ['p1', 0.005354328485013283],
       ['p2', 0.0074387105063999205],
       ['p3', 0.007280739349278902],
       ['p4', 0.006874405071581972],
       ['g1', 0.10306405644554334],
       ['g2', 0.1079825123590218],
       ['g3', 0.1123191891901725],
       ['g4', 0.10870905465807375]], dtype=object)

tau2 has highest feature_importances_ while p1 has the lowest.