In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import shuffle
import sklearn
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, confusion_matrix, classification_report
#from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv')

In [3]:
df.drop(columns=['stab'],inplace=True)

In [4]:
df.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable


In [24]:
## Splitting the data
X=df.drop(columns=['stabf'])
y=df.stabf
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)        

In [25]:
#scale the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_train_scaled=pd.DataFrame(x_train_scaled,columns=X.columns)

x_test_scaled=scaler.transform(x_test)
x_test_scaled=pd.DataFrame(x_test_scaled,columns=X.columns)

In [51]:
## RANDOM FOREST

rfc = RandomForestClassifier(random_state = 1)
#fit on train set
rfc.fit(x_train_scaled, y_train)
rf_pred = rfc.predict(x_test_scaled)
accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))



print("Training set score: {:.4f}".format(rfc.score(x_train_scaled, y_train)))
print("Test set score: {:.4f}".format(rfc.score(x_test_scaled, y_test)))

Accuracy: 93
Training set score: 1.0000
Test set score: 0.9290


In [50]:
## EXTRA TREES
ETC = ExtraTreesClassifier(random_state = 1)
#fit on train set
ETC.fit(x_train_scaled, y_train)
etc_pred = ETC.predict(x_test_scaled)
accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}%'.format(round(accuracy*100), 2))

print("Training set score: {:.3f}".format(ETC.score(x_train_scaled, y_train)))
print("Test set score: {:.3f}".format(ETC.score(x_test_scaled, y_test)))

Accuracy: 93%
Training set score: 1.000
Test set score: 0.928


In [49]:
## XGBOOST

XGB = XGBClassifier(random_state = 1)
#fit on train set
XGB.fit(x_train_scaled, y_train)
xgb_pred = XGB.predict(x_test_scaled)
accuracy = accuracy_score(y_test, xgb_pred)
print('Accuracy: {}%'.format(round(accuracy*100), 4))


print("Training set score: {:.4f}".format(XGB.score(x_train_scaled, y_train)))
print("Test set score: {:.4f}".format(XGB.score(x_test_scaled, y_test)))



Accuracy: 95%
Training set score: 1.0000
Test set score: 0.9455


In [48]:
## LGBM

LGBM =LGBMClassifier(random_state = 1)
#fit on train set
LGBM.fit(x_train_scaled, y_train)
lgbm_pred = LGBM.predict(x_test_scaled)
accuracy = accuracy_score(y_test, lgbm_pred)
print('Accuracy: {}%'.format(round(accuracy*100), 4))

print("Training set score: {:.4f}".format(LGBM.score(x_train_scaled, y_train)))
print("Test set score: {:.4f}".format(LGBM.score(x_test_scaled, y_test)))

Accuracy: 94%
Training set score: 0.9979
Test set score: 0.9395


In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', None, None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [31]:
randomcv = RandomizedSearchCV(estimator = ETC, 
                              param_distributions = hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)

In [32]:
search = randomcv.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [33]:
#get best parameters
search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [34]:
etc2=ExtraTreesClassifier(n_estimators=1000,min_samples_split=2,min_samples_leaf=8,max_features=None)
etc2.fit(x_train_scaled, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000)

In [35]:
etc2.score(x_test_scaled, y_test)

0.9255

In [47]:
pd.Series(etc2.feature_importances_, index=etc2.feature_names_in_).sort_values()

p1      0.003616
p4      0.005157
p3      0.005355
p2      0.005357
g1      0.102714
g2      0.108078
g4      0.109333
g3      0.112138
tau3    0.134175
tau4    0.135490
tau1    0.137910
tau2    0.140675
dtype: float64