In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading data
data = pd.read_csv('Data_for_UCI_named.csv')
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
#dropping 'stab' column
data.drop('stab',axis=1,inplace=True)
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [4]:
#Splitting data into predictors and target

X = data.drop('stabf',axis=1)
y = data['stabf']

In [5]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [6]:
#importing algroithms and metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score, classification_report, accuracy_score

NumExpr defaulting to 2 threads.


In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
normalized_x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)

### Question 2


In [8]:
xgb = XGBClassifier(random_state=1, learning_rate = 0.1, max_depth=3)
xgb.fit(normalized_x_train, y_train)
xgb_pred = xgb.predict(normalized_x_test)
print(f'Accuracy: {round(accuracy_score(y_test,xgb_pred),4)}')

Accuracy: 0.9195


### Question 4

In [9]:
forest = RandomForestClassifier(random_state=1)
forest.fit(normalized_x_train,y_train)
rf_pred = forest.predict(normalized_x_test)
print(f'Accuracy: {round(accuracy_score(y_test,rf_pred),4)}')

Accuracy: 0.929


### Question 14

In [10]:
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(normalized_x_train, y_train)
lgbm_pred = lgbm.predict(normalized_x_test)
print(f'Accuracy: {round(accuracy_score(y_test,lgbm_pred),4)}')

Accuracy: 0.9395


In [11]:
tree = ExtraTreesClassifier(random_state=1)
tree.fit(normalized_x_train, y_train)
tree_pred = tree.predict(normalized_x_test)
print(f'Accuracy: {round(accuracy_score(y_test,tree_pred),4)}')

Accuracy: 0.928


In [12]:
#Hyper-parameters

n_estimators = [50,100,300,500,1000]

min_samples_split = [2,3,5,7,9]

min_samples_leaf = [1,2,4,6,8]

max_features = ['auto','sqrt','log2',None]

hyperparameter_grid = {'n_estimators': n_estimators,
                       
                       'min_samples_leaf': min_samples_leaf,
                       
                       'min_samples_split': min_samples_split,
                       
                       'max_features': max_features}

### Question 11

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
rsv = RandomizedSearchCV(estimator=tree,param_distributions=hyperparameter_grid, cv=5, scoring='accuracy',n_iter=10, n_jobs=-1, verbose=1,random_state=1)
search = rsv.fit(normalized_x_train, y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

### Question 3

In [15]:
trees = ExtraTreesClassifier(**search.best_params_, random_state=1)
trees.fit(normalized_x_train,y_train)
trees_pred = trees.predict(normalized_x_test)
print(f'Accuracy: {round(accuracy_score(y_test,trees_pred),4)}')

Accuracy: 0.927


### Question 15

In [16]:
feat_importance = trees.feature_importances_
feat_names = trees.feature_names_in_
rank_importance = pd.Series(feat_importance, index=feat_names).sort_values(ascending=False)
rank_importance

tau2    0.140508
tau1    0.137240
tau4    0.135417
tau3    0.134680
g3      0.113063
g4      0.109541
g2      0.107578
g1      0.102562
p3      0.005429
p2      0.005337
p4      0.004962
p1      0.003683
dtype: float64