In [None]:
import pandas as pd

In [None]:
df= pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv")

In [None]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [None]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [None]:
df.duplicated().sum()

0

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [None]:
#exploring the dependent variable
df["stabf"].nunique()

2

In [None]:
#check the distribution of the target variable
df["stabf"].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [None]:
# drop stab because of its direct relationship with stabf
df= df.drop("stab", axis =1)

In [None]:
# selecting feature and target variables
X = df.drop(columns='stabf')
y = df['stabf']

In [None]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print('X_train shape: {}'.format(x_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(x_test.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (8000, 12)
y_train shape: (8000,)
X_test shape: (2000, 12)
y_test shape: (2000,)


In [None]:
#standardise train and test set using standard scaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

standardised_train_df = scaler.transform(x_train)
standardised_test_df = scaler.transform(x_test)

In [None]:
# put the data into a data frame
standardised_train_df = pd.DataFrame(standardised_train_df, columns=x_train.columns)
standardised_test_df = pd.DataFrame(standardised_test_df, columns=x_test.columns)
print('standardised_train_df: {}'.format(standardised_train_df.shape))
print('standardised_test_df: {}'.format(standardised_test_df.shape))

standardised_train_df: (8000, 12)
standardised_test_df: (2000, 12)


Random Forest classifier

In [None]:
#train a RandomForestClassifier 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1)
#fit on train set
rfc.fit(standardised_train_df, y_train)
# predictions the test set
rfc_pred = rfc.predict(standardised_test_df)
rfc_pred.shape  

(2000,)

In [None]:
# model performance
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report

#model accuracy
accuracy = accuracy_score(y_test, rfc_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))


#precision
precision = precision_score(y_test, rfc_pred, pos_label='stable')
print('Precision: {}'.format(round(precision*100), 2))  

#recall
recall = recall_score(y_test, rfc_pred, pos_label='stable')
print('Recall: {}'.format(round(recall*100), 2))

#F1 score
f1 = f1_score(y_test, rfc_pred, pos_label='stable')
print('F1: {}'.format(round(f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,rfc_pred, digits =4))

#confusion matrix
rfc_cnf_mat = confusion_matrix(y_test, rfc_pred, labels=['unstable', 'stable'])
print('Confusion Matrix:\n', rfc_cnf_mat)

Accuracy: 93
Precision: 92
Recall: 88
F1: 90
Classification Report:
               precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000

Confusion Matrix:
 [[1233   55]
 [  87  625]]


In [None]:
print("Training set score: {:.4f}".format(rfc.score(standardised_train_df, y_train)))
print("Test set score: {:.4f}".format(rfc.score(standardised_test_df, y_test)))


Training set score: 1.0000
Test set score: 0.9290


Extra Tree classifier

In [None]:
from  sklearn.ensemble import ExtraTreesClassifier
etc = RandomForestClassifier(random_state = 1)
#fit on train set
etc.fit(standardised_train_df, y_train)
# predictions the test set
etc_pred = etc.predict(standardised_test_df)
etc_pred.shape  


(2000,)

In [None]:
# extra tree classifier model performance
#accuracy
accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))


#precision
precision = precision_score(y_test, etc_pred, pos_label='stable')
print('Precision: {}'.format(round(precision*100), 2))  

#recall
recall = recall_score(y_test, etc_pred, pos_label='stable')
print('Recall: {}'.format(round(recall*100), 2))

#F1 score
f1 = f1_score(y_test, etc_pred, pos_label='stable')
print('F1: {}'.format(round(f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,etc_pred, digits =4))

#confusion matrix
etc_cnf_mat = confusion_matrix(y_test, etc_pred, labels=['unstable', 'stable'])
print('Confusion Matrix:\n', etc_cnf_mat)

Accuracy: 93
Precision: 92
Recall: 88
F1: 90
Classification Report:
               precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000

Confusion Matrix:
 [[1233   55]
 [  87  625]]


light gradient boosting classifier

In [None]:
from lightgbm import LGBMClassifier

lgb= LGBMClassifier(random_state = 1)

#fit the model
lgb.fit(standardised_train_df, y_train)

LGBMClassifier(random_state=1)

In [None]:
#predict on test set
lgb_pred = lgb.predict(standardised_test_df)

In [None]:
# Model Performance of Light GBM Classifier
#accuracy
lgb_accuracy = accuracy_score(y_test, lgb_pred)
print('Accuracy: {}'.format(round(lgb_accuracy*100), 2))

#precision
lgb_precision = precision_score(y_test, lgb_pred, pos_label='stable')
print('Precision: {}'.format(round(lgb_precision*100), 2))  

#recall
lgb_recall = recall_score(y_test, lgb_pred, pos_label='stable')
print('Recall: {}'.format(round(lgb_recall*100), 2))

#F1 score
lgb_f1 = f1_score(y_test, lgb_pred, pos_label='stable')
print('F1: {}'.format(round(lgb_f1*100), 2))

#classification report
print('Classification Report:\n', classification_report(y_test,lgb_pred, digits =4))

#confusion matrix
lgb_cnf_mat = confusion_matrix(y_test, lgb_pred)
print('Confusion Matrix:\n', lgb_cnf_mat)

Accuracy: 94
Precision: 93
Recall: 89
F1: 91
Classification Report:
               precision    recall  f1-score   support

      stable     0.9297    0.8919    0.9104       712
    unstable     0.9415    0.9627    0.9520      1288

    accuracy                         0.9375      2000
   macro avg     0.9356    0.9273    0.9312      2000
weighted avg     0.9373    0.9375    0.9372      2000

Confusion Matrix:
 [[ 635   77]
 [  48 1240]]


In [None]:
print("Training set score: {:.4f}".format(lgb.score(standardised_train_df, y_train)))
print("Test set score: {:.4f}".format(lgb.score(standardised_test_df, y_test)))

Training set score: 0.9982
Test set score: 0.9375


XGBOOST 

In [None]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(standardised_train_df, y_train)

TypeError: ignored

In [None]:
xgb = XGBRegressor(objective='reg:squarederror')

In [None]:
#combination of hyperparameters
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}


In [None]:
from sklearn.model_selection import RandomizedSearchCV

#etc = ExtraTreesClassifier(random_state = 1)

#set up randomsearch with 5folds

randomcv = RandomizedSearchCV(estimator = etc, 
                              param_distributions = hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)

NameError: ignored