David Apine Kwenev  
ID: 14d441dfa201f000

Data Source: https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw CSV on the UCI repository (same URL as the "Data Source" note above).
data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"
)

In [3]:
# Read the CSV at data_url into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows.
data_df.head(n=5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


### Dropping one of the dependent variables (`stab`)
Because of the direct relationship between 'stab' and 'stabf' ('stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise), 'stab' should be dropped and 'stabf' will remain as the sole dependent variable (binary classification).

In [4]:
# Drop "stab": it determines "stabf" directly (see the note above), so only
# "stabf" is kept as the target.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell idempotent: re-running it once "stab" is already gone is a
# no-op rather than a KeyError.
data_df = data_df.drop(columns='stab', errors='ignore')

In [5]:
# Show the first two rows of the current frame.
data_df.head(n=2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable


In [6]:
# Relative frequency of each class in the target column.
data_df['stabf'].value_counts(normalize=True)

unstable    0.638
stable      0.362
Name: stabf, dtype: float64

### Separating independent features and target

In [7]:
# Target: the "stabf" label column.
y = data_df['stabf']

# Features: every remaining column.
X = data_df.drop(columns=['stabf'])

### Splitting the data into train and test sets

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Scaling the data

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; the same fitted object is
# reused later to transform the test features.
scaler = StandardScaler()
norm_x_train = scaler.fit_transform(X_train)

# DataFrame view of the scaled training data (matter of preference).
# X_train's index is carried over so the rows stay aligned by label with
# y_train; without it the frame would get a fresh 0..n-1 index that no longer
# matches the shuffled index of y_train.
norm_x_train_df = pd.DataFrame(
    norm_x_train,
    columns=X_train.columns,
    index=X_train.index,
)

### Transforming the test data using the above StandardScaler object

In [10]:
# Scale the test features with the scaler that was fitted on the training data
# (transform only, no re-fitting).
norm_x_test = scaler.transform(X_test)

# DataFrame view of the scaled test data (again, preference).
# X_test's index is carried over so the rows stay aligned by label with y_test.
norm_x_test_df = pd.DataFrame(
    norm_x_test,
    columns=X_test.columns,
    index=X_test.index,
)

### Modelling

In [11]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

#### Random Forest Classifier

In [12]:
# Random forest with default hyperparameters, trained on the scaled training array.
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(X=norm_x_train, y=y_train)

# Predicted labels for the scaled test array.
rf_predictions = rf_clf.predict(X=norm_x_test)

In [13]:
# Score the random forest predictions against the held-out labels.
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Report: header, rule, then accuracy to four decimals.
print('RandomForestClassifier:', '=='*11, sep='\n')
print('Accuracy: {:.4f}'.format(rf_accuracy))


RandomForestClassifier:
Accuracy: 0.9290


#### Extra Trees Classifier

In [14]:
# Extra-trees with default hyperparameters, trained on the scaled training DataFrame.
ext_clf = ExtraTreesClassifier(random_state=1)
ext_clf.fit(X=norm_x_train_df, y=y_train)

# Predicted labels for the scaled test DataFrame.
ext_predictions = ext_clf.predict(X=norm_x_test_df)

In [15]:
# Score the extra-trees predictions against the held-out labels.
ext_accuracy = accuracy_score(y_test, ext_predictions)

# Report: header, rule, then accuracy to four decimals.
print('ExtraTreesClassifier:', '=='*10, sep='\n')
print('Accuracy: {:.4f}'.format(ext_accuracy))


ExtraTreesClassifier:
Accuracy: 0.9280


#### Question 17
Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1.   
What are the best hyperparameters from the randomized search CV?

In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Fresh, unfitted estimator for the search.
ext_clf = ExtraTreesClassifier(random_state=1)

# Candidate values for each hyperparameter.
params = dict(
    n_estimators=[100, 300, 500, 1000],
    min_samples_split=[2, 5, 7],
    min_samples_leaf=[4, 6, 8],
    max_features=["sqrt", "log2", None],
)

# 10 sampled candidates, each scored by accuracy with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=ext_clf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1,
)
random_search.fit(X=norm_x_train_df, y=y_train)

# Report the winning combination.
print('Best hyperparameters:', '=='*11, sep='\n')
print(random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters:
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': None}


In [17]:
# Predict on the scaled test DataFrame through the fitted search object,
# then score those predictions against the held-out labels.
rdm_src_pred = random_search.predict(X=norm_x_test_df)
rdm_src_accuracy = accuracy_score(y_test, rdm_src_pred)
print('Accuracy: {:.4f}'.format(rdm_src_accuracy))

Accuracy: 0.9295


#### Question 20

Find the feature importance using the optimal ExtraTreesClassifier model.  
Which features are the most and least important respectively?

In [18]:
# Importance score per feature from the best estimator found by the search.
important_features = random_search.best_estimator_.feature_importances_

# Positions of the features ordered from highest to lowest importance
# (ascending sort of the negated scores).
indices = np.argsort(-important_features)

# Column names in that order: most important first, least important last.
norm_x_test_df.columns[indices]

Index(['tau2', 'tau1', 'tau4', 'tau3', 'g3', 'g4', 'g2', 'g1', 'p2', 'p3',
       'p4', 'p1'],
      dtype='object')

#### XGBClassifier

In [19]:
# XGBClassifier expects the labels to be [0 1] instead of ['stable' 'unstable'],
# so encode "stable" as 1 and "unstable" as 0 for both the train and test labels.
xgb_y_train = y_train.eq('stable').astype(int)
xgb_y_test = y_test.eq('stable').astype(int)

In [20]:
# Train XGBClassifier on the scaled training array with the 0/1-encoded labels.
xgb_clf = XGBClassifier(random_state=1)
xgb_clf.fit(X=norm_x_train, y=xgb_y_train)

# Predicted 0/1 labels for the scaled test array.
xgb_predictions = xgb_clf.predict(X=norm_x_test)

In [21]:
# Score the XGBoost predictions against the 0/1-encoded held-out labels.
xgb_accuracy = accuracy_score(xgb_y_test, xgb_predictions)

# Report: header, rule, then accuracy to four decimals.
print('XGBClassifier:', '=='*7, sep='\n')
print('Accuracy: {:.4f}'.format(xgb_accuracy))


XGBClassifier:
Accuracy: 0.9455


#### LightGBM Classifier

In [22]:
# Train LGBMClassifier on the scaled training array with the original string labels.
lgbm_clf = LGBMClassifier(random_state=1)
lgbm_clf.fit(X=norm_x_train, y=y_train)

# Predicted labels for the scaled test array.
lgbm_predictions = lgbm_clf.predict(X=norm_x_test)

In [23]:
# Score the LightGBM predictions against the held-out labels.
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)

# Report: header, rule, then accuracy to four decimals.
print('LightGBMClassifier:', '=='*10, sep='\n')
print('Accuracy: {:.4f}'.format(lgbm_accuracy))


LightGBMClassifier:
Accuracy: 0.9395
