In [115]:
import pandas as pd
import numpy as np

In [116]:
# Read the dataset CSV straight from its UCI repository URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [117]:
# Preview the first rows to check the column layout and value ranges.
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [118]:
# Count missing values per column.
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

## Following instruction from dataset description

In [119]:
# Target is the categorical `stabf` column; the feature matrix is every
# remaining column once both `stab` and `stabf` are removed.
y = df['stabf']
X = df.drop(['stab', 'stabf'], axis=1)

In [120]:
# Number of samples in each target class.
y.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [121]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [122]:
# Class counts within the held-out test labels.
y_test.value_counts()

unstable    1288
stable       712
Name: stabf, dtype: int64

### scaling the dataset

In [10]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training features in the following cell.
scaler = StandardScaler()

In [11]:
# Fit the scaler on the training features only and standardise them, then
# wrap the resulting array in a DataFrame so the column names are kept.
scaled_x_train = pd.DataFrame(
    scaler.fit(x_train).transform(x_train),
    columns=x_train.columns,
)
scaled_x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [12]:
# Transform the test features with the already-fitted scaler (no refit) and
# restore the column names.
scaled_x_test = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
)

## Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default settings apart from the fixed seed.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X=scaled_x_train, y=y_train)

RandomForestClassifier(random_state=1)

In [14]:
# Random forest predictions for the scaled test features.
pred_forest = rfc.predict(scaled_x_test)

In [15]:
from sklearn.metrics import accuracy_score

# Fraction of test labels the random forest predicted correctly, shown to 4 d.p.
accuracy = accuracy_score(y_true=y_test, y_pred=pred_forest)
print("Accuracy: {}".format(round(accuracy, 4)))

Accuracy: 0.929


## Extra Trees Classifier

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees baseline with library-default settings apart from the fixed seed.
etc = ExtraTreesClassifier(random_state=1)
etc.fit(X=scaled_x_train, y=y_train)

ExtraTreesClassifier(random_state=1)

In [17]:
# Extra-trees predictions for the scaled test features.
pred_tree = etc.predict(scaled_x_test)

In [18]:
# Fraction of test labels the extra-trees model predicted correctly, shown to 4 d.p.
accuracy = accuracy_score(y_true=y_test, y_pred=pred_tree)
print("Accuracy: {}".format(round(accuracy, 4)))

Accuracy: 0.928


## XGBoost Classifier

In [37]:
# import warnings
# warnings.filterwarnings('ignore')

In [38]:
from xgboost import XGBClassifier

# XGBoost classifier with a fixed seed and `logloss` as its evaluation metric.
# NOTE(review): y_train holds the string labels from `stabf`; newer xgboost
# releases appear to require integer-encoded classes - confirm against the
# installed version before re-running.
xgbc = XGBClassifier(eval_metric="logloss", random_state=1)
xgbc.fit(X=scaled_x_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [39]:
# XGBoost predictions for the scaled test features.
pred_xgb = xgbc.predict(scaled_x_test)

In [40]:
# Fraction of test labels XGBoost predicted correctly, shown to 4 d.p.
accuracy = accuracy_score(y_true=y_test, y_pred=pred_xgb)
print("Accuracy: {}".format(round(accuracy, 4)))

Accuracy: 0.9455


## LightGBM

In [41]:
from lightgbm import LGBMClassifier

# LightGBM classifier with library-default settings apart from the fixed seed.
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(X=scaled_x_train, y=y_train)

LGBMClassifier(random_state=1)

In [42]:
# LightGBM predictions for the scaled test features.
pred_lgmb = lgbm.predict(scaled_x_test)

In [43]:
# Fraction of test labels LightGBM predicted correctly, shown to 4 d.p.
accuracy_ = accuracy_score(y_true=y_test, y_pred=pred_lgmb)
print("Accuracy: {}".format(round(accuracy_, 4)))

Accuracy: 0.9395


## Tuning Parameters

In [24]:
# Candidate values sampled by the randomized hyperparameter search.
# NOTE(review): 'auto' for max_features looks deprecated/removed in newer
# scikit-learn releases - confirm against the installed version.
hyperparameter_grid = {
    'n_estimators': [50, 100, 300, 500, 1000],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'min_samples_split': [2, 3, 5, 7, 9],
    'max_features': ['auto', 'sqrt', 'log2', None],
}

# Expose each candidate list under its own name as well.
n_estimators = hyperparameter_grid['n_estimators']
min_samples_split = hyperparameter_grid['min_samples_split']
min_samples_leaf = hyperparameter_grid['min_samples_leaf']
max_features = hyperparameter_grid['max_features']

## Randomized Cross Validation Search

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
# Randomized search over `hyperparameter_grid`, using the extra-trees model
# as the base estimator. The search settings the exercise asks for
# (cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1) are spelled out
# rather than left to library defaults, so the cell documents exactly what
# was run; n_jobs=-1 also lets the fits run in parallel.
ran_search = RandomizedSearchCV(
    estimator=etc,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1,
).fit(scaled_x_train, y_train)

In [27]:
# Hyperparameter combination the randomized search selected.
ran_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

## New classifier using Extra tree

In [28]:
# Train a fresh extra-trees model with the hyperparameters reported by the
# randomized search, then predict on the scaled test features.
tuned_params = {
    'n_estimators': 1000,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'max_features': None,
}
n_classifier = ExtraTreesClassifier(random_state=1, **tuned_params)
n_classifier.fit(X=scaled_x_train, y=y_train)
new_pred = n_classifier.predict(scaled_x_test)

In [29]:
# Fraction of test labels the tuned extra-trees model predicted correctly, shown to 4 d.p.
accuracy = accuracy_score(y_true=y_test, y_pred=new_pred)
print("Accuracy: {}".format(round(accuracy, 4)))

Accuracy: 0.927


## Feature Importance

In [30]:
# Per-feature importance scores taken from the search's best estimator.
imp = ran_search.best_estimator_.feature_importances_
imp

array([0.13723975, 0.1405075 , 0.13468029, 0.13541676, 0.00368342,
       0.00533686, 0.00542927, 0.00496249, 0.10256244, 0.10757765,
       0.11306268, 0.10954089])

In [45]:
# Largest and smallest importance scores across all features.
most_imp_feat, least_imp_feat = imp.max(), imp.min()

In [49]:
# Select the column name(s) whose importance equals each extreme score.
is_most_important = imp == most_imp_feat
is_least_important = imp == least_imp_feat
most = X.columns[is_most_important]
least = X.columns[is_least_important]

In [52]:
# Report the first matching column name for each extreme.
print("most important features is {}".format(most[0]))
print("least important features is {}".format(least[0]))

most important features is tau2
least important features is p1


In [31]:
# Pair each importance score with its column name and list the pairs from
# highest to lowest score.
sorted(zip(imp,X),reverse = True)

[(0.14050750384993677, 'tau2'),
 (0.13723974766109256, 'tau1'),
 (0.1354167630909727, 'tau4'),
 (0.13468028520386593, 'tau3'),
 (0.11306267999167334, 'g3'),
 (0.10954089174337298, 'g4'),
 (0.10757764577478764, 'g2'),
 (0.10256244080927947, 'g1'),
 (0.005429268421191957, 'p3'),
 (0.005336864710946151, 'p2'),
 (0.004962486591192238, 'p4'),
 (0.003683422151688322, 'p1')]

### You are working on a spam classification system using regularized logistic regression. “Spam” is a positive class (y = 1) and “not spam” is the negative class (y = 0). You have trained your classifier and there are n = 2000 examples in the test set. The confusion matrix of predicted class vs. actual class is:

In [2]:
# Confusion-matrix counts used for this question.
TP, TN, FP, FN = 355, 120, 1480, 45
total = sum((TP, TN, FP, FN))

# Standard classification metrics derived from the four counts.
accuracy = (TP + TN) / total
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the metrics in the same order they were computed.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}: {metric_value}")

Accuracy: 0.2375
Precision: 0.19346049046321526
Recall: 0.8875
F1 Score: 0.3176733780760626


### Which method can we use to best fit the data in Logistic Regression?

In [107]:
### maximum likelihood

### Why do we use weak learners in boosting?

In [108]:
### To prevent overfitting

### A data scientist is evaluating different binary classification models. A false positive result is 5 times more expensive (from a business perspective) than a false negative result. The models should be evaluated based on the following criteria:
    1) Must have a recall rate of at least 80%
    2) Must have a false positive rate of 10% or less
    3) Must minimize business costs
### After creating each binary classification model, the data scientist generates the corresponding confusion matrix. Which confusion matrix represents the model that satisfies the requirements?

In [123]:
## using a trial and error method from the options

# A false positive is stated to cost five times as much as a false negative.
FP_COST_WEIGHT = 5

# Confusion-matrix counts for each candidate answer.
options = {"A": {"tN": 98, "fP": 2, "fN": 18, "tP": 82},
           "B": {"tN": 96, "fP": 4, "fN": 10, "tP": 90},
           "C": {"tN": 99, "fP": 1, "fN": 21, "tP": 79},
           "D": {"tN": 91, "fP": 9, "fN": 22, "tP": 78}
          }
rec = []
fp = []
cs = []
## finding recall,fpr,cost given the values from option
for option, v in options.items():
    recall = v["tP"] / (v["tP"] + v["fN"])
    fpr = v["fP"] / (v["fP"] + v["tN"])
    cost = FP_COST_WEIGHT * v["fP"] + v["fN"]
    # Compute each percentage once and reuse it for both the table and the log line.
    recall_pct = recall * 100
    fpr_pct = round(fpr * 100, 2)
    rec.append(recall_pct)
    fp.append(fpr_pct)
    cs.append(cost)
    print(f"Recall {option} = {recall_pct}, FPR {option} = {fpr_pct}, Cost {option} = {cost} ")

# Index rows by the option labels themselves so the table stays aligned with
# `options` if an option is added or removed.
# NOTE: this rebinds `df`, shadowing the dataset DataFrame loaded at the top
# of the notebook; re-run the loading cell before re-running earlier cells.
df = pd.DataFrame({"Recall": rec, "FPR": fp, "Cost": cs}, index=list(options))
df

Recall A = 82.0, FPR A = 2.0, Cost A = 28 
Recall B = 90.0, FPR B = 4.0, Cost B = 30 
Recall C = 79.0, FPR C = 1.0, Cost C = 26 
Recall D = 78.0, FPR D = 9.0, Cost D = 67 


Unnamed: 0,Recall,FPR,Cost
A,82.0,2.0,28
B,90.0,4.0,30
C,79.0,1.0,26
D,78.0,9.0,67


In [124]:
# Apply every stated criterion: recall of at least 80% (inclusive), a
# false-positive rate of 10% or less, and then order by business cost so the
# cheapest qualifying option is listed first.
df[(df["Recall"] >= 80) & (df["FPR"] <= 10)].sort_values("Cost")

Unnamed: 0,Recall,FPR,Cost
A,82.0,2.0,28
B,90.0,4.0,30


### You are building a classifier and the accuracy is poor on both the training and test sets. Which would you use to try to improve the performance?

In [None]:
### Boosting

### Which of the following is not an Ensemble model?

In [111]:
### Decision Tree

### A classifier predicts if insurance claims are fraudulent or not. The cost of paying a fraudulent claim is higher than the cost of investigating a claim that is suspected to be fraudulent. Which metric should we use to evaluate this classifier?

In [112]:
### Recall

### The ROC curve above was generated from a classification algorithm. What can we say about this classifier?

In [113]:
### The model has no discrimination capacity to differentiate between the positive and the negative class

### A random forest classifier was used to classify handwritten digits 0-9 into the numbers they were intended to represent. The confusion matrix below was generated from the results. Based on the matrix, which number was predicted with the least accuracy?

In [114]:
### 8

### A medical company is building a model to predict the occurrence of thyroid cancer. The training data contains 900 negative instances (people who don't have cancer) and 100 positive instances. The resulting model has 90% accuracy, but extremely poor recall. What steps can be used to improve the model's performance? (SELECT TWO OPTIONS)

In [None]:
### Collect more data for the positive case
### Generate synthetic samples/data using SMOTE

### You are developing a machine learning classification algorithm that categorizes handwritten digits 0-9 into the numbers they represent. How should you pre-process the label data?

In [None]:
### Normalization

### What is the entropy of the target variable if its actual values are given as: [1,0,1,1,0,1,0]

In [None]:
### formula: -(Summation [p(x) * log p(x)])
### - 3/7 log(3/7) - 4/7 log(4/7)

### Which of these is not a good metric for evaluating classification algorithms for data with imbalanced class problems?

In [None]:
### Accuracy is not the best metric to use when evaluating imbalanced datasets as it can be misleading

### What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [None]:
### 0.9295

### What is the accuracy on the test set using the xgboost classifier? In 4 decimal places.

In [None]:
### 0.9195

### What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [None]:
### 0.9375

### Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [None]:
### N_estimators = 1000 , min_samples_split = 2 , min_samples_leaf = 8, max_features = None

### Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [None]:
### Lower

### What other hyperparameter optimization methods can you try apart from Random Search?

In [None]:
### All of the above

### Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [None]:
### tau2, p1