# Assignment-7 MA23C011

Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import joblib
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from numpy import random
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from datetime import datetime

## Loading dataset and preproccesing it.

In [2]:
IDA_Challenge_dataset = pd.read_csv('aps_failure_training_set.csv', na_values=['na'])
print('Shape of train dataset:', IDA_Challenge_dataset.shape)
IDA_Challenge_dataset.head()

Shape of train dataset: (60000, 171)


Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706000.0,280.0,0.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,neg,33058,,0.0,,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,neg,41040,,228.0,100.0,0.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,neg,12,0.0,70.0,66.0,0.0,10.0,0.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,neg,60874,,1368.0,458.0,0.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


Removing columns with entries are more than $50\% $ null 

In [3]:
threshold = 0.5
IDA_Challenge_dataset = IDA_Challenge_dataset.dropna(thresh=threshold*int(len(IDA_Challenge_dataset)),axis=1)

In [4]:
IDA_Challenge_dataset.shape

(60000, 163)

In [5]:
removed_column = ['ab_000','bm_000','bn_000','bo_000','bp_000','bq_000','br_000','cr_000']

Removing a column with all entries are same.

In [6]:
single_valued = [col for col in IDA_Challenge_dataset.columns if IDA_Challenge_dataset[col].value_counts().shape[0] == 1]
single_valued

['cd_000']

In [7]:
IDA_Challenge_dataset.drop('cd_000',axis=1,inplace=True)

In [8]:
X = IDA_Challenge_dataset.drop('class',axis=1)
y = IDA_Challenge_dataset['class'].map({'neg':0,'pos':1})

Filling all null entries with median.

In [9]:
# imputing the rest of the dataset with median values
imputer = SimpleImputer(strategy='median')
# fitting the train data
imputer.fit(X)

# transforming both train and test datasets
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)

Preprocessing IDA_Challenge_dataset to $ MinMaxScaler $ 

In [10]:
# declaring the normalizer
normalizer = MinMaxScaler()

# fitting the scaler
normalizer.fit_transform(X_imputed)
X_norm = normalizer.transform(X_imputed)


Splitting IDA_Challenge_dataset into train and test. 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_norm,y,random_state=101,test_size=0.2)

 Cross-validating the best hyper-parameters through $ SVC $

In [21]:
# SVC
svc_params = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
svc_grid = GridSearchCV(SVC(), param_grid=svc_params, cv=5, scoring='f1_macro')
svc_grid.fit(X_train, y_train)
svc_best = svc_grid.best_estimator_

We got SVC best estimator with these Hyperparameters

In [22]:
print(svc_best)

SVC(C=10, kernel='poly')


In [27]:
svcc_pred = svc_best.predict(X_test)
print(classification_report(svcc_pred,y_test),confusion_matrix(svcc_pred,y_test))

              precision    recall  f1-score   support

         neg       1.00      0.99      1.00     11830
         pos       0.68      0.76      0.72       170

    accuracy                           0.99     12000
   macro avg       0.84      0.88      0.86     12000
weighted avg       0.99      0.99      0.99     12000
 [[11770    60]
 [   41   129]]


We got f1-score of $ 0.86 $ by $SVC$ best estimator.

Cross-validating the best hyper-parameters through $ LogisticRegression $

In [18]:
# Logistic Regression
logreg_params = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10], 'solver': ['liblinear']}
logreg_grid = GridSearchCV(LogisticRegression(), param_grid=logreg_params, cv=5, scoring='f1_macro')
logreg_grid.fit(X_train, y_train)
logreg_best = logreg_grid.best_estimator_

We got $LogisticRegression()$ best estimator with these Hyperparameters.

In [19]:
print(logreg_best)

LogisticRegression(C=10, penalty='l1', solver='liblinear')


In [29]:
log_pred = logreg_best.predict(X_test)
print(classification_report(log_pred,y_test),confusion_matrix(log_pred,y_test))

              precision    recall  f1-score   support

         neg       1.00      0.99      1.00     11849
         pos       0.63      0.79      0.70       151

    accuracy                           0.99     12000
   macro avg       0.81      0.89      0.85     12000
weighted avg       0.99      0.99      0.99     12000
 [[11779    70]
 [   32   119]]


We got f1-score of $ 0.85 $ by $ LogisticRegression() $ best estimator.

Cross-validating the best hyper-parameters through $ DecisionTreeClassifier $

In [24]:
# Decision Tree
dt_params = {'max_depth': [10, 15, 25], 'min_samples_leaf': [1, 5, 10]}
dt_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=dt_params, cv=5, scoring='f1_macro')
dt_grid.fit(X_train, y_train)
dt_best = dt_grid.best_estimator_

We got $DecesionTreeClassifier()$ best estimator with these Hyperparameters

In [25]:
print(dt_best)

DecisionTreeClassifier(max_depth=25)


In [30]:
dt_pred = dt_best.predict(X_test)
print(classification_report(dt_pred,y_test),confusion_matrix(dt_pred,y_test))

              precision    recall  f1-score   support

         neg       1.00      1.00      1.00     11816
         pos       0.71      0.73      0.72       184

    accuracy                           0.99     12000
   macro avg       0.85      0.86      0.86     12000
weighted avg       0.99      0.99      0.99     12000
 [[11761    55]
 [   50   134]]


We got f1-score of $ 0.85 $ by $ DecesionTreeClassifier() $ best estimator.

## Task - 2

#### A. Undersampling the majority class

In [12]:
data_1 = IDA_Challenge_dataset[IDA_Challenge_dataset['class']=='neg'].sample(30000)
data_2 = IDA_Challenge_dataset[IDA_Challenge_dataset['class']=='pos']
Undersample_data = pd.concat([data_1,data_2],axis=0)

In [13]:
Undersample_data

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
6328,neg,65252,686.0,548.0,0.0,0.0,0.0,0.0,0.0,0.0,...,488096.0,218974.0,492878.0,590418.0,510358.0,257842.0,266652.0,5662.0,0.0,0.0
17010,neg,49168,200.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0,...,374168.0,197406.0,454650.0,483156.0,273172.0,145050.0,244850.0,24620.0,0.0,0.0
25957,neg,126,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,2554.0,530.0,538.0,1856.0,76.0,0.0,0.0,0.0,0.0,0.0
53415,neg,28,22.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,...,430.0,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27310,neg,92504,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,2523648.0,667486.0,352764.0,20004.0,630.0,186.0,334.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59484,pos,895178,,,,,0.0,0.0,0.0,0.0,...,9116224.0,4276644.0,8701496.0,8082264.0,5827284.0,2057354.0,1662302.0,10790.0,,
59601,pos,862134,,,,,0.0,38834.0,1227952.0,8877294.0,...,3456564.0,1793170.0,4159190.0,5847384.0,8364506.0,12875424.0,661442.0,2458.0,,
59692,pos,186856,,,0.0,0.0,0.0,0.0,4300.0,910488.0,...,2713108.0,800182.0,322322.0,71638.0,34662.0,7304.0,2538.0,0.0,0.0,0.0
59742,pos,605092,,,,,0.0,44320.0,1048970.0,7820828.0,...,3940400.0,1865730.0,3698692.0,3271958.0,9831898.0,3755392.0,65610.0,0.0,,


In [14]:
X1 = Undersample_data.drop('class',axis=1)
y1 = Undersample_data['class'].map({'neg':0,'pos':1})

In [15]:
imputer.fit(X1)
# transforming both train and test datasets
X1_imputed = pd.DataFrame(imputer.transform(X1), columns=X1.columns)
normalizer.fit_transform(X1_imputed)
X1_norm = normalizer.transform(X1_imputed)

In [16]:
X1_norm.shape

(31000, 161)

In [17]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_norm,y1,random_state=101,test_size=0.2)

In [18]:
model_svm = SVC(C=10, kernel='poly')

In [19]:
model_svm.fit(X1_train,y1_train)

SVC performance on undersampled data

In [60]:
pred = model_svm.predict(X1_test)
print(classification_report(pred,y1_test))
pred = model_svm.predict(X_test)
print(classification_report(pred,y_test))
pred_t = model_svm.predict(X_train)
print(classification_report(pred_t,y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6007
           1       0.92      0.97      0.94       193

    accuracy                           1.00      6200
   macro avg       0.96      0.98      0.97      6200
weighted avg       1.00      1.00      1.00      6200

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11820
           1       0.72      0.76      0.74       180

    accuracy                           0.99     12000
   macro avg       0.86      0.88      0.87     12000
weighted avg       0.99      0.99      0.99     12000

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47200
           1       0.92      0.94      0.93       800

    accuracy                           1.00     48000
   macro avg       0.96      0.97      0.96     48000
weighted avg       1.00      1.00      1.00     48000



In [24]:
model_new=DecisionTreeClassifier(max_depth=25,max_leaf_nodes=10)

In [25]:
model_new.fit(X1_train,y1_train)

In [59]:
pred = model_new.predict(X1_test)
print(classification_report(pred,y1_test))
pred = model_new.predict(X_test)
print(classification_report(pred,y_test))
pred_t = model_new.predict(X_train)
print(classification_report(pred_t,y_train))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6055
           1       0.62      0.87      0.72       145

    accuracy                           0.98      6200
   macro avg       0.81      0.93      0.86      6200
weighted avg       0.99      0.98      0.99      6200

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11829
           1       0.70      0.78      0.74       171

    accuracy                           0.99     12000
   macro avg       0.85      0.89      0.87     12000
weighted avg       0.99      0.99      0.99     12000

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     47289
           1       0.65      0.74      0.70       711

    accuracy                           0.99     48000
   macro avg       0.82      0.87      0.85     48000
weighted avg       0.99      0.99      0.99     48000



DecesionTreeClassifier performance on Under sampled data.

In [30]:
model_log1 =LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [31]:
model_log1.fit(X1_train,y1_train)

In [61]:
pred = model_log1.predict(X1_test)
print(classification_report(pred,y1_test))
pred = model_log1.predict(X_test)
print(classification_report(pred,y_test))
pred_t = model_log1.predict(X_train)
print(classification_report(pred_t,y_train))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6036
           1       0.71      0.88      0.78       164

    accuracy                           0.99      6200
   macro avg       0.85      0.93      0.89      6200
weighted avg       0.99      0.99      0.99      6200

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11822
           1       0.69      0.74      0.71       178

    accuracy                           0.99     12000
   macro avg       0.84      0.87      0.85     12000
weighted avg       0.99      0.99      0.99     12000

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47245
           1       0.72      0.77      0.74       755

    accuracy                           0.99     48000
   macro avg       0.86      0.88      0.87     48000
weighted avg       0.99      0.99      0.99     48000



#### B. Using Class weight applying all above models

In [33]:
model_new=DecisionTreeClassifier(class_weight={0:1,1:2.69},max_depth=35,max_leaf_nodes=10)

In [34]:
model_new.fit(X_train,y_train)

In [35]:
pred=model_new.predict(X_test)

In [36]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11811
           1       0.78      0.70      0.74       189

    accuracy                           0.99     12000
   macro avg       0.89      0.85      0.87     12000
weighted avg       0.99      0.99      0.99     12000



In [37]:
model_svm=SVC(C=10,kernel='rbf',class_weight={0:0.4,1:2})

In [38]:
model_svm.fit(X_train,y_train)

In [39]:
pred=model_svm.predict(X_test)

In [40]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11820
           1       0.72      0.76      0.74       180

    accuracy                           0.99     12000
   macro avg       0.86      0.88      0.87     12000
weighted avg       0.99      0.99      0.99     12000



In [41]:
model_log = LogisticRegression(C=10, penalty='l1', solver='liblinear',class_weight={0:1,1:3})

In [42]:
model_log.fit(X_train,y_train)

In [43]:
pred = model_log.predict(X_test)

In [44]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11792
           1       0.73      0.66      0.70       208

    accuracy                           0.99     12000
   macro avg       0.86      0.83      0.85     12000
weighted avg       0.99      0.99      0.99     12000



#### C. Using sample weight hyperparameter to check improvement with all above models

In [45]:
model_new=DecisionTreeClassifier(max_depth=35,max_leaf_nodes=10)

In [46]:
from sklearn.utils.class_weight import compute_sample_weight

# Automatically compute sample weights for unbalanced classes
sample_weights = compute_sample_weight(class_weight={0:1,1:2.69}, y=y_train)

In [47]:
model_new.fit(X_train,y_train,sample_weight=sample_weights)

In [48]:
pred = model_new.predict(X_test)

In [49]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11829
           1       0.70      0.78      0.74       171

    accuracy                           0.99     12000
   macro avg       0.85      0.89      0.87     12000
weighted avg       0.99      0.99      0.99     12000



In [50]:
model_svm_2 = SVC(C=10,kernel='poly')

In [51]:
sample_weights = compute_sample_weight(class_weight={0:0.33,1:1}, y=y_train)

In [52]:
model_svm_2.fit(X_train,y_train,sample_weight=sample_weights)

In [53]:
pred = model_svm_2.predict(X_test)
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11827
           1       0.70      0.77      0.73       173

    accuracy                           0.99     12000
   macro avg       0.85      0.88      0.87     12000
weighted avg       0.99      0.99      0.99     12000



In [54]:
model_log_2 = LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [55]:
sample_weights = compute_sample_weight(class_weight={0:0.33,1:0.9}, y=y_train)

In [56]:
model_log_2.fit(X_train,y_train,sample_weight=sample_weights)

In [57]:
pred = model_log_2.predict(X_test)
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11801
           1       0.71      0.67      0.69       199

    accuracy                           0.99     12000
   macro avg       0.85      0.83      0.84     12000
weighted avg       0.99      0.99      0.99     12000



### D. Now I tried to do oversample the majority and undersample minority then applying all models

In [19]:
X1

Unnamed: 0,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,ag_004,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
6328,65252,686.0,548.0,0.0,0.0,0.0,0.0,0.0,0.0,1124.0,...,488096.0,218974.0,492878.0,590418.0,510358.0,257842.0,266652.0,5662.0,0.0,0.0
17010,49168,200.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0,846.0,...,374168.0,197406.0,454650.0,483156.0,273172.0,145050.0,244850.0,24620.0,0.0,0.0
25957,126,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,178.0,...,2554.0,530.0,538.0,1856.0,76.0,0.0,0.0,0.0,0.0,0.0
53415,28,22.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,...,430.0,10.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27310,92504,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,56576.0,...,2523648.0,667486.0,352764.0,20004.0,630.0,186.0,334.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59484,895178,,,,,0.0,0.0,0.0,0.0,2622.0,...,9116224.0,4276644.0,8701496.0,8082264.0,5827284.0,2057354.0,1662302.0,10790.0,,
59601,862134,,,,,0.0,38834.0,1227952.0,8877294.0,19110112.0,...,3456564.0,1793170.0,4159190.0,5847384.0,8364506.0,12875424.0,661442.0,2458.0,,
59692,186856,,,0.0,0.0,0.0,0.0,4300.0,910488.0,13142612.0,...,2713108.0,800182.0,322322.0,71638.0,34662.0,7304.0,2538.0,0.0,0.0,0.0
59742,605092,,,,,0.0,44320.0,1048970.0,7820828.0,16166970.0,...,3940400.0,1865730.0,3698692.0,3271958.0,9831898.0,3755392.0,65610.0,0.0,,


In [20]:
y1

6328     0
17010    0
25957    0
53415    0
27310    0
        ..
59484    1
59601    1
59692    1
59742    1
59769    1
Name: class, Length: 31000, dtype: int64

In [41]:
# Apply SMOTE to balance the classes in the training set
smote = SMOTE( sampling_strategy= 1, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X1_train, y1_train)

In [42]:
y_train_resampled.value_counts()

class
0    24004
1    24004
Name: count, dtype: int64

In [43]:
new_X_train = pd.DataFrame(X_train_resampled)
new_y_train = pd.DataFrame(y_train_resampled)


In [44]:
df_new = pd.concat([new_X_train,new_y_train],axis=1)

In [45]:
df_neg = df_new[df_new['class'] == 0].sample(20000)

In [46]:
df_pos = df_new[df_new['class'] == 1]

In [47]:
df_final = pd.concat([df_neg,df_pos],axis=0)

In [48]:
X_new_train = df_final.drop('class',axis=1)
y_new_train = df_final['class']

In [49]:
y_new_train.shape

(44004,)

In [None]:
model_svm.fit(X_new_train,y_new_train)

In [39]:
predd = model_svm.predict(X1_test)
print(classification_report(predd,y1_test),confusion_matrix(predd,y1_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5884
           1       0.87      0.56      0.68       316

    accuracy                           0.97      6200
   macro avg       0.92      0.78      0.83      6200
weighted avg       0.97      0.97      0.97      6200
 [[5857   27]
 [ 139  177]]


In [40]:
pred = model_svm.predict(X_test)
print(classification_report(pred,y_test))
pred_t = model_svm.predict(X_train)
print(classification_report(pred_t,y_train))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     11472
           1       0.94      0.34      0.49       528

    accuracy                           0.97     12000
   macro avg       0.95      0.67      0.74     12000
weighted avg       0.97      0.97      0.96     12000

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     45761
           1       0.95      0.34      0.50      2239

    accuracy                           0.97     48000
   macro avg       0.96      0.67      0.74     48000
weighted avg       0.97      0.97      0.96     48000



I got best f1 score is $ 0.88 $. <br>
SVM performs outstanding in it with very few sampling.
