# Imbalanced Data Processing Techniques Evaluation

In [None]:
# Install and enable ipython-autotime; it is what prints the "time: ..." line
# after every cell in this notebook.
!pip install ipython-autotime
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
time: 704 µs (started: 2023-01-27 08:32:43 +00:00)


In [None]:
# Basic Libraries

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime

# Colab only: mount Google Drive so files under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 7.89 s (started: 2023-01-27 08:32:43 +00:00)


In [None]:
# Data Source

# Read the cleaned CSV from the mounted Drive and, in the same chain, discard
# the 'Unnamed: 0' column (presumably a saved index -- confirm) and BRIGHTNESS
# (deprecated feature).
df = (
    pd.read_csv("/content/drive/MyDrive/cleaned_gee_data.csv")
      .drop(columns=['Unnamed: 0', 'BRIGHTNESS'])
)
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,ACQ_DATE,ACQ_TIME,OPEN_TIME,CLOSE_TIME,FIRE_OCCURRED,CO_MOL/M2,SO2_MOL/M2,NO2_MOL/M2,O3_MOL/M2,LOCATION,INSTRUMENT,DRY_SEASON
0,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,-0.024223,-0.47444,-1.152277,-0.511001,-1.159086,0,1
1,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,0.113599,-0.47444,-1.152277,-0.511001,-1.159086,0,1
2,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,-0.024223,-0.47444,-1.361255,-0.511001,-1.159086,0,1
3,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,0.113599,-0.47444,-1.361255,-0.511001,-1.159086,0,1
4,-5.433352,-0.197441,-1.723773,0.634294,2.28608,1.793843,0,-0.967684,0.339667,-1.25177,0.426114,-1.159086,0,1


time: 970 ms (started: 2023-01-27 08:32:51 +00:00)


In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171893 entries, 0 to 171892
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   LATITUDE       171893 non-null  float64
 1   LONGITUDE      171893 non-null  float64
 2   ACQ_DATE       171893 non-null  float64
 3   ACQ_TIME       171893 non-null  float64
 4   OPEN_TIME      171893 non-null  float64
 5   CLOSE_TIME     171893 non-null  float64
 6   FIRE_OCCURRED  171893 non-null  int64  
 7   CO_MOL/M2      171893 non-null  float64
 8   SO2_MOL/M2     171893 non-null  float64
 9   NO2_MOL/M2     171893 non-null  float64
 10  O3_MOL/M2      171893 non-null  float64
 11  LOCATION       171893 non-null  float64
 12  INSTRUMENT     171893 non-null  int64  
 13  DRY_SEASON     171893 non-null  int64  
dtypes: float64(11), int64(3)
memory usage: 18.4 MB
time: 20.4 ms (started: 2023-01-27 08:32:52 +00:00)


In [None]:
# Class balance of the target column over the full dataset.
display(df['FIRE_OCCURRED'].value_counts())

0    170544
1      1349
Name: FIRE_OCCURRED, dtype: int64

time: 12.6 ms (started: 2023-01-27 08:32:52 +00:00)


In [None]:
# Target vector: the FIRE_OCCURRED label column.
y = df['FIRE_OCCURRED']
# Feature matrix: every remaining column.
X = df.drop(columns='FIRE_OCCURRED')

time: 5.36 ms (started: 2023-01-27 08:32:52 +00:00)


In [None]:
# Training, Testing Split

from sklearn.model_selection import train_test_split

# 90:10
#
# The split is computed ONCE. Previously the identical call (same data, same
# random_state) was repeated five times, which yields five identical splits at
# five times the cost. Each resampling strategy gets its own independent copy
# so it can rebalance its training part without touching the others.
#
# NOTE(review): the split is not stratified, so the share of FIRE_OCCURRED == 1
# rows in the test part is left to chance. Passing stratify=y would fix that
# but changes every downstream number, so it is left as-is here.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=10, shuffle=True
) # Default


def _copy_split():
  """Return independent copies of (X_train, X_test, y_train, y_test)."""
  return X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()


X_train_SMOTE, X_test_SMOTE, y_train_SMOTE, y_test_SMOTE = _copy_split() # SMOTE
X_train_OVER, X_test_OVER, y_train_OVER, y_test_OVER = _copy_split() # Over
X_train_UNDER, X_test_UNDER, y_train_UNDER, y_test_UNDER = _copy_split() # Under
X_train_ALL, X_test_ALL, y_train_ALL, y_test_ALL = _copy_split() # Under and SMOTE

time: 345 ms (started: 2023-01-27 08:32:52 +00:00)


In [None]:
# Sanity check: features and labels must have the same number of rows in both
# the training and the testing part of the split.
n_train_rows = len(X_train_SMOTE)
n_test_rows = len(X_test_SMOTE)
lengths_match = n_train_rows == len(y_train_SMOTE) and n_test_rows == len(y_test_SMOTE)

print("X and y data length matching" if lengths_match else "Error in data preparation pipeline")
print()
print("No. of training data = %d" % n_train_rows)
print("No. of testing data = %d" % n_test_rows)

X and y data length matching

No. of training data = 154703
No. of testing data = 17190
time: 5.56 ms (started: 2023-01-27 08:32:52 +00:00)


In [None]:
# Class counts in the test labels produced by the split.
display(y_test_SMOTE.value_counts())

0    17059
1      131
Name: FIRE_OCCURRED, dtype: int64

time: 13.3 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# SMOTE

from collections import Counter
from imblearn.over_sampling import SMOTE

# Seeded sampler so the resampled training set is reproducible.
sm = SMOTE(random_state=10)

# Report the class counts before and after resampling; only the training part
# of the split is resampled, the test part is not touched here.
print('Original dataset shape %s' % Counter(y_train_SMOTE))
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_SMOTE, y_train_SMOTE)
print('Resampled dataset shape %s' % Counter(y_train_SMOTE))

Original dataset shape Counter({0: 153485, 1: 1218})
Resampled dataset shape Counter({0: 153485, 1: 153485})
time: 560 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Oversampling

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Seeded random oversampler applied to the training part of the split only.
ros = RandomOverSampler(random_state=10)
X_train_OVER, y_train_OVER = ros.fit_resample(X_train_OVER, y_train_OVER)

# Class counts after oversampling, ordered by class label.
over_class_counts = sorted(Counter(y_train_OVER).items())
print(over_class_counts)

[(0, 153485), (1, 153485)]
time: 161 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Undersampling

from imblearn.under_sampling import RandomUnderSampler

# Seeded random undersampler applied to the training part of the split only.
rus = RandomUnderSampler(random_state=10)
X_train_UNDER, y_train_UNDER = rus.fit_resample(X_train_UNDER, y_train_UNDER)

# Class counts after undersampling, ordered by class label.
# (Counter comes from the import in an earlier cell.)
under_class_counts = sorted(Counter(y_train_UNDER).items())
print(under_class_counts)

[(0, 1218), (1, 1218)]
time: 51.8 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# All

from sklearn.utils import resample

# Number of majority-class (FIRE_OCCURRED == 0) rows to keep before SMOTE.
sample_size = 10000

# Re-attach the target so features and label are sampled together.
combined_df = pd.concat([X_train_ALL, y_train_ALL], axis=1)
class_0 = combined_df[combined_df['FIRE_OCCURRED'] == 0]
class_1 = combined_df[combined_df['FIRE_OCCURRED'] == 1]

# sklearn's resample() defaults to replace=True, i.e. a bootstrap. That put
# duplicate rows into the undersampled majority class and, worse, replaced the
# minority class by a bootstrap of itself (dropping a large share of its
# unique rows and duplicating others). Sample WITHOUT replacement instead.
# min() keeps the call valid when fewer than sample_size rows are available,
# which replace=False would otherwise reject.
test_class_resampled_0 = resample(class_0,
                                  replace=False,
                                  n_samples=min(sample_size, len(class_0)),
                                  random_state=10)
# No n_samples: every minority row is kept exactly once, just shuffled.
test_class_resampled_1 = resample(class_1, replace=False, random_state=10)

# Select the target by name rather than by "last column" position.
X_resampled_0 = test_class_resampled_0.drop(columns='FIRE_OCCURRED')
X_resampled_1 = test_class_resampled_1.drop(columns='FIRE_OCCURRED')
X_resampled = pd.concat([X_resampled_0, X_resampled_1], ignore_index=True)

y_resampled_0 = test_class_resampled_0['FIRE_OCCURRED']
y_resampled_1 = test_class_resampled_1['FIRE_OCCURRED']
y_resampled = pd.concat([y_resampled_0, y_resampled_1], ignore_index=True)

time: 29.4 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Class counts of the manually resampled labels.
y_resampled.value_counts()

0    10000
1     1218
Name: FIRE_OCCURRED, dtype: int64

time: 6.9 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Rebind the "ALL" training split to the manually resampled features/labels.
X_train_ALL = X_resampled
y_train_ALL = y_resampled

time: 496 µs (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Second stage of the combined strategy: run SMOTE on the "ALL" training split.
sm = SMOTE(random_state=10)

# Class counts are printed before and after so the effect is visible.
print('Original dataset shape %s' % Counter(y_train_ALL))
X_train_ALL, y_train_ALL = sm.fit_resample(X_train_ALL, y_train_ALL)
print('Resampled dataset shape %s' % Counter(y_train_ALL))

Original dataset shape Counter({0: 10000, 1: 1218})
Resampled dataset shape Counter({0: 10000, 1: 10000})
time: 48.6 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Evaluation Metrics

from sklearn.metrics import confusion_matrix, recall_score, f1_score, roc_auc_score, accuracy_score

def evaluation_metrics(y_true, y_pred, y_score=None):
  """Compute the evaluation metrics for one set of predictions.

  Parameters
  ----------
  y_true : array-like
    Ground-truth class labels.
  y_pred : array-like
    Predicted class labels (hard predictions).
  y_score : array-like, optional
    Scores for the positive class, e.g. ``clf.predict_proba(X)[:, 1]``.
    When given, ROC AUC is computed from these scores. When omitted, ROC AUC
    is computed from ``y_pred`` exactly as before; with hard labels the ROC
    curve has a single operating point, so the value is not a ranking-based
    AUC.

  Returns
  -------
  list
    ``[confusion_matrix_flat, accuracy, recall, f1, roc_auc]``. The confusion
    matrix is flattened with ``ravel()``; per the scikit-learn documentation
    that is ``[tn, fp, fn, tp]`` for labels 0/1.
  """
  cfm = confusion_matrix(y_true, y_pred).ravel()
  acc = accuracy_score(y_true, y_pred)
  recs = recall_score(y_true, y_pred, average='binary')
  f1s = f1_score(y_true, y_pred, average='binary')
  # Prefer continuous scores when the caller supplies them.
  auc_input = y_pred if y_score is None else y_score
  rocs = roc_auc_score(y_true, auc_input, average='macro')
  return [cfm, acc, recs, f1s, rocs]

time: 3.4 ms (started: 2023-01-27 08:32:53 +00:00)


In [None]:
# Store Model Parameters and Eval

# Column layout of the two result tables; each model cell adds one row to both.
_MODEL_COLUMNS = ['model_name', 'model', 'parameters']
_TEST_COLUMNS = ['model_name', 'confusion_matrix', 'accuracy',
                 'recall', 'f1_score', 'roc_auc_score']

# Fitted estimators with their parameters.
models_final = pd.DataFrame(columns=_MODEL_COLUMNS)
# Test-set scores per model.
models_test = pd.DataFrame(columns=_TEST_COLUMNS)

time: 8.51 ms (started: 2023-01-27 08:32:53 +00:00)


## XGBoost Model

All models in this section are trained with XGBoost's default parameters.

In [None]:
from xgboost import XGBClassifier

time: 15 ms (started: 2023-01-27 08:32:54 +00:00)


In [None]:
name = 'xgboost_clf'

# Baseline: default-parameter XGBoost fitted on the original training split.
xgboost_clf = XGBClassifier().fit(X_train,y_train) # Default

y_true = y_test
y_pred = xgboost_clf.predict(X_test)
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; appending a
# one-row frame with pd.concat is the supported equivalent.
# NOTE: re-running this cell adds another row for the same model name.
models_final = pd.concat([models_final,
                          pd.DataFrame([{'model_name': name,
                                         'model': xgboost_clf,
                                         'parameters': xgboost_clf.get_params()}])],
                         ignore_index=True)

models_test = pd.concat([models_test,
                         pd.DataFrame([{'model_name': name,
                                        'confusion_matrix' : evaluation_results[0],
                                        'accuracy': evaluation_results[1],
                                        'recall' : evaluation_results[2],
                                        'f1_score': evaluation_results[3],
                                        'roc_auc_score': evaluation_results[4]}])],
                        ignore_index=True)

time: 17.5 s (started: 2023-01-27 08:32:54 +00:00)


In [None]:
name = 'xgboost_clf_SMOTE'

# Default-parameter XGBoost fitted on the SMOTE training split and scored on
# the matching test split.
xgboost_clf = XGBClassifier().fit(X_train_SMOTE,y_train_SMOTE) # Default

y_true = y_test_SMOTE
y_pred = xgboost_clf.predict(X_test_SMOTE)
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; appending a
# one-row frame with pd.concat is the supported equivalent.
# NOTE: re-running this cell adds another row for the same model name.
models_final = pd.concat([models_final,
                          pd.DataFrame([{'model_name': name,
                                         'model': xgboost_clf,
                                         'parameters': xgboost_clf.get_params()}])],
                         ignore_index=True)

models_test = pd.concat([models_test,
                         pd.DataFrame([{'model_name': name,
                                        'confusion_matrix' : evaluation_results[0],
                                        'accuracy': evaluation_results[1],
                                        'recall' : evaluation_results[2],
                                        'f1_score': evaluation_results[3],
                                        'roc_auc_score': evaluation_results[4]}])],
                        ignore_index=True)

time: 30.7 s (started: 2023-01-27 08:33:11 +00:00)


In [None]:
name = 'xgboost_clf_OVER'

# Default-parameter XGBoost fitted on the OVER (random oversampling) training
# split and scored on the matching test split.
xgboost_clf = XGBClassifier().fit(X_train_OVER,y_train_OVER) # Default

y_true = y_test_OVER
y_pred = xgboost_clf.predict(X_test_OVER)
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; appending a
# one-row frame with pd.concat is the supported equivalent.
# NOTE: re-running this cell adds another row for the same model name.
models_final = pd.concat([models_final,
                          pd.DataFrame([{'model_name': name,
                                         'model': xgboost_clf,
                                         'parameters': xgboost_clf.get_params()}])],
                         ignore_index=True)

models_test = pd.concat([models_test,
                         pd.DataFrame([{'model_name': name,
                                        'confusion_matrix' : evaluation_results[0],
                                        'accuracy': evaluation_results[1],
                                        'recall' : evaluation_results[2],
                                        'f1_score': evaluation_results[3],
                                        'roc_auc_score': evaluation_results[4]}])],
                        ignore_index=True)

time: 23.8 s (started: 2023-01-27 08:33:42 +00:00)


In [None]:
name = 'xgboost_clf_UNDER'

# Default-parameter XGBoost fitted on the UNDER (random undersampling) training
# split and scored on the matching test split.
xgboost_clf = XGBClassifier().fit(X_train_UNDER,y_train_UNDER) # Default

y_true = y_test_UNDER
y_pred = xgboost_clf.predict(X_test_UNDER)
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; appending a
# one-row frame with pd.concat is the supported equivalent.
# NOTE: re-running this cell adds another row for the same model name.
models_final = pd.concat([models_final,
                          pd.DataFrame([{'model_name': name,
                                         'model': xgboost_clf,
                                         'parameters': xgboost_clf.get_params()}])],
                         ignore_index=True)

models_test = pd.concat([models_test,
                         pd.DataFrame([{'model_name': name,
                                        'confusion_matrix' : evaluation_results[0],
                                        'accuracy': evaluation_results[1],
                                        'recall' : evaluation_results[2],
                                        'f1_score': evaluation_results[3],
                                        'roc_auc_score': evaluation_results[4]}])],
                        ignore_index=True)

time: 303 ms (started: 2023-01-27 08:34:06 +00:00)


In [None]:
name = 'xgboost_clf_ALL'

# Default-parameter XGBoost fitted on the ALL (undersampling + SMOTE) training
# split and scored on the matching test split.
xgboost_clf = XGBClassifier().fit(X_train_ALL,y_train_ALL) # Default

y_true = y_test_ALL
y_pred = xgboost_clf.predict(X_test_ALL)
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; appending a
# one-row frame with pd.concat is the supported equivalent.
# NOTE: re-running this cell adds another row for the same model name.
models_final = pd.concat([models_final,
                          pd.DataFrame([{'model_name': name,
                                         'model': xgboost_clf,
                                         'parameters': xgboost_clf.get_params()}])],
                         ignore_index=True)

models_test = pd.concat([models_test,
                         pd.DataFrame([{'model_name': name,
                                        'confusion_matrix' : evaluation_results[0],
                                        'accuracy': evaluation_results[1],
                                        'recall' : evaluation_results[2],
                                        'f1_score': evaluation_results[3],
                                        'roc_auc_score': evaluation_results[4]}])],
                        ignore_index=True)

time: 1.56 s (started: 2023-01-27 08:34:06 +00:00)


In [None]:
# Table of fitted models and their parameters, one row per model cell run.
display(models_final)

Unnamed: 0,model_name,model,parameters
0,xgboost_clf,XGBClassifier(),"{'base_score': 0.5, 'booster': 'gbtree', 'cols..."
1,xgboost_clf_SMOTE,XGBClassifier(),"{'base_score': 0.5, 'booster': 'gbtree', 'cols..."
2,xgboost_clf_OVER,XGBClassifier(),"{'base_score': 0.5, 'booster': 'gbtree', 'cols..."
3,xgboost_clf_UNDER,XGBClassifier(),"{'base_score': 0.5, 'booster': 'gbtree', 'cols..."
4,xgboost_clf_ALL,XGBClassifier(),"{'base_score': 0.5, 'booster': 'gbtree', 'cols..."


time: 18.7 ms (started: 2023-01-27 08:34:07 +00:00)


In [None]:
# Test-set metrics for every model recorded so far.
display(models_test)

Unnamed: 0,model_name,confusion_matrix,accuracy,recall,f1_score,roc_auc_score
0,xgboost_clf,"[17053, 6, 101, 30]",0.993775,0.229008,0.359281,0.614328
1,xgboost_clf_SMOTE,"[15264, 1795, 15, 116]",0.894706,0.885496,0.113614,0.890137
2,xgboost_clf_OVER,"[15230, 1829, 12, 119]",0.892903,0.908397,0.114478,0.90059
3,xgboost_clf_UNDER,"[15066, 1993, 8, 123]",0.883595,0.938931,0.109479,0.911051
4,xgboost_clf_ALL,"[15417, 1642, 17, 114]",0.90349,0.870229,0.120827,0.886987


time: 18.6 ms (started: 2023-01-27 08:34:08 +00:00)
