# Model Testing Pipeline

Identify the best model, with each candidate model configured with its own best parameters:
1.   Re-train models using both training and validation data
2.   Evaluate models against the testing data

In [None]:
!pip install ipython-autotime
%load_ext autotime

In [None]:
# Basic Libraries

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime

# Mount Google Drive at /content/drive; the dataset CSV is read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data Source

# Read the cleaned dataset from the mounted Drive and discard, in one chain,
# the 'Unnamed: 0' column (presumably an index written out with the CSV --
# verify) and the deprecated BRIGHTNESS column.
df = (
    pd.read_csv("/content/drive/MyDrive/cleaned_gee_data.csv")
    .drop(columns=['Unnamed: 0', 'BRIGHTNESS'])  # BRIGHTNESS deprecated
)
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,ACQ_DATE,ACQ_TIME,OPEN_TIME,CLOSE_TIME,FIRE_OCCURRED,CO_MOL/M2,SO2_MOL/M2,NO2_MOL/M2,O3_MOL/M2,LOCATION,INSTRUMENT,DRY_SEASON
0,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,-0.024223,-0.47444,-1.152277,-0.511001,-1.159086,0,1
1,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,0.113599,-0.47444,-1.152277,-0.511001,-1.159086,0,1
2,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,-0.024223,-0.47444,-1.361255,-0.511001,-1.159086,0,1
3,-5.466232,-0.176027,-1.866392,0.634294,0.506405,0.526945,0,0.113599,-0.47444,-1.361255,-0.511001,-1.159086,0,1
4,-5.433352,-0.197441,-1.723773,0.634294,2.28608,1.793843,0,-0.967684,0.339667,-1.25177,0.426114,-1.159086,0,1


time: 1.31 s (started: 2023-01-16 11:36:51 +00:00)


In [None]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171893 entries, 0 to 171892
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   LATITUDE       171893 non-null  float64
 1   LONGITUDE      171893 non-null  float64
 2   ACQ_DATE       171893 non-null  float64
 3   ACQ_TIME       171893 non-null  float64
 4   OPEN_TIME      171893 non-null  float64
 5   CLOSE_TIME     171893 non-null  float64
 6   FIRE_OCCURRED  171893 non-null  int64  
 7   CO_MOL/M2      171893 non-null  float64
 8   SO2_MOL/M2     171893 non-null  float64
 9   NO2_MOL/M2     171893 non-null  float64
 10  O3_MOL/M2      171893 non-null  float64
 11  LOCATION       171893 non-null  float64
 12  INSTRUMENT     171893 non-null  int64  
 13  DRY_SEASON     171893 non-null  int64  
dtypes: float64(11), int64(3)
memory usage: 18.4 MB
time: 36.2 ms (started: 2023-01-16 11:36:52 +00:00)


In [None]:
# Class distribution of the target label across the full dataset.
display(df['FIRE_OCCURRED'].value_counts())

0    170544
1      1349
Name: FIRE_OCCURRED, dtype: int64

time: 13.5 ms (started: 2023-01-16 11:36:52 +00:00)


In [None]:
# Separate the target label (y) from the remaining feature columns (X).
y = df['FIRE_OCCURRED']
X = df.drop(columns='FIRE_OCCURRED')

time: 13.5 ms (started: 2023-01-16 11:36:52 +00:00)


In [None]:
# Training, Testing Split

from sklearn.model_selection import train_test_split

# 90:10 split. Every call receives identical arguments (same seed), so each
# resampling strategy starts from its own copy of the same partition.
# NOTE(review): no `stratify` argument is passed -- confirm that is intended
# given how imbalanced FIRE_OCCURRED is.
split_kwargs = dict(test_size=0.10, random_state=10, shuffle=True)

(X_train_SMOTE, X_test_SMOTE,
 y_train_SMOTE, y_test_SMOTE) = train_test_split(X, y, **split_kwargs)  # SMOTE

(X_train_OVER, X_test_OVER,
 y_train_OVER, y_test_OVER) = train_test_split(X, y, **split_kwargs)  # Over

(X_train_UNDER, X_test_UNDER,
 y_train_UNDER, y_test_UNDER) = train_test_split(X, y, **split_kwargs)  # Under

(X_train_ALL, X_test_ALL,
 y_train_ALL, y_test_ALL) = train_test_split(X, y, **split_kwargs)  # Over, Under, and SMOTE

time: 328 ms (started: 2023-01-16 11:36:52 +00:00)


In [None]:
# Sanity check: features and labels must have matching lengths in both the
# training and the testing partition. The *_SMOTE variables are used here;
# the split cell defines no bare `X_test`, so the suffixed name is required
# for this cell to run on a fresh kernel.
if len(X_train_SMOTE) == len(y_train_SMOTE) and len(X_test_SMOTE) == len(y_test_SMOTE):
  print("X and y data length matching")
else:
  print("Error in data preparation pipeline")
print()
print("No. of training data = %d" % len(X_train_SMOTE))
print("No. of testing data = %d" % len(X_test_SMOTE))

X and y data length matching

No. of training data = 154703
No. of testing data = 17190
time: 6.11 ms (started: 2023-01-16 11:36:53 +00:00)


In [None]:
# Class distribution of the target label within the held-out test partition.
display(y_test_SMOTE.value_counts())

0    17059
1      131
Name: FIRE_OCCURRED, dtype: int64

time: 7.29 ms (started: 2023-01-16 11:36:53 +00:00)


In [None]:
# SMOTE

from collections import Counter
from imblearn.over_sampling import SMOTE

# Seeded sampler so the resampled training set is reproducible.
sm = SMOTE(random_state=10)

# Only the training partition is resampled; class counts are printed before
# and after so the effect of the resampling is visible in the output.
print('Original dataset shape %s' % (Counter(y_train_SMOTE),))
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_SMOTE, y_train_SMOTE)
print('Resampled dataset shape %s' % (Counter(y_train_SMOTE),))

In [None]:
# Oversampling

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Seeded sampler; only the training partition is resampled.
ros = RandomOverSampler(random_state=10)
X_train_OVER, y_train_OVER = ros.fit_resample(X_train_OVER, y_train_OVER)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_train_OVER).items()))

In [None]:
# Undersampling

# Counter is imported here, as in the other resampling cells, so this cell
# does not depend on an import made in a different cell.
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# NOTE(review): this sampler is seeded with 0 while the split, SMOTE and
# oversampling cells use 10 -- confirm whether the difference is intentional.
# The seed is left unchanged here so previously recorded results still reproduce.
rus = RandomUnderSampler(random_state=0)
X_train_UNDER, y_train_UNDER = rus.fit_resample(X_train_UNDER, y_train_UNDER)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_train_UNDER).items()))

In [None]:
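# All (Over, Under, and SMOTE combined)
# TODO: this cell contains no code yet -- the X_train_ALL / y_train_ALL split is never resampled here.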

## XGBoost Model

In [None]:
name = 'xgboost_clf'

# NOTE(review): X_train / y_train / X_test / y_test are not defined by the split
# cell (it only creates the *_SMOTE, *_OVER, *_UNDER and *_ALL variants) --
# confirm which partition this model is meant to use. XGBClassifier,
# evaluation_metrics, models_final and models_test are likewise expected to be
# provided by cells not shown in this section.
xgboost_clf = XGBClassifier().fit(X_train, y_train) # Default parameters

y_true = y_test
y_pred = xgboost_clf.predict(X_test)
# The result is indexed positionally below as:
# [0] confusion matrix, [1] accuracy, [2] recall, [3] f1 score, [4] ROC AUC score.
evaluation_results = evaluation_metrics(y_true, y_pred)

# DataFrame.append was removed in pandas 2.0; build a one-row frame from the
# record and concatenate it instead. Wrapping the dict in a list keeps nested
# values (the fitted model, the parameter dict) as single object cells.
models_final = pd.concat(
    [models_final,
     pd.DataFrame([{'model_name': name,
                    'model': xgboost_clf,
                    'parameters': xgboost_clf.get_params()}])],
    ignore_index=True)

models_test = pd.concat(
    [models_test,
     pd.DataFrame([{'model_name': name,
                    'confusion_matrix' : evaluation_results[0],
                    'accuracy': evaluation_results[1],
                    'recall' : evaluation_results[2],
                    'f1_score': evaluation_results[3],
                    'roc_auc_score': evaluation_results[4]}])],
    ignore_index=True)