# Model Training Pipeline


In [None]:
# Basic Libraries

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Colab only: mount Google Drive at /content/drive. The dataset path used by
# the data-source cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Data Source
#
# Reads the dataset CSV from the mounted Drive and exposes two names:
#   df  - the loaded table with the 'Unnamed: 0' column dropped
#   df1 - working copy that the cells below read from
#
# Alternative dataset (swap the path below to use it):
#   "/content/drive/MyDrive/Senior Project/Datasets/test.csv"

df = pd.read_csv("/content/drive/MyDrive/Senior Project/Datasets/test_gee.csv")
df = df.drop('Unnamed: 0', axis=1)

# The following cells use `df1`, but no cell in the notebook assigns it, so a
# fresh "Restart & Run All" stopped with NameError. Define it explicitly here.
# NOTE(review): if `df1` originally came from a since-removed preprocessing or
# feature-selection step, re-apply that step here instead of a plain copy.
df1 = df.copy()

df.head()

In [None]:
# Table width first, then the target class counts.
print("Column numbers: ", len(df1.columns))

# Kept as the cell's last expression so the notebook renders it. It used to
# sit above the print() call, where its result was silently discarded.
df1['FireOccurred'].value_counts()

In [None]:
# Encoder & Scaler
#
# Splits df1 into features (X) and target (y), label-encodes every feature
# column, and fits a StandardScaler on the encoded features.

X = df1.drop('FireOccurred', axis=1)
y = df1['FireOccurred']

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# One encoder per column, kept in `encoders`, so each column's mapping can be
# inverted or re-applied to new data later. Previously a single encoder was
# refitted on every column, so only the last column's mapping survived.
# `encoder` still ends up bound to the encoder fitted on the last column.
encoders = {}
encoder = LabelEncoder()
for col in X.columns:
  encoder = LabelEncoder()
  # Assign by column name so the column is replaced outright and takes the
  # integer dtype of the codes. Positional `X.iloc[:, i] = ...` writes into
  # the existing column in place on recent pandas, which leaves encoded
  # string columns with dtype `object`.
  X[col] = encoder.fit_transform(X[col])
  encoders[col] = encoder

scaler = StandardScaler().fit(X)
# NOTE(review): the standardized values are stored in `test` only; `X` itself
# stays unscaled, and `X` is what the train/validation/test split consumes.
# If the models are meant to see standardized features, fit the scaler on the
# training split only (fitting on the full table leaks validation/test
# statistics) and transform each split with it. Confirm the intent.
test = scaler.transform(X)

display(X)

In [None]:
# Training, Validation, Testing Split

from sklearn.model_selection import train_test_split

# 80:10:10
# Step 1 holds out 10% of the data for testing.
# Step 2 takes 1/9 of the remaining 90% (= 10% of the full data) for validation.
#
# Both steps stratify on the target so every split keeps the same class
# proportions. The training split is oversampled with SMOTE afterwards, i.e.
# the classes are treated as imbalanced; an unstratified shuffle can then
# leave the validation/test split with very few (or no) minority samples,
# which makes recall / F1 / ROC AUC unstable or undefined.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=1, shuffle=True, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=1/9, random_state=1, shuffle=True, stratify=y_train)

Original = [X_train, X_val, X_test, y_train, y_val, y_test] # For reference

In [None]:
# Sanity check: each feature split must have as many rows as its target split.
split_pairs = [(X_train, y_train), (X_test, y_test), (X_val, y_val)]
lengths_match = all(len(features) == len(target) for features, target in split_pairs)

print("X and y data length matching" if lengths_match else "Error in data preparation pipeline")
print()

# Report the size of every split.
for split_label, split_features in (("training", X_train), ("validation", X_val), ("testing", X_test)):
  print("No. of %s data = %d" % (split_label, len(split_features)))

In [None]:
# Class counts of the target in the held-out test split.
display(y_test.value_counts())

In [None]:
# SMOTE
# Oversamples the training split only; the validation and test splits are not
# touched by this cell.

from collections import Counter
from imblearn.over_sampling import SMOTE

counts_before = Counter(y_train)
print('Original dataset shape %s' % counts_before)

sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X=X_train, y=y_train)

counts_after = Counter(y_train)
print('Resampled dataset shape %s' % counts_after)

In [None]:
# Store Model Parameters and Eval
# Two empty tables: one for the models and their parameters, one for their
# evaluation results.

model_columns = ['model_name', 'model', 'parameters']
eval_columns = ['model_name', 'confusion_matrix', 'accuracy', 'recall', 'f1_score', 'roc_auc_score']

models = pd.DataFrame(columns=model_columns)
models_eval = pd.DataFrame(columns=eval_columns)

In [None]:
# Evaluation Metrics

from sklearn.metrics import confusion_matrix, recall_score, f1_score, roc_auc_score, accuracy_score

def evaluation_metrics(y_true, y_pred, y_score=None):
  """Compute the evaluation metrics for one set of predictions.

  Parameters:
    y_true: ground-truth labels.
    y_pred: predicted class labels.
    y_score: optional scores for the positive class (predicted probabilities
      or decision-function values). When given, ROC AUC is computed from
      these scores. When omitted, ROC AUC falls back to `y_pred`, as before;
      with hard labels the ROC curve has a single operating point, so the
      value is coarser than a score-based AUC.

  Returns:
    A list `[cfm, acc, recs, f1s, rocs]`:
      cfm  - confusion matrix flattened with `.ravel()`
             (for binary labels: [tn, fp, fn, tp])
      acc  - accuracy
      recs - recall (average='binary')
      f1s  - F1 score (average='binary')
      rocs - ROC AUC
  """
  cfm = confusion_matrix(y_true, y_pred).ravel()
  acc = accuracy_score(y_true, y_pred)
  recs = recall_score(y_true, y_pred, average='binary')
  f1s = f1_score(y_true, y_pred, average='binary')
  # Prefer continuous scores for the ROC curve when the caller supplies them.
  roc_input = y_pred if y_score is None else y_score
  rocs = roc_auc_score(y_true, roc_input, average='macro')
  return [cfm, acc, recs, f1s, rocs]

Confusion matrix format (binary labels, flattened with `.ravel()`): [ tn , fp , fn , tp ]

In [None]:
# Import ML Algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import VotingClassifier

## Logistic Regression

Library: Scikit-learn

## Support Vector Machine (SVM)

Library: Scikit-learn

## Naive Bayes

Library: Scikit-learn

## K-Nearest Neighbor

Library: Scikit-learn

## Random Forest Classifier

Library: Scikit-learn

## Decision Tree

Library: Scikit-learn

## Gradient Boosting Classifier

Library: Scikit-learn

## XGBoost

Library: xgboost

## LightGBM

Library: lightgbm

## Artificial Neural Network

Library: Keras, Tensorflow

## Ensemble Learning

Library: Scikit-learn, Keras, Tensorflow