# Data Preparation Pipeline
* Data Exploration
* Feature Engineering
* Data Cleaning
* Encoding & Scaling
* SMOTE
* Saving Data

In [None]:
# Basic Libraries

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Colab-only: mount Google Drive at /content/drive; the dataset is read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Data Source

# Location of the dataset on the mounted Drive. The sibling file "test.csv"
# in the same folder can be loaded instead by changing the file name.
dataset_path = "/content/drive/MyDrive/Senior Project/Datasets/test_gee.csv"

# Columns discarded right after loading.
unused_columns = ['Unnamed: 0', 'lat_min', 'lat_max', 'long_min', 'long_max', 'DAYNIGHT']

df = pd.read_csv(dataset_path).drop(columns=unused_columns)
df.head()

In [None]:
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them instead; ACQ_DATE and
# ACQ_TIME are only encoded further down, so they may still be non-numeric here.
dataplot = sb.heatmap(df.corr(numeric_only=True))
dataplot.set_title("Correlation between numeric columns")
plt.show()

In [None]:
# Summary statistics of the loaded frame, rendered as the cell output.
df.describe()

In [None]:
# Per-column dtypes of the loaded frame, rendered as the cell output.
df.dtypes

In [None]:
# Class balance of the target column, followed by the number of columns in df.
fire_counts = df['FireOccurred'].value_counts()
display(fire_counts)
print("Column numbers: ", df.shape[1])

In [None]:
# Number of rows per distinct ACQ_DATE value.
display(df['ACQ_DATE'].value_counts())

In [None]:
# Number of rows per distinct ACQ_TIME value.
display(df['ACQ_TIME'].value_counts())

In [None]:
# Separate the label column from the predictor columns; df itself is untouched.
target_column = 'FireOccurred'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Encoder

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# Both encoders are instantiated, but only the ordinal encoder is applied.
# An earlier variant that ran labelEncoder.fit_transform over every feature
# column one by one is left disabled.
labelEncoder = LabelEncoder()
ordinalEncoder = OrdinalEncoder()

# Replace the acquisition time/date columns with their ordinal codes
# (fit and transform happen together, on the whole feature frame).
encoded_columns = ['ACQ_TIME', 'ACQ_DATE']
X[encoded_columns] = ordinalEncoder.fit_transform(X[encoded_columns])
X.head()

In [None]:
# Scaler

from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the full feature frame, i.e. before
# the train/validation/test split, so held-out rows contribute to the mean and
# variance used for scaling. Consider fitting on the training split only.
standardScaler = StandardScaler()

# fit_transform returns a bare array; rebuild the frame with the original
# column names AND row index so X stays index-aligned with y even if rows are
# ever filtered upstream (without index= the rows are renumbered from 0).
X = pd.DataFrame(
    standardScaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

display(X)

In [None]:
# Training, Validation, Testing Split

from sklearn.model_selection import train_test_split

# 80:10:10

# Options shared by both splitting stages.
split_options = dict(random_state=10, shuffle=True)

# Stage 1: hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, **split_options
)

# Stage 2: 1/9 of the remaining 90% equals another 10% of the total, which
# becomes the validation set and leaves 80% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=1/9, **split_options
)

# References to the splits exactly as produced here, kept for reference.
Original = [X_train, X_val, X_test, y_train, y_val, y_test]

In [None]:
# Every feature split must have exactly as many rows as its label split.
paired_splits = [(X_train, y_train), (X_test, y_test), (X_val, y_val)]
lengths_match = all(len(features) == len(labels) for features, labels in paired_splits)

if not lengths_match:
  print("Error in data preparation pipeline")
else:
  print("X and y data length matching")
print()

# Report the size of each split.
size_reports = (
  ("No. of training data = %d", X_train),
  ("No. of validation data = %d", X_val),
  ("No. of testing data = %d", X_test),
)
for template, split_frame in size_reports:
  print(template % len(split_frame))

In [None]:
# Class distribution of the held-out test labels.
display(y_test.value_counts())

In [None]:
# SMOTE

from collections import Counter
from imblearn.over_sampling import SMOTE 

# Class counts of the training labels before oversampling.
counts_before = Counter(y_train)
print('Original dataset shape %s' % counts_before)

# Only the training split is passed to SMOTE; the validation and test splits
# are not resampled.
sm = SMOTE(random_state=10)
resampled_X, resampled_y = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled_X, resampled_y

# Class counts of the training labels after oversampling.
counts_after = Counter(y_train)
print('Resampled dataset shape %s' % counts_after)

In [None]:
# Evaluation Metrics

from sklearn.metrics import confusion_matrix, recall_score, f1_score, roc_auc_score, accuracy_score

def evaluation_metrics(y_true, y_pred, y_score=None):
  """Compute a fixed set of binary-classification metrics.

  Parameters
  ----------
  y_true : array-like
      Ground-truth labels.
  y_pred : array-like
      Hard (already thresholded) class predictions.
  y_score : array-like, optional
      Continuous scores for the positive class (probabilities or decision
      values). When given, ROC AUC is computed from these scores. When
      omitted, ROC AUC falls back to ``y_pred`` as before, which evaluates
      the ROC curve at a single threshold only.

  Returns
  -------
  list
      ``[cfm, acc, recs, f1s, rocs]``: the flattened confusion matrix,
      accuracy, recall, F1 score and ROC AUC, in that order.

  Notes
  -----
  Recall and F1 use ``average='binary'`` with scikit-learn's default positive
  label. The flattened confusion matrix reads ``[tn, fp, fn, tp]`` when both
  classes occur in the inputs.
  """
  cfm = confusion_matrix(y_true, y_pred).ravel()
  acc = accuracy_score(y_true, y_pred)
  recs = recall_score(y_true, y_pred, average='binary')
  f1s = f1_score(y_true, y_pred, average='binary')
  # Prefer continuous scores for the ROC curve; hard labels collapse it to a
  # single operating point.
  roc_input = y_pred if y_score is None else y_score
  rocs = roc_auc_score(y_true, roc_input, average='macro')
  return [cfm, acc, recs, f1s, rocs]

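Confusion matrix format: `[tn, fp, fn, tp]` — the flattened (`.ravel()`) binary confusion matrix returned as the first element of `evaluation_metrics`.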