In [0]:
from warnings import simplefilter

simplefilter(action='ignore', category=FutureWarning)

In [0]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle as pkl
import math
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from collections import Counter

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
link = 'https://drive.google.com/open?id=17DbBIxtBcBM9Q4MWbeF46ml8WNv_ATKl'

fluff, id = link.split('=')

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':id}) 

downloaded.GetContentFile('Flight_Weather.csv')

dataframe = pd.read_csv('Flight_Weather.csv')

In [0]:
data_label_encoder = preprocessing.LabelEncoder()

encoded_list = data_label_encoder.fit_transform(dataframe['Airport'])

dataframe['EncodedAirport'] = encoded_list

In [0]:
y = dataframe['ArrDel15']

dataframe = dataframe.drop(['ArrDel15','FlightDate','Date','Airport','ArrDelayMinutes'],axis = 1)

X = dataframe

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [22]:
print(" Original DataSet Shape : ",Counter(y_train))

ros = RandomOverSampler(random_state = 42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

print(" RandomOverSampled DataSet Shape : ",Counter(y_ros))

smote = SMOTE(random_state = 42)

X_smote, y_smote = smote.fit_resample(X_train, y_train)

print(" SMOTE DataSet Shape : ",Counter(y_smote))

 Original DataSet Shape :  Counter({0.0: 1097546, 1.0: 291028})
 RandomOverSampled DataSet Shape :  Counter({0.0: 1097546, 1.0: 1097546})
 SMOTE DataSet Shape :  Counter({0.0: 1097546, 1.0: 1097546})


In [23]:
print(" Original DataSet Shape : ",Counter(y_train))

rus = RandomUnderSampler(random_state = 42)

X_rus, y_rus = rus.fit_resample(X_train,y_train)

print(" RandomUnderampled DataSet Shape : ",Counter(y_rus))

nm = NearMiss()

X_nm, y_nm = nm.fit_resample(X_train, y_train)

print(" NearMiss DataSet Shape : ",Counter(y_nm))

 Original DataSet Shape :  Counter({0.0: 1097546, 1.0: 291028})
 RandomUnderampled DataSet Shape :  Counter({0.0: 291028, 1.0: 291028})
 NearMiss DataSet Shape :  Counter({0.0: 291028, 1.0: 291028})


In [0]:
clf = RandomForestClassifier(n_estimators = 100)

In [25]:
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

filename = "RFC_Normal.pkl"

pkl.dump(clf,open(filename,'wb'))

print(" Classification Report ")

print("\n\n")

print(metrics.classification_report(y_test,y_pred))

 Classification Report 



              precision    recall  f1-score   support

         0.0       0.93      0.98      0.96    365829
         1.0       0.92      0.74      0.82     97030

    accuracy                           0.93    462859
   macro avg       0.93      0.86      0.89    462859
weighted avg       0.93      0.93      0.93    462859



In [26]:
clf.fit(X_ros,y_ros)

y_pred_ros = clf.predict(X_test)

filename = "RFC_ROS.pkl"

pkl.dump(clf,open(filename,'wb'))

print(" Classification Report ")

print("\n\n")

print(metrics.classification_report(y_test,y_pred_ros))

 Classification Report 



              precision    recall  f1-score   support

         0.0       0.94      0.97      0.96    365829
         1.0       0.89      0.78      0.83     97030

    accuracy                           0.93    462859
   macro avg       0.92      0.88      0.89    462859
weighted avg       0.93      0.93      0.93    462859



In [27]:
clf.fit(X_smote,y_smote)

y_pred_smote = clf.predict(X_test)

filename = "RFC_SMOTE.pkl"

pkl.dump(clf,open(filename,'wb'))

print(" Classification Report")

print("\n\n")

print(metrics.classification_report(y_test,y_pred_smote))

 Classification Report



              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95    365829
         1.0       0.88      0.74      0.80     97030

    accuracy                           0.92    462859
   macro avg       0.91      0.85      0.88    462859
weighted avg       0.92      0.92      0.92    462859



In [28]:
clf.fit(X_rus,y_rus)

y_pred_rus = clf.predict(X_test)

filename = "RFC_RUS.pkl"

pkl.dump(clf,open(filename,'wb'))

print(" Classification Report")

print("\n\n")

print(metrics.classification_report(y_test,y_pred_rus))

 Classification Report



              precision    recall  f1-score   support

         0.0       0.96      0.93      0.95    365829
         1.0       0.78      0.87      0.82     97030

    accuracy                           0.92    462859
   macro avg       0.87      0.90      0.89    462859
weighted avg       0.93      0.92      0.92    462859



In [29]:
clf.fit(X_nm,y_nm)

y_pred_nm = clf.predict(X_test)

filename = "RFC_NM.pkl"

pkl.dump(clf,open(filename,'wb'))

print(" Classification Report")

print("\n\n")

print(metrics.classification_report(y_test,y_pred_nm))

 Classification Report



              precision    recall  f1-score   support

         0.0       0.95      0.82      0.88    365829
         1.0       0.55      0.85      0.67     97030

    accuracy                           0.82    462859
   macro avg       0.75      0.83      0.77    462859
weighted avg       0.87      0.82      0.83    462859

