In [0]:
import warnings

from warnings import simplefilter

from sklearn.exceptions import DataConversionWarning

# Silence two noisy warning categories for the rest of the session.
# NOTE: ignoring DataConversionWarning also hides scikit-learn's hint that a
# column-vector target was passed where a 1-D array was expected.
for _ignored_category in (FutureWarning, DataConversionWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

In [0]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import pickle as pkl
import math
import csv
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from collections import Counter

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate through google.colab, then build a PyDrive client from the
# application-default credentials. `drive` is the module-level client that
# get_dataframe() uses to fetch files.
# NOTE(review): presumably authenticate_user() prompts for sign-in on a fresh
# Colab runtime - confirm before running unattended.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
def get_dataframe(link,filename) :
  """Download a CSV from Google Drive and load it with an encoded airport column.

  Parameters
  ----------
  link : str
      Shareable Drive link carrying the file id in its ``id`` query
      parameter, e.g. ``https://drive.google.com/open?id=<file id>``. Extra
      query parameters (``&usp=sharing`` ...) are tolerated. As a fallback,
      the text after the last ``=`` is used when no ``id`` parameter is found.
  filename : str
      Local path the downloaded content is written to before being parsed.

  Returns
  -------
  pandas.DataFrame
      The parsed CSV plus an ``EncodedAirport`` column holding the
      label-encoded values of the ``Airport`` column.

  Raises
  ------
  ValueError
      If no file id can be extracted from ``link``.
  """
  # Local import keeps this cell self-contained (urllib is standard library).
  from urllib.parse import parse_qs, urlparse

  # Read the `id` query parameter instead of splitting on '=': a plain split
  # breaks as soon as the link carries a second parameter.
  file_ids = parse_qs(urlparse(link).query).get('id')

  if file_ids:
    file_id = file_ids[0]
  else:
    _, separator, file_id = link.rpartition('=')
    if not separator or not file_id:
      raise ValueError("Could not find a Google Drive file id in link: {!r}".format(link))

  # Uses the module-level `drive` client created in the authentication cell.
  downloaded = drive.CreateFile({'id': file_id})

  downloaded.GetContentFile(filename)

  dataframe = pd.read_csv(filename)

  # Map each airport code to an integer so tree models can consume it.
  data_label_encoder = preprocessing.LabelEncoder()

  dataframe['EncodedAirport'] = data_label_encoder.fit_transform(dataframe['Airport'])

  return dataframe

In [0]:
# Drive link of the dataset and the local file name it is saved under before
# being parsed into `dataframe`.
flight_weather_link = 'https://drive.google.com/open?id=17DbBIxtBcBM9Q4MWbeF46ml8WNv_ATKl'
flight_weather_csv = 'Flight_Weather.csv'

dataframe = get_dataframe(flight_weather_link, flight_weather_csv)

In [0]:
# Targets: `ArrDel15` is the label used by the classifier and
# `ArrDelayMinutes` the value used by the regressor (both are split apart in a
# later cell).
y = dataframe[['ArrDel15','ArrDelayMinutes']]

# Features: drop the date/airport text columns and the regression target.
# `ArrDel15` is kept on purpose - later cells use it to select the delayed
# rows before dropping it.
# `dataframe` is no longer rebound to the reduced frame, so this cell can be
# re-run without a KeyError on the already-dropped columns.
X = dataframe.drop(['FlightDate','Date','Airport','ArrDelayMinutes'],axis = 1)

In [0]:
# Hold out 25% of the rows for evaluation. The split is seeded (same seed as
# the resamplers below) so every metric in this notebook is reproducible under
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [0]:
# Classifier data: every row, with the label column removed from the features
# and the delay minutes removed from the targets.
X_train_classifier = X_train.drop(columns = ['ArrDel15'])
X_test_classifier = X_test.drop(columns = ['ArrDel15'])

y_train_classifier = y_train.drop(columns = ['ArrDelayMinutes'])
y_test_classifier = y_test.drop(columns = ['ArrDelayMinutes'])

# Regressor data: only the rows flagged ArrDel15 == 1, again without the flag.
train_is_delayed = X_train['ArrDel15'] == 1
test_is_delayed = X_test['ArrDel15'] == 1

X_train_regressor = X_train[train_is_delayed].drop(columns = ['ArrDel15'])
X_test_regressor = X_test[test_is_delayed].drop(columns = ['ArrDel15'])

y_train_regressor = y_train[y_train['ArrDel15'] == 1].drop(columns = ['ArrDel15'])
y_test_regressor = y_test[y_test['ArrDel15'] == 1].drop(columns = ['ArrDel15'])

In [20]:
# 1-D form of the single-column label frame; the samplers and Counter both
# need a flat sequence of labels.
train_labels = y_train_classifier.T.squeeze()

print(" Original DataSet Shape : ",Counter(train_labels))

# Random over-sampling: duplicate minority-class rows until classes balance.
ros = RandomOverSampler(random_state = 42)
X_ros, y_ros = ros.fit_resample(X_train_classifier, train_labels)
print(" RandomOverSampled DataSet Shape : ",Counter(y_ros))

# SMOTE: synthesise new minority-class rows instead of duplicating.
smote = SMOTE(random_state = 42)
X_smote, y_smote = smote.fit_resample(X_train_classifier, train_labels)
print(" SMOTE DataSet Shape : ",Counter(y_smote))

 Original DataSet Shape :  Counter({0.0: 1097531, 1.0: 291043})
 RandomOverSampled DataSet Shape :  Counter({0.0: 1097531, 1.0: 1097531})
 SMOTE DataSet Shape :  Counter({0.0: 1097531, 1.0: 1097531})


In [21]:
# 1-D form of the single-column label frame; the samplers and Counter both
# need a flat sequence of labels.
train_labels = y_train_classifier.T.squeeze()

print(" Original DataSet Shape : ",Counter(train_labels))

# Random under-sampling of the majority class.
rus = RandomUnderSampler(random_state = 42)
X_rus, y_rus = rus.fit_resample(X_train_classifier, train_labels)
print(" RandomUnderampled DataSet Shape : ",Counter(y_rus))

# NearMiss under-sampling with the library defaults.
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X_train_classifier, train_labels)
print(" NearMiss DataSet Shape : ",Counter(y_nm))

 Original DataSet Shape :  Counter({0.0: 1097531, 1.0: 291043})
 RandomUnderampled DataSet Shape :  Counter({0.0: 291043, 1.0: 291043})
 NearMiss DataSet Shape :  Counter({0.0: 291043, 1.0: 291043})


In [0]:
# Delay / no-delay classifier. Seeded so the reports below are reproducible;
# n_jobs = -1 builds the trees on all available cores and does not change the
# fitted model.
clf = RandomForestClassifier(n_estimators = 100, random_state = 42, n_jobs = -1)

In [24]:
# Baseline: fit on the original (imbalanced) training set.
# `y_train_classifier` is a one-column DataFrame; scikit-learn expects a 1-D
# target, so flatten it instead of relying on the suppressed
# DataConversionWarning and its implicit conversion.
clf.fit(X_train_classifier, y_train_classifier.values.ravel())

y_pred = clf.predict(X_test_classifier)

print(" Classification Report ")

print("\n\n")

print(metrics.classification_report(y_test_classifier,y_pred))

 Classification Report 



              precision    recall  f1-score   support

         0.0       0.93      0.98      0.96    365844
         1.0       0.92      0.74      0.82     97015

    accuracy                           0.93    462859
   macro avg       0.93      0.86      0.89    462859
weighted avg       0.93      0.93      0.93    462859



In [25]:
# Refit the same `clf` object on the SMOTE-balanced training set (this
# replaces what it learned from the previous fit) and score it on the
# untouched test split.
clf.fit(X_smote,y_smote)
y_pred_smote = clf.predict(X_test_classifier)

print(" Classification Report")
print("\n\n")

smote_report = metrics.classification_report(y_test_classifier,y_pred_smote)
print(smote_report)

 Classification Report



              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95    365844
         1.0       0.88      0.74      0.80     97015

    accuracy                           0.92    462859
   macro avg       0.91      0.85      0.88    462859
weighted avg       0.92      0.92      0.92    462859



In [0]:
# Delay-minutes regressor. The tree count is stated explicitly because the
# library default has differed between scikit-learn releases, and the forest
# is seeded so the error metrics below are reproducible. n_jobs = -1 only
# parallelises tree building.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 42, n_jobs = -1)

In [0]:
# Train on delayed flights only. `y_train_regressor` is a one-column
# DataFrame, so flatten it to the 1-D target scikit-learn expects rather than
# relying on the suppressed DataConversionWarning.
rfr.fit(X_train_regressor, y_train_regressor.values.ravel())

# NOTE: this rebinds `y_pred`, which previously held the classifier's
# predictions.
y_pred = rfr.predict(X_test_regressor)

In [28]:
# Error of the regressor on the delayed test flights.
print(" Mean Absolute Error : ", metrics.mean_absolute_error(y_test_regressor, y_pred))

# Compute the MSE once and derive the RMSE from it.
regressor_mse = metrics.mean_squared_error(y_test_regressor, y_pred)
print(" Mean Squared Error : ", regressor_mse)
print(" Root Mean Squared Error : ", np.sqrt(regressor_mse))

print(" R2 Score : ",metrics.r2_score(y_test_regressor,y_pred))

 Mean Absolute Error :  6.096832757820955
 Mean Squared Error :  113.36967769623254
 Root Mean Squared Error :  10.647519790835448
 R2 Score :  0.9783283954034779


In [0]:
# Alias (not a copy) of the regressor's test targets, used by the per-band
# error analysis below.
Y = y_test_regressor

In [0]:
# Work on a copy: the next cell adds the target column to this frame, and with
# a plain alias that column would also appear in `X_test_regressor`, breaking
# any later `rfr.predict(X_test_regressor)`.
X_test_regression_analysis = X_test_regressor.copy()

In [0]:
# Attach the observed delay (index-aligned) so feature rows can be selected by
# delay band.
X_test_regression_analysis['ArrDelayMinutes'] = y_test_regressor['ArrDelayMinutes']

In [0]:
# Break the delayed-flight test set into delay-severity bands so the
# regressor's error can be reported per band.
#
# Each entry is (label, lower bound, upper bound) in minutes. The first band
# is closed on both ends; every later band excludes its lower bound. The bands
# are therefore contiguous, and a delay that is not a whole number of minutes
# (e.g. 100.5) still lands in exactly one band instead of being silently
# dropped by an integer look-up. For whole-minute delays the membership is
# unchanged. Delays above 2000 minutes fall in no band.
_DELAY_BANDS = [
    ('15_100', 15, 100),
    ('100_200', 100, 200),
    ('200_500', 200, 500),
    ('500_1000', 500, 1000),
    ('1000_2000', 1000, 2000),
]


def _split_by_delay_band(frame):
  """Return {band label: rows of `frame` whose 'ArrDelayMinutes' is in that band}."""
  delays = frame['ArrDelayMinutes']
  bands = {}
  for position, (label, lower, upper) in enumerate(_DELAY_BANDS):
    above_lower = (delays >= lower) if position == 0 else (delays > lower)
    bands[label] = frame.loc[above_lower & (delays <= upper)]
  return bands


# Observed delays per band.
_Y_bands = _split_by_delay_band(Y)

Y_15_100 = _Y_bands['15_100']
Y_100_200 = _Y_bands['100_200']
Y_200_500 = _Y_bands['200_500']
Y_500_1000 = _Y_bands['500_1000']
Y_1000_2000 = _Y_bands['1000_2000']

# 1-D targets (Series) for the metric functions.
Y_15_100_1 = Y_15_100['ArrDelayMinutes']
Y_100_200_1 = Y_100_200['ArrDelayMinutes']
Y_200_500_1 = Y_200_500['ArrDelayMinutes']
Y_500_1000_1 = Y_500_1000['ArrDelayMinutes']
Y_1000_2000_1 = Y_1000_2000['ArrDelayMinutes']


# NOTE: `X` is rebound here from the full feature frame to the analysis frame;
# the name is kept so existing references continue to resolve.
X = X_test_regression_analysis

# Feature rows per band (still carrying the observed delay column).
_X_bands = _split_by_delay_band(X)

X_15_100 = _X_bands['15_100']
X_100_200 = _X_bands['100_200']
X_200_500 = _X_bands['200_500']
X_500_1000 = _X_bands['500_1000']
X_1000_2000 = _X_bands['1000_2000']

# Model inputs: the same rows without the target column.
X_15_100_1 = X_15_100.drop(['ArrDelayMinutes'],axis = 1)
X_100_200_1 = X_100_200.drop(['ArrDelayMinutes'],axis = 1)
X_200_500_1 = X_200_500.drop(['ArrDelayMinutes'],axis = 1)
X_500_1000_1 = X_500_1000.drop(['ArrDelayMinutes'],axis = 1)
X_1000_2000_1 = X_1000_2000.drop(['ArrDelayMinutes'],axis = 1)

In [0]:
def scores_rfr(X,Y,Z, model = None):
  """Print regression error metrics for each (features, targets, label) group.

  Parameters
  ----------
  X : iterable
      Feature sets, one per group, each accepted by ``model.predict``.
  Y : iterable
      Observed target values, one collection per group, aligned with ``X``.
  Z : iterable
      Labels printed as the heading of each group.
  model : object, optional
      Fitted estimator exposing ``predict``. Defaults to the module-level
      ``rfr`` so existing three-argument calls behave as before.

  Returns
  -------
  None
      Results are printed. Groups are paired with ``zip``, so extra items in
      a longer argument are ignored.
  """
  estimator = rfr if model is None else model

  for features, observed, label in zip(X,Y,Z):

    print(label)

    # A band with no rows cannot be scored; report it and move on rather than
    # letting predict() raise and hide the remaining bands.
    if len(features) == 0:

      print(" No samples in this band - skipped")

      print("\n\n")

      continue

    predicted = estimator.predict(features)

    # Computed once and reused for the RMSE.
    mse = metrics.mean_squared_error(observed, predicted)

    print(" Mean Absolute Error : ", metrics.mean_absolute_error(observed, predicted))

    print(" Mean Squared Error : ", mse)

    print(" Root Mean Squared Error : ", np.sqrt(mse))

    print(" R2 Score : ",metrics.r2_score(observed,predicted))

    print("\n\n")

In [0]:
# Parallel, band-ordered arguments for scores_rfr: heading labels, model
# inputs and observed delays.
Z_list = ['15_100','100_200','200_500','500_1000','1000_2000']

X_list = [
    X_15_100_1,
    X_100_200_1,
    X_200_500_1,
    X_500_1000_1,
    X_1000_2000_1,
]

Y_list = [
    Y_15_100_1,
    Y_100_200_1,
    Y_200_500_1,
    Y_500_1000_1,
    Y_1000_2000_1,
]

In [36]:
# Print the regressor's MAE / MSE / RMSE / R2 for each delay band.
scores_rfr(X_list,Y_list,Z_list)

15_100
 Mean Absolute Error :  4.81122875511109
 Mean Squared Error :  63.10943784053402
 Root Mean Squared Error :  7.944144877866592
 R2 Score :  0.8735729421677457



100_200
 Mean Absolute Error :  11.66538892138023
 Mean Squared Error :  309.04766618765143
 Root Mean Squared Error :  17.57975159630111
 R2 Score :  0.5790607282843185



200_500
 Mean Absolute Error :  15.661889584519068
 Mean Squared Error :  551.1536088787707
 Root Mean Squared Error :  23.476660939724173
 R2 Score :  0.8675537424392712



500_1000
 Mean Absolute Error :  16.395034965034963
 Mean Squared Error :  532.3760755244755
 Root Mean Squared Error :  23.073276220001258
 R2 Score :  0.9744964909643812



1000_2000
 Mean Absolute Error :  29.676000000000002
 Mean Squared Error :  1726.410220000001
 Root Mean Squared Error :  41.550092900016494
 R2 Score :  0.9435842775906901





In [0]:
# Inputs for the end-to-end (classifier -> regressor) evaluation.
# The test features are copied because later cells add columns to
# `X_test_pipeline`; with a plain alias those columns would also appear in
# `X_test_classifier`.
X_test_pipeline = X_test_classifier.copy()

X_train_pipeline = X_train_classifier

y_test_pipeline = y_test_classifier

# Previously a self-assignment (`y_train_classifier = y_train_classifier`)
# that defined nothing; `y_train_classifier` itself is left untouched.
y_train_pipeline = y_train_classifier

In [0]:
clf.fit(X_test_pipeline,y_test_pipeline)

y_pred_pipeline = clf.predict(X_test_pipeline)

In [0]:
X_test_pipeline['Predicted'] = y_pred_pipeline

X_test_pipeline['ArrDelayMinutes'] = y_test_regressor

X_test_pipeline = X_test_pipeline[X_test_pipeline['Predicted'] == 1]

In [0]:
# Separate the kept rows into the observed delay and the model inputs; the two
# bookkeeping columns are removed so the regressor sees the columns it was
# trained on.
y_pipeline = X_test_pipeline['ArrDelayMinutes']

X_pipeline = X_test_pipeline.drop(columns = ['Predicted','ArrDelayMinutes'])

# Both names refer to the same reduced frame.
X_test_pipeline = X_pipeline

In [0]:
# Stage 2 of the pipeline: predict delay minutes with the already-fitted `rfr`
# for the flights the classifier flagged as delayed.
y_regressor_pipeline_pred = rfr.predict(X_pipeline)

In [44]:
# Error of the regressor on the flights selected by the classifier stage.
print("\n\n Random Forest Regressor \n\n")

print(" Mean Absolute Error : ", metrics.mean_absolute_error(y_pipeline, y_regressor_pipeline_pred))

# Compute the MSE once and derive the RMSE from it.
pipeline_mse = metrics.mean_squared_error(y_pipeline, y_regressor_pipeline_pred)
print(" Mean Squared Error : ", pipeline_mse)
print(" Root Mean Squared Error : ", np.sqrt(pipeline_mse))

print(" R2 Score : ",metrics.r2_score(y_pipeline,y_regressor_pipeline_pred))



 Random Forest Regressor 


 Mean Absolute Error :  6.096886634918671
 Mean Squared Error :  113.3708384851671
 Root Mean Squared Error :  10.647574300523434
 R2 Score :  0.9783283029018932
