In [1]:
########################################################################
#    June 03, 2019                                                     #
#    09:00                                                             #
#    Created by: Kunal Gehlot                                          #
########################################################################

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import random
import time
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import Audio, display
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
warnings.filterwarnings("ignore")
def allDone():
  display(Audio(url='01.wav', autoplay=True))


In [2]:
def cnv(src):
    return f'{src[0:2]}:{src[2:4]}:00'

flights = pd.read_csv('flights.csv',\
    converters={'SCHEDULED_DEPARTURE': cnv, 'DEPARTURE_TIME': cnv, 'WHEELS_OFF': cnv, 'SCHEDULED_ARRIVAL': cnv, 'WHEELS_ON': cnv, 'ARRIVAL_TIME': cnv})

flights.columns

# flights = flights.iloc[np.random.choice(np.arange(len(flights)), 250000, False)]

flights = flights.set_index("CANCELLED")
flights = flights.drop(1, axis=0)

flights = flights.set_index("DIVERTED")
flights = flights.drop(1, axis=0)

flights =flights.reset_index()

flights.drop("DIVERTED", axis = 1, inplace=True)

flights = flights.drop_duplicates()
flights[flights.isnull().values.any(axis=1)]

to_drop = ['CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'YEAR', 'TAIL_NUMBER']

flights.drop(to_drop, inplace=True, axis=1)

flights.head()

flights.SCHEDULED_DEPARTURE.loc[flights['SCHEDULED_DEPARTURE'] == '24:00:00'] = '00:00:00'

flights.DEPARTURE_TIME.loc[flights['DEPARTURE_TIME'] == '24:00:00'] = '00:00:00'

flights.WHEELS_OFF.loc[flights['WHEELS_OFF'] == '24:00:00'] = '00:00:00'

flights.WHEELS_ON.loc[flights['WHEELS_ON'] == '24:00:00'] = '00:00:00'

flights.SCHEDULED_ARRIVAL.loc[flights['SCHEDULED_ARRIVAL'] == '24:00:00'] = '00:00:00'

flights.ARRIVAL_TIME.loc[flights['ARRIVAL_TIME'] == '24:00:00'] = '00:00:00'

flights.SCHEDULED_DEPARTURE = pd.to_datetime(
    flights.SCHEDULED_DEPARTURE, format= '%H:%M:%S' ).dt.time

flights.DEPARTURE_TIME = pd.to_datetime(
    flights.DEPARTURE_TIME, format= '%H:%M:%S' ).dt.time

flights.WHEELS_OFF = pd.to_datetime(
    flights.WHEELS_OFF, format= '%H:%M:%S' ).dt.time

flights.WHEELS_ON = pd.to_datetime(
    flights.WHEELS_ON, format= '%H:%M:%S' ).dt.time

flights.SCHEDULED_ARRIVAL = pd.to_datetime(
    flights.SCHEDULED_ARRIVAL, format= '%H:%M:%S' ).dt.time

flights.ARRIVAL_TIME = pd.to_datetime(
    flights.ARRIVAL_TIME, format= '%H:%M:%S' ).dt.time

flights['SCH_DEP_HR'] = flights.SCHEDULED_DEPARTURE.apply(lambda x: x.hour)

flights['SCH_DEP_MN'] = flights.SCHEDULED_DEPARTURE.apply(lambda x: x.minute)

flights['DEP_TM_HR'] = flights.DEPARTURE_TIME.apply(lambda x: x.hour)

flights['DEP_TM_MN'] = flights.DEPARTURE_TIME.apply(lambda x: x.minute)

flights['WHL_OFF_MN'] = flights.WHEELS_OFF.apply(lambda x: x.minute)

flights['WHL_OFF_HR'] = flights.WHEELS_OFF.apply(lambda x: x.hour)

flights['WHL_ON_HR'] = flights.WHEELS_ON.apply(lambda x: x.hour)

flights['WHL_ON_MN'] = flights.WHEELS_ON.apply(lambda x: x.minute)

flights['SCH_ARVL_HR'] = flights.SCHEDULED_ARRIVAL.apply(lambda x: x.hour)

flights['SCH_ARVL_MN'] = flights.SCHEDULED_ARRIVAL.apply(lambda x: x.minute)

flights['ARVL_TM_HR'] = flights.ARRIVAL_TIME.apply(lambda x: x.hour)

flights['ARVL_TM_MN'] = flights.ARRIVAL_TIME.apply(lambda x: x.minute)

flights = flights[['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'ORIGIN_AIRPORT'
                   , 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'SCH_DEP_HR', 'SCH_DEP_MN'
                   ,'DEPARTURE_TIME', 'DEP_TM_HR', 'DEP_TM_MN', 'DEPARTURE_DELAY', 'TAXI_OUT',
                   'WHEELS_OFF', 'WHL_OFF_HR', 'WHL_OFF_MN', 'SCHEDULED_TIME', 'ELAPSED_TIME',
                  'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'WHL_ON_HR', 'WHL_ON_MN', 'TAXI_IN',
                  'SCHEDULED_ARRIVAL', 'SCH_ARVL_HR', 'SCH_ARVL_MN', 'ARRIVAL_TIME', 'ARVL_TM_HR'
                  , 'ARVL_TM_MN', 'ARRIVAL_DELAY']]

to_drop2 = ['SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'WHEELS_OFF', 'WHEELS_ON', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME']

flights.drop(to_drop2, inplace=True, axis=1)

airlineDB = pd.DataFrame()

airlineDB['AIRLINE'] = flights['AIRLINE'].value_counts().index

flights['AIRLINE1'] = flights['AIRLINE'].apply(hash)

airlineDB['AIRLINE_HASH'] = flights['AIRLINE1'].value_counts().index

airlineDB.to_csv('AirlineDatabase.csv', index=False)

flights['AIRLINE'] = flights['AIRLINE1']

flights.drop('AIRLINE1', axis=1, inplace=True)

airportDB = pd.DataFrame()

airportDB['AIRPORT'] = flights['ORIGIN_AIRPORT'].value_counts().index

flights['ORIGIN_AIRPORT1'] = flights['ORIGIN_AIRPORT'].apply(hash)

airportDB['AIRPORT_HASH'] = flights['ORIGIN_AIRPORT1'].value_counts().index

airportDB['AIRPORT'] = pd.concat([airportDB['AIRPORT'], flights['DESTINATION_AIRPORT']], ignore_index=True)

flights['DESTINATION_AIRPORT1'] = flights['DESTINATION_AIRPORT'].apply(hash)

airportDB['AIRPORT_HASH'] = pd.concat([airportDB['AIRPORT_HASH'], flights['DESTINATION_AIRPORT1']], ignore_index=True)

airportDB.to_csv('AirportDatabase.csv', index=False)

flights['ORIGIN_AIRPORT'] = flights['ORIGIN_AIRPORT1']

flights['DESTINATION_AIRPORT'] = flights['DESTINATION_AIRPORT1']

flights.drop(['ORIGIN_AIRPORT1', 'DESTINATION_AIRPORT1'], axis=1, inplace=True)

flights.to_csv('flightsNew.csv', index=False)



  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [23]:
flights = pd.read_csv('flightsNew.csv')

flights.ARRIVAL_DELAY[flights.ARRIVAL_DELAY < 15] = 0
flights.ARRIVAL_DELAY[flights.ARRIVAL_DELAY >= 15] = 1

X = flights

flights.columns

X = X.drop(['ARVL_TM_HR', 'ARVL_TM_MN', 'ARRIVAL_DELAY', 'ELAPSED_TIME', 'AIR_TIME',  'WHL_ON_HR', 'WHL_ON_MN', 'TAXI_IN', 'TAXI_OUT', 'WHL_OFF_HR',
       'WHL_OFF_MN', 'SCHEDULED_TIME', 'DISTANCE', 'SCH_ARVL_HR',
       'SCH_ARVL_MN'], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [24]:
flights.shape

(5714008, 27)

In [25]:
X.c

Index(['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCH_DEP_HR', 'SCH_DEP_MN',
       'DEP_TM_HR', 'DEP_TM_MN', 'DEPARTURE_DELAY'],
      dtype='object')

In [26]:
X.shape

(5714008, 12)

In [27]:
X = X.values

X

array([[  1.,   1.,   4., ...,  23.,  54., -11.],
       [  1.,   1.,   4., ...,   0.,   2.,  -8.],
       [  1.,   1.,   4., ...,   0.,  18.,  -2.],
       ...,
       [ 12.,  31.,   4., ...,  23.,  50.,  -9.],
       [ 12.,  31.,   4., ...,  23.,  53.,  -6.],
       [ 12.,  31.,   4., ...,   0.,  14.,  15.]])

In [28]:
Y = flights.iloc[:,[-1]].values

Y = Y.ravel()

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)

# from sklearn.preprocessing import StandardScaler
# sec_X = StandardScaler()
# sec_X
# X_train = sec_X.fit_transform(X_train)
# X_train
# X_test = sec_X.transform(X_test)
# X_test

classifier = RandomForestClassifier(random_state=13)#KNeighborsClassifier(n_neighbours)#KNeighborsClassifier()#LogisticRegression(random_state=0)

classifier.fit(X_train, Y_train)

y_pred = classifier.predict(X_test)

print("Test set score: {:.4f}".format(np.mean(y_pred == Y_test)))

print("Test set score: {:.4f}".format(classifier.score(X_test, Y_test)))



Test set score: 0.9287
Test set score: 0.9287


In [29]:
# print('----------FLIGHTS DELAY PREDICTION----------')
month = int(input("Enter month please: "))
day = int(input("Enter day please: "))
dow = int(input("Enter week day please: "))
flightnum = int(input("Enter flight number please: "))
airlinenm = input("Enter airline name please: ")
Orgin = input("Enter origin airport please: ")
Dest = input("Enter destination airport please: ")
sch = input('Enter scheduled departure time: ')
dep = input('Enter departure time: ')
delay = int(input("Enter delay: "))


d = airlineDB.AIRLINE_HASH.loc[airlineDB.AIRLINE == airlinenm].item()
f = airportDB.AIRPORT_HASH.loc[airportDB.AIRPORT == Orgin].item()
g = airportDB.AIRPORT_HASH.loc[airportDB.AIRPORT == Dest].item()
h = int(sch[0:2])
i = int(sch[3:5])
j = int(dep[0:2])
k = int(dep[3:5])

inputs = [[month, day, dow, d, flightnum, f, g, h, i, j, k, delay]]
# inputs = inputs.values
# inputs  = sec_X.transform(inputs)
# inp = flights.loc
inputs = np.asarray(inputs, dtype='float64')

print(inputs)


answer = classifier.predict(inputs)

if(answer == 0):
    print('Not delayed')
else:
    print('Delayed')



Enter month please:  1
Enter day please:  1
Enter week day please:  4
Enter flight number please:  98
Enter airline name please:  AS
Enter origin airport please:  ATL
Enter destination airport please:  SEA
Enter scheduled departure time:  00:05
Enter departure time:  23:54
Enter delay:  -11


[[ 1.00000000e+00  1.00000000e+00  4.00000000e+00  5.22563232e+18
   9.80000000e+01 -6.75201843e+18  3.28520185e+18  0.00000000e+00
   5.00000000e+00  2.30000000e+01  5.40000000e+01 -1.10000000e+01]]
Not delayed
