In [1]:
from sys import stdin

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression

try:
    # Current location of train_test_split (the only one since scikit-learn 0.20)
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy scikit-learn releases that predate sklearn.model_selection
    from sklearn.cross_validation import train_test_split

# Read the raw flight-delay records from the CSV file in the working directory
data = pd.read_csv("FlightDelays.csv")

# Report the table dimensions: shape[0] is the row count, shape[1] the column count
print("The number of rows in the data set is {}.".format(data.shape[0]))
print("The number of features is {}.".format(data.shape[1]))

# Preview the first five records
data.head()



The number of rows in the data set is 450017.
The number of features is 12.


Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,AIRLINE_ID,FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,SCHED_DEP_TIME,SCHED_ARR_TIME,DELAY
0,2017,1,1,7,6,19805,1766,11298,12889,1055,1154,0.0
1,2017,1,1,8,7,19805,1766,11298,12889,1055,1154,0.0
2,2017,1,1,1,7,19805,1766,13930,11298,710,948,0.0
3,2017,1,1,2,1,19805,1766,13930,11298,710,948,0.0
4,2017,1,1,3,2,19805,1766,13930,11298,710,948,0.0


In [2]:
# Count the missing (NaN) entries in each column of the raw data
pd.isnull(data).sum()

YEAR                    0
QUARTER                 0
MONTH                   0
DAY_OF_MONTH            0
DAY_OF_WEEK             0
AIRLINE_ID              0
FL_NUM                  0
ORIGIN_AIRPORT_ID       0
DEST_AIRPORT_ID         0
SCHED_DEP_TIME          0
SCHED_ARR_TIME          0
DELAY                8541
dtype: int64

In [3]:
# Keep only the rows in which every column has a value
new_data = data.dropna(axis=0, how='any')
# Re-count missing entries per column; every count should now be zero
pd.isnull(new_data).sum()

YEAR                 0
QUARTER              0
MONTH                0
DAY_OF_MONTH         0
DAY_OF_WEEK          0
AIRLINE_ID           0
FL_NUM               0
ORIGIN_AIRPORT_ID    0
DEST_AIRPORT_ID      0
SCHED_DEP_TIME       0
SCHED_ARR_TIME       0
DELAY                0
dtype: int64

In [4]:
# Report how many rows remain once the incomplete records are dropped
print("The number of rows in the new data set is {}.".format(new_data.shape[0]))

The number of rows in the new data set is 441476.


In [5]:
# Select the flights flagged as delayed (rows whose DELAY value equals 1)
delay_flight = new_data.loc[new_data['DELAY'] == 1]
print(delay_flight)

        YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  AIRLINE_ID  FL_NUM  \
22      2017        1      1             2            1       19805    1767   
24      2017        1      1             4            3       19805    1767   
26      2017        1      1             6            5       19805    1767   
45      2017        1      1            10            2       19805    1768   
64      2017        1      1            29            7       19805    1768   
67      2017        1      1             9            1       19805    1769   
78      2017        1      1            23            1       19805    1769   
80      2017        1      1            25            3       19805    1769   
82      2017        1      1            27            5       19805    1769   
91      2017        1      1            16            1       19805    1769   
98      2017        1      1            25            3       19805    1769   
109     2017        1      1             7          

In [6]:
# Five origin airports with the largest summed DELAY. Every row of
# delay_flight has DELAY == 1, so the sum is the number of delayed flights.
most5origin_Airport_delay = (
    delay_flight
    .groupby('ORIGIN_AIRPORT_ID')['DELAY']
    .sum()
    .nlargest(5)
)
print(most5origin_Airport_delay)

# Same ranking for destination airports
most5dest_Airport_delay = (
    delay_flight
    .groupby('DEST_AIRPORT_ID')['DELAY']
    .sum()
    .nlargest(5)
)
print(most5dest_Airport_delay)


ORIGIN_AIRPORT_ID
10397    6176.0
12892    5453.0
13930    4202.0
11292    4179.0
14771    3831.0
Name: DELAY, dtype: float64
DEST_AIRPORT_ID
10397    5289.0
12892    4341.0
14771    4083.0
11292    3563.0
13930    3536.0
Name: DELAY, dtype: float64


In [7]:
# Keep only the flights departing from the five origin airports with the most
# delayed flights.
#
# The airport IDs are taken from the index of the ranking computed earlier
# (most5origin_Airport_delay) instead of a hard-coded list of string literals.
# ORIGIN_AIRPORT_ID holds integers, and current pandas does not match the
# string '10397' against the integer 10397, so the string list would silently
# select no rows. Reusing the ranking index keeps the dtype consistent and
# removes the duplicated magic numbers.
top_origin_ids = most5origin_Airport_delay.index
most5Origin_airport = new_data.loc[new_data['ORIGIN_AIRPORT_ID'].isin(top_origin_ids)]
print(most5Origin_airport)

        YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  AIRLINE_ID  FL_NUM  \
2       2017        1      1             1            7       19805    1766   
3       2017        1      1             2            1       19805    1766   
4       2017        1      1             3            2       19805    1766   
5       2017        1      1             4            3       19805    1766   
6       2017        1      1             5            4       19805    1766   
7       2017        1      1             6            5       19805    1766   
8       2017        1      1             7            6       19805    1766   
9       2017        1      1             8            7       19805    1766   
595     2017        1      1             1            7       19805    1783   
596     2017        1      1             2            1       19805    1783   
597     2017        1      1             3            2       19805    1783   
598     2017        1      1             4          

In [8]:
# From the flights already restricted to the top origin airports, keep those
# that also arrive at one of the five destination airports with the most
# delayed flights.
#
# Two fixes compared with the previous version:
#   * the boolean mask is built from most5Origin_airport itself (the frame
#     being filtered) rather than from new_data, so mask and frame always
#     share the same index;
#   * the airport IDs come from the index of most5dest_Airport_delay, which
#     carries the integer dtype of DEST_AIRPORT_ID, instead of string literals
#     that current pandas would not match against integers.
top_dest_ids = most5dest_Airport_delay.index
most5Dest_airport = most5Origin_airport.loc[
    most5Origin_airport['DEST_AIRPORT_ID'].isin(top_dest_ids)
]
print(most5Dest_airport)

        YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK  AIRLINE_ID  FL_NUM  \
5395    2017        1      1             1            7       19805    2503   
5396    2017        1      1             2            1       19805    2503   
5397    2017        1      1             3            2       19805    2503   
5398    2017        1      1             4            3       19805    2503   
5399    2017        1      1             5            4       19805    2503   
5400    2017        1      1             6            5       19805    2503   
5401    2017        1      1             8            7       19805    2503   
5402    2017        1      1             9            1       19805    2503   
5403    2017        1      1            10            2       19805    2503   
5404    2017        1      1            11            3       19805    2503   
5405    2017        1      1            12            4       19805    2503   
5406    2017        1      1            13          

In [24]:
# Build the train / validation / test sets (60% / 20% / 20% of the rows).
#
# The features are every column except the DELAY target. They are selected by
# name so the split does not silently depend on DELAY being the 12th column.
# NOTE(review): the split is taken from new_data, not from the airport-filtered
# most5Dest_airport frame built earlier -- confirm that this is intended.
x = new_data.drop('DELAY', axis=1)
y = new_data['DELAY']

# Step 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)
# Step 2: take 25% of the remaining 80% (20% of the total) as the validation set.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=.25, random_state=1)

In [28]:
# Dimensions (rows, columns) of the training features left after both splits
x_train.shape

(264885, 11)

In [29]:
# Dimensions (rows, columns) of the held-out test features
x_test.shape

(88296, 11)

In [30]:
# Dimensions (rows, columns) of the validation features
x_val.shape

(88295, 11)

In [9]:
# Show just the first row of the raw data (as loaded, before dropna) as a column reference
data.head(1)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,AIRLINE_ID,FL_NUM,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,SCHED_DEP_TIME,SCHED_ARR_TIME,DELAY
0,2017,1,1,7,6,19805,1766,11298,12889,1055,1154,0.0
