# import library

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error

# import dataset

In [3]:
dataset = pd.read_csv('C:/Users/Abirami/Desktop/flight delay.csv')
dataset.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,TAIL_NUM,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,...,DEP_DEL15,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE
0,2016,1,1,1,5,DL,N836DN,1399,10397,ATL,...,0.0,2143,2102.0,-41.0,0.0,0,0,338,295.0,2182
1,2016,1,1,1,5,DL,N964DN,1476,11433,DTW,...,0.0,1435,1439.0,4.0,0.0,0,0,110,115.0,528
2,2016,1,1,1,5,DL,N813DN,1597,10397,ATL,...,0.0,1215,1142.0,-33.0,0.0,0,0,335,300.0,2182
3,2016,1,1,1,5,DL,N587NW,1768,14747,SEA,...,0.0,1335,1345.0,10.0,0.0,0,0,196,205.0,1399
4,2016,1,1,1,5,DL,N836DN,1823,14747,SEA,...,0.0,607,615.0,8.0,0.0,0,0,247,259.0,1927


In [4]:
dataset.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
       'UNIQUE_CARRIER', 'TAIL_NUM', 'FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN',
       'DEST_AIRPORT_ID', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DEL15', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'ARR_DEL15',
       'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME',
       'DISTANCE'],
      dtype='object')

# removing unnecessory columns

In [29]:
data= dataset.drop(['YEAR' , 'QUARTER' , 'MONTH' , 'DAY_OF_MONTH' , 'DAY_OF_WEEK' , 'UNIQUE_CARRIER' , 'TAIL_NUM' , 'FL_NUM' ,'ORIGIN_AIRPORT_ID' , 'ORIGIN' , 'DEST_AIRPORT_ID' , 'DEST' , 'DISTANCE'], axis=1)
data.head()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME
0,1905,1907.0,2.0,0.0,2143,2102.0,-41.0,0.0,0,0,338,295.0
1,1345,1344.0,-1.0,0.0,1435,1439.0,4.0,0.0,0,0,110,115.0
2,940,942.0,2.0,0.0,1215,1142.0,-33.0,0.0,0,0,335,300.0
3,819,820.0,1.0,0.0,1335,1345.0,10.0,0.0,0,0,196,205.0
4,2300,2256.0,-4.0,0.0,607,615.0,8.0,0.0,0,0,247,259.0


# check datatypes of columns

In [31]:
data.dtypes

CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
DEP_DEL15              float64
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
ARR_DEL15              float64
CANCELLED                int64
DIVERTED                 int64
CRS_ELAPSED_TIME         int64
ACTUAL_ELAPSED_TIME    float64
dtype: object

In [24]:
data.shape

(11231, 12)

In [25]:
#finding missing values
data.isna().sum()

CRS_DEP_TIME             0
DEP_TIME               107
DEP_DELAY              107
DEP_DEL15              107
CRS_ARR_TIME             0
ARR_TIME               115
ARR_DELAY              188
ARR_DEL15              188
CANCELLED                0
DIVERTED                 0
CRS_ELAPSED_TIME         0
ACTUAL_ELAPSED_TIME    188
dtype: int64

In [26]:
data.dropna(inplace=True)
data.isna().sum()

CRS_DEP_TIME           0
DEP_TIME               0
DEP_DELAY              0
DEP_DEL15              0
CRS_ARR_TIME           0
ARR_TIME               0
ARR_DELAY              0
ARR_DEL15              0
CANCELLED              0
DIVERTED               0
CRS_ELAPSED_TIME       0
ACTUAL_ELAPSED_TIME    0
dtype: int64

In [27]:
data.shape

(11043, 12)

In [32]:
# convert to categorical
data['CANCELLED'] = data['CANCELLED'].astype(object)
data['DIVERTED'] = data['DIVERTED'].astype(object)
data['ARR_DEL15'] = data['ARR_DEL15'].astype(object)
data['DEP_DEL15'] = data['DEP_DEL15'].astype(object)
data.dtypes

CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
DEP_DEL15               object
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
ARR_DEL15               object
CANCELLED               object
DIVERTED                object
CRS_ELAPSED_TIME         int64
ACTUAL_ELAPSED_TIME    float64
dtype: object

In [33]:
# split dependent and independent variables
X = data[['CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY',
       'DEP_DEL15', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME']]
Y = data[['ARR_DEL15']]

In [34]:
# splitting into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [35]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [36]:
# prediction
y_pred = regressor.predict(X_test)

In [37]:
print("MSE for DTReg (All features): " , np.round(mean_squared_error(y_test, y_pred), 2))

MSE for DTReg (All features):  0.0


In [38]:
joblib.dump(regressor, 'model1.pkl')

['model1.pkl']