### Flight Delay Prediction

In [None]:
import pandas as pd
from datetime import datetime

Dataset source (Kaggle, "Airline Delay and Cancellation Data, 2009-2018", file `2018.csv`): https://www.kaggle.com/datasets/yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018?resource=download&select=2018.csv

In [None]:
# Load only the first MAX_CHUNKS chunks of the 2018 flights file.
# `nrows` makes the reader stop once the rows we keep have been read, instead
# of parsing (and then discarding) the remainder of the file.
CHUNK_SIZE = 1_000_000
MAX_CHUNKS = 2
df_chunks = pd.read_csv(
    '../data/2018.csv',
    chunksize=CHUNK_SIZE,
    nrows=CHUNK_SIZE * MAX_CHUNKS,
)

loaded_chunks = []
for ck in df_chunks:
    print(ck.shape)
    loaded_chunks.append(ck)

# Concatenate the real chunks only. Seeding the frame with
# pd.DataFrame({"columns": []}) left a spurious, all-NaN "columns" column.
df = pd.concat(loaded_chunks, axis=0, ignore_index=True)
df.shape

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Keep only the raw columns used downstream.
# Feature sources: FL_DATE (-> FL_DAY), OP_CARRIER, ORIGIN, DEST, DISTANCE,
#                  CRS_DEP_TIME (-> DEP_HOUR, DEP_MIN)
# Target column:   DEP_DELAY
# DEP_TIME is carried along here but is not part of the final feature set.
# .copy() makes the selection an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[['FL_DATE', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME', 'DISTANCE', 'CRS_DEP_TIME', 'DEP_DELAY']].copy()
df.head()

In [None]:
def extract_dep_hour(dep_time):
    """Return a clock time as a zero-padded four-character ``HHMM`` string.

    Despite its name, this returns the full ``HHMM`` string; callers slice
    ``[:2]`` for the hour and ``[2:]`` for the minutes.

    Args:
        dep_time: Clock time encoded as a number in HHMM form, e.g. ``930``,
            ``930.0`` or ``5`` (five minutes past midnight).

    Returns:
        The time left-padded with zeros to four characters, e.g. ``'0930'``.

    Raises:
        ValueError: If ``dep_time`` is NaN or otherwise not convertible to int.
    """
    # Pad to four digits for every input width: padding only three-digit
    # values left times before 01:00 (e.g. 5 or 45) too short to slice.
    return f'{int(dep_time):04d}'

In [None]:
# Derive the model features from the flight date and scheduled departure time.
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['FL_DAY'] = df['FL_DATE'].dt.day_name()  # weekday name of the flight date

# CRS_DEP_TIME -> time string, then split it into hour and minute parts.
df['DEP_HOUR_MIN'] = df['CRS_DEP_TIME'].apply(extract_dep_hour)
df['DEP_HOUR'] = df['DEP_HOUR_MIN'].str[:2]
df['DEP_MIN'] = df['DEP_HOUR_MIN'].str[2:]

# Keep the final feature columns plus the target. .copy() detaches the result
# so later column assignments do not raise SettingWithCopyWarning.
df = df[['FL_DAY', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DISTANCE', 'DEP_HOUR', 'DEP_MIN', 'DEP_DELAY']].copy()
df.head()

In [None]:
# Column groups by kind.
numerical = ['DISTANCE']
categorical = ['FL_DAY', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_HOUR', 'DEP_MIN']

# Cast every categorical column to str; all other columns keep their dtype.
df = df.astype({column: str for column in categorical})
df.dtypes

In [None]:
# Label column(s); everything else in df is a model input.
target = ['DEP_DELAY']
train_df = df.drop(labels=target, axis='columns').copy()

In [None]:
# Preview the feature frame (target column removed).
train_df.head()

In [None]:
from sklearn.feature_extraction import DictVectorizer
# Vectorizer that turns a list of {feature: value} dicts into a feature matrix.
dv = DictVectorizer()

In [None]:
# Encode every row as a {column: value} dict and vectorise it.
train_dict = train_df.to_dict(orient='records')
X = dv.fit_transform(train_dict)

# Target: rows with a missing DEP_DELAY are treated as zero delay.
# fillna() returns a new frame rather than mutating a selection of df in
# place, which avoids pandas' SettingWithCopyWarning.
y = df[target].fillna(0)

In [None]:
# train test validation split
from sklearn.model_selection import train_test_split

In [None]:
# The dataset is split 80:10:10 into training, test and validation sets.
# Fraction of the rows that goes to the training set.
train_size=0.8

In [None]:
# Fixed seed so the train/validation/test partition is reproducible between runs.
RANDOM_STATE = 42

# First split: training set versus everything else.
X_train, X_oth, y_train, y_oth = train_test_split(
    X, y, train_size=train_size, random_state=RANDOM_STATE
)

# Second split: halve the remainder into validation and test sets.
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(
    X_oth, y_oth, test_size=test_size, random_state=RANDOM_STATE
)

# Sanity-check the shape of each partition.
for part in (X_train, y_train, X_valid, y_valid, X_test, y_test):
    print(part.shape)

In [None]:
# Target columns of each partition as NumPy arrays.
y_train_values, y_test_values, y_valid_values = (
    part[target].values for part in (y_train, y_test, y_valid)
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

In [None]:
# lr = LinearRegression()
# lr.fit(X_train, y_train_values)

# y_pred = lr.predict(X_test)

# mean_squared_error(y_test_values, y_pred, squared=False)

In [None]:
# ls = Lasso(0.01)
# ls.fit(X_train, y_train_values)

# y_pred = ls.predict(X_test)

# mean_squared_error(y_test_values, y_pred, squared=False)

In [None]:
# Fit an RBF-kernel support-vector regressor and report its RMSE on the test set.
# NOTE(review): scikit-learn documents kernel SVR fit time as growing more than
# quadratically with the number of samples; confirm this is feasible for the
# training-set size used here.
svr_rbf = SVR(C=1.0, epsilon=0.2, kernel='rbf')

# ravel() flattens the single-column target frame into the 1-D array fit() expects.
svr_rbf.fit(X_train, y_train.values.ravel())
y_pred = svr_rbf.predict(X_test)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
mean_squared_error(y_test_values, y_pred) ** 0.5