### Experimentation for Flight Delay Prediction

In [1]:
import pandas as pd
from datetime import datetime
import mlflow

In [2]:
# initialize mlflow params

# above all, we need to run mlflow server by executing:
#  mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root=artifacts
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("flight-delay-experiment")

<Experiment: artifact_location='artifacts/1', experiment_id='1', lifecycle_stage='active', name='flight-delay-experiment', tags={}>

The dataset source: https://www.kaggle.com/datasets/yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018?resource=download&select=2018.csv

In [3]:
target = ['DEP_DELAY']
data_path = '../data/2018.csv'

In [4]:
def read_df(file_path, dataset_size_in_million):
    df_chuncks = pd.read_csv(file_path, chunksize=1000000)
    df = pd.DataFrame({"columns":[]})
    index = 0
    for ck in df_chuncks:
        if index < dataset_size_in_million:
            df = pd.concat([df, ck], axis=0, ignore_index=True)
        index+=1
    return df

In [5]:
def extract_dep_hour(dep_time):
    dep_time = str(int(dep_time))
    
    if len(dep_time) == 3:
        dep_time = f'0{dep_time}'
    return dep_time

In [6]:
def preprocessing(df_source):
    df= df_source[['FL_DATE', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME', 'DISTANCE', 'CRS_DEP_TIME', 'DEP_DELAY']]
    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
    df['FL_DAY'] = df['FL_DATE'].dt.day_name()
    df['DEP_HOUR_MIN'] = df['CRS_DEP_TIME'].apply(lambda x: extract_dep_hour(x))
    df['DEP_HOUR'] = df['DEP_HOUR_MIN'].apply(lambda x: x[:2])
    df['DEP_MIN'] = df['DEP_HOUR_MIN'].apply(lambda x: x[2:])
    df['DEP_DELAY'] = df['DEP_DELAY'].apply(lambda x: abs(x))
    df= df[['FL_DAY','OP_CARRIER', 'ORIGIN', 'DEST', 'DISTANCE', 'DEP_HOUR', 'DEP_MIN', 'DEP_DELAY']]
    
    categorical = ['FL_DAY','OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_HOUR', 'DEP_MIN']
    numerical = ['DISTANCE']
    df[categorical] = df[categorical].astype(str)
    
    train_df = df.drop(columns = target).copy()
    
    return train_df

In [7]:

df = read_df(data_path, 2)
train_df = preprocessing(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['FL_DAY'] = df['FL_DATE'].dt.day_name()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DEP_HOUR_MIN'] = df['CRS_DEP_TIME'].apply(lambda x: extract_dep_hour(x))
A value is trying to be set on a copy o

In [8]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()

train_dict = train_df.to_dict(orient='records')
X = dv.fit_transform(train_dict)
y = df[target]

# missing values -> 0
y.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.fillna(0, inplace=True)


In [9]:
# train test validation split
from sklearn.model_selection import train_test_split
# we are going to split our dataset into 80:10:10 as training:test:validation respectively
train_size=0.8
# split the data in training and other dataset
X_train, X_oth, y_train, y_oth = train_test_split(X, y, train_size=0.8)

# for the other data which is the remaining one, we split it into test and validation
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_oth, y_oth, test_size=0.5)

print('X_train: ', X_train.shape) 
print('y_train: ', y_train.shape)
print('X_valid: ', X_valid.shape) 
print('y_valid: ', y_valid.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

X_train:  (1600000, 831)
y_train:  (1600000, 1)
X_valid:  (200000, 831)
y_valid:  (200000, 1)
X_test:  (200000, 831)
y_test:  (200000, 1)


In [12]:
# the models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR

from sklearn.metrics import mean_squared_error

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import xgboost as xgb
import pickle

In [13]:
# train xgboost model
def best_model_search(train, valid, y_val):
    def objective(params):
        with mlflow.start_run():
            mlflow.set_tag("model", "xgboost")
            mlflow.log_params(params)
            booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=800,
                evals=[(valid, 'validation')],
                early_stopping_rounds=30
            )
            y_pred = booster.predict(valid)
            rmse = mean_squared_error(y_val, y_pred, squared=False)
            mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:linear',
        'seed': 42
    }

    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=1,
        trials=Trials()
    )
    return



In [17]:

mlflow.sklearn.autolog()
model_files = ['lin_reg.bin', 'lasso.bin', 'lvr.bin', 'xgb.bin']

for index, model_class in enumerate([LinearRegression, Lasso, LinearSVR, xgb]):

    if model_files[index] == 'xgb.bin':
        train = xgb.DMatrix(X_train, label=y_train.values)
        valid = xgb.DMatrix(X_valid, label=y_valid.values)
        best_model_search(train, valid, y_valid.values)
        

    else:
        with mlflow.start_run():

            mlflow.set_tag("model", model_files[index])
            mlflow.log_param("data-path", data_path)

            mlmodel = model_class()
            mlmodel.fit(X_train, y_train.values.ravel())


            with open(f'models/{model_files[index]}', 'wb') as f_out:
                pickle.dump((dv, mlmodel), f_out)

            y_pred = mlmodel.predict(X_valid)
            rmse = mean_squared_error(y_valid, y_pred, squared=False)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_artifact(local_path=f"models/{model_files[index]}", artifact_path="models_pickle")



[0]	validation-rmse:42.90899                                                                                                                                                                                              
[1]	validation-rmse:42.81423                                                                                                                                                                                              
[2]	validation-rmse:42.72980                                                                                                                                                                                              
[3]	validation-rmse:42.65444                                                                                                                                                                                              
[4]	validation-rmse:42.58740                                                                                                

[73]	validation-rmse:41.89688                                                                                                                                                                                             
[74]	validation-rmse:41.89648                                                                                                                                                                                             
[75]	validation-rmse:41.89440                                                                                                                                                                                             
[76]	validation-rmse:41.89312                                                                                                                                                                                             
[77]	validation-rmse:41.89152                                                                                               

[147]	validation-rmse:41.82754                                                                                                                                                                                            
[148]	validation-rmse:41.82742                                                                                                                                                                                            
[149]	validation-rmse:41.82730                                                                                                                                                                                            
[150]	validation-rmse:41.82664                                                                                                                                                                                            
[151]	validation-rmse:41.82606                                                                                              

[221]	validation-rmse:41.80309                                                                                                                                                                                            
[222]	validation-rmse:41.80267                                                                                                                                                                                            
[223]	validation-rmse:41.80216                                                                                                                                                                                            
[224]	validation-rmse:41.80244                                                                                                                                                                                            
[225]	validation-rmse:41.80228                                                                                              

[295]	validation-rmse:41.79169                                                                                                                                                                                            
[296]	validation-rmse:41.79174                                                                                                                                                                                            
[297]	validation-rmse:41.79146                                                                                                                                                                                            
[298]	validation-rmse:41.79160                                                                                                                                                                                            
[299]	validation-rmse:41.79157                                                                                              

[369]	validation-rmse:41.78885                                                                                                                                                                                            
[370]	validation-rmse:41.78890                                                                                                                                                                                            
[371]	validation-rmse:41.78886                                                                                                                                                                                            
[372]	validation-rmse:41.78866                                                                                                                                                                                            
[373]	validation-rmse:41.78858                                                                                              