In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
import pandas as pd

def read_csv_to_dataframe(file_path):
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print("An error occurred:", e)
        return None

In [8]:
data=read_csv_to_dataframe('/home/nalin21478/ML-Flight-Delay-Prediction/Data/smoted_data.csv')

In [9]:
data.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEST,CRS_ELAPSED_TIME,DISTANCE,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,DEP_DELAY,Delayed
0,11,1,5,10,-0.848855,-0.711515,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,-1,0
1,11,1,5,28,1.217334,1.354774,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,-7,0
2,11,1,5,20,-0.372043,-0.224999,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,40,1
3,11,1,5,30,-0.480789,-0.365448,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,-2,0
4,11,1,5,1,-0.723378,-0.572189,0.494669,0.002399,-0.038486,15,1.907431,2.563372,-0.555007,3,-2.33048,-1.390801,-4,0


In [10]:
data=data.drop(['Delayed'],axis=1)

In [11]:
from sklearn.model_selection import train_test_split


X = data.iloc[:, :-1]  
y = data.iloc[:, -1]   

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)


In [12]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import joblib
def evaluate_regressor_with_kfold(X_train, y_train, X_test, y_test, regressor, num_folds=10, model_name=None):

    k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=0)

    mse_scores_train = []
    r2_scores_train = []

    for train_indices, val_indices in k_fold.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_indices], X_train[val_indices]
        y_fold_train, y_fold_val = y_train[train_indices], y_train[val_indices]

        regressor.fit(X_fold_train, y_fold_train)

        y_val_pred = regressor.predict(X_fold_val)

        mse_train = mean_squared_error(y_fold_val, y_val_pred)
        r2_train = r2_score(y_fold_val, y_val_pred)

        mse_scores_train.append(mse_train)
        r2_scores_train.append(r2_train)

    average_mse_train = np.mean(mse_scores_train)
    average_r2_train = np.mean(r2_scores_train)
    print(f'Average Mean Squared Error Train: {average_mse_train}')
    print(f'Average R^2 Score Train: {average_r2_train}')

    y_pred = regressor.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)

    print(f'Mean Squared Error Test: {mse_test}')
    print(f'R^2 Score Test: {r2_test}')

    if model_name is not None:
        joblib.dump(regressor, f'{model_name}.pkl')

    return average_mse_train, average_r2_train, mse_test, r2_test

# KNN Regression

In [8]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(weights='distance', n_neighbors=3, metric='manhattan')
result=evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, knn, num_folds=10, model_name='knn_hypertuned_regression')

Average Mean Squared Error Train: 1677.1200322734353
Average R^2 Score Train: 0.6438484862130844
Mean Squared Error Test: 1569.5645056155624
R^2 Score Test: 0.68191712913501


# GB

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

gb_regressor = GradientBoostingRegressor(subsample=0.9, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_depth=4, learning_rate=0.1)
result=evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, gb_regressor, num_folds=10, model_name='gb_hypertuned_regression')

Average Mean Squared Error Train: 2976.2377742182784
Average R^2 Score Train: 0.37076428453843596
Mean Squared Error Test: 3159.804959174802
R^2 Score Test: 0.35964413747138446


'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 4, 'learning_rate': 0.1

# Adaboost

In [13]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor



adaboost_regressor = AdaBoostRegressor()

result = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, adaboost_regressor, num_folds=10, model_name='adaboost_regression')


Average Mean Squared Error Train: 7371.104643207684
Average R^2 Score Train: -0.5730566560676766
Mean Squared Error Test: 6371.427074334441
R^2 Score Test: -0.2912128224487345


# Ridge and LAsso

In [14]:
from sklearn.linear_model import Ridge, Lasso


ridge_regressor = Ridge(alpha=1.0)  
result_ridge = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, ridge_regressor, num_folds=10, model_name='ridge_regression')



Average Mean Squared Error Train: 4464.481413526193
Average R^2 Score Train: 0.05568228979693714
Mean Squared Error Test: 4644.431328981691
R^2 Score Test: 0.05877455474286852


In [15]:

lasso_regressor = Lasso(alpha=0.1)  # You can adjust the alpha parameter
result_lasso = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, lasso_regressor, num_folds=10, model_name='lasso_regression')


Average Mean Squared Error Train: 4469.523503786137
Average R^2 Score Train: 0.054646689353758035
Mean Squared Error Test: 4652.912151580084
R^2 Score Test: 0.0570558586397506


# Linear Regression

In [22]:

from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression(fit_intercept=True)



# Evaluate the model using your evaluation function
result_linear = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, linear_regressor, num_folds=10, model_name='linear_regression')


Average Mean Squared Error Train: 4464.482738762302
Average R^2 Score Train: 0.055681531543925154
Mean Squared Error Test: 4644.388304950272
R^2 Score Test: 0.05878327385404947


# Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

random_forest_regressor = RandomForestRegressor(n_estimators=230, max_depth=10,min_samples_split=2, bootstrap=True)

result_random_forest = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, random_forest_regressor, num_folds=10, model_name='random_forest_regression')


Average Mean Squared Error Train: 3102.407919822637
Average R^2 Score Train: 0.3414895483571355
Mean Squared Error Test: 3289.5423912427595
R^2 Score Test: 0.3333519687180887


# XGBoost

In [19]:
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score

xgb = XGBRegressor(n_estimators=130)
result_xgb = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, xgb, num_folds=10, model_name='xgb_regression')

Average Mean Squared Error Train: 2199.3990005954397
Average R^2 Score Train: 0.5342254955048535
Mean Squared Error Test: 2315.346337624081
R^2 Score Test: 0.5307793929569187


# Decision Tree

In [20]:
from sklearn.tree import DecisionTreeRegressor



tree_regressor = DecisionTreeRegressor(min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=None)
result_df=evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, tree_regressor, num_folds=10, model_name='decision_tree_regression')

Average Mean Squared Error Train: 3743.065708499535
Average R^2 Score Train: 0.2004470158967444
Mean Squared Error Test: 3819.770978929628
R^2 Score Test: 0.22589755650201782


# SVR

In [21]:
from sklearn.svm import SVR
svr_regressor = SVR()
result_svr = evaluate_regressor_with_kfold(X_train.values, y_train.values, X_test.values, y_test.values, svr_regressor, num_folds=10, model_name='svr_regression')

Average Mean Squared Error Train: 4957.627073095535
Average R^2 Score Train: -0.04902390490461132
Mean Squared Error Test: 5165.627425766063
R^2 Score Test: -0.04684936205421919
