# Function to read a df

In [1]:
import pandas as pd

def read_csv_to_dataframe(file_path):
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print("An error occurred:", e)
        return None
    
    

In [2]:
data = read_csv_to_dataframe("/home/nalin21478/ML-Flight-Delay-Prediction/Data/smoted_data.csv")

# dropping Dep_delay column

In [3]:
data=data.drop(['DEP_DELAY'],axis=1)

In [4]:
data.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEST,CRS_ELAPSED_TIME,DISTANCE,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,Delayed
0,11,1,5,10,-0.848855,-0.711515,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
1,11,1,5,28,1.217334,1.354774,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
2,11,1,5,20,-0.372043,-0.224999,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,1
3,11,1,5,30,-0.480789,-0.365448,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
4,11,1,5,1,-0.723378,-0.572189,0.494669,0.002399,-0.038486,15,1.907431,2.563372,-0.555007,3,-2.33048,-1.390801,0


# function to do Kfold

In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
import joblib

def evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, classifier, num_folds=10, model_name=None):

    k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=0)
    

    accuracies = []


    for train_indices, val_indices in k_fold.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_indices], X_train[val_indices]
        y_fold_train, y_fold_val = y_train[train_indices], y_train[val_indices]

        
        classifier.fit(X_fold_train, y_fold_train)

    
        y_val_pred = classifier.predict(X_fold_val)

     
        fold_accuracy = accuracy_score(y_fold_val, y_val_pred)
        accuracies.append(fold_accuracy)

   
    average_accuracy = sum(accuracies) / num_folds
    print(f'Average Accuracy Train: {average_accuracy*100}')

    
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy Test: {accuracy*100:.2f}%')
    
    
    class_report = classification_report(y_test, y_pred)
    print('Classification Report:')
    print(class_report)
    
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:')
    print(conf_matrix)
    if model_name is not None:
        joblib.dump(classifier, f'{model_name}.pkl') 
    return class_report,confusion_matrix





# train test split

In [6]:
from sklearn.model_selection import train_test_split
Independent_features = data.iloc[:, :-1].values
dependent_feature = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
        Independent_features, dependent_feature, test_size=0.2, random_state=0
    )

# decision tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
print("DECISION TREE\n")
decision_tree=DecisionTreeClassifier(criterion='entropy',splitter='best',class_weight='balanced')

evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, decision_tree, num_folds=10,model_name='decision_tree_entropy_smote'
                               )

DECISION TREE

Average Accuracy Train: 86.80182209188659
Accuracy Test: 86.67%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      4623
           1       0.87      0.87      0.87      4754

    accuracy                           0.87      9377
   macro avg       0.87      0.87      0.87      9377
weighted avg       0.87      0.87      0.87      9377

Confusion Matrix:
[[4009  614]
 [ 636 4118]]


('              precision    recall  f1-score   support\n\n           0       0.86      0.87      0.87      4623\n           1       0.87      0.87      0.87      4754\n\n    accuracy                           0.87      9377\n   macro avg       0.87      0.87      0.87      9377\nweighted avg       0.87      0.87      0.87      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

# random forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
print("RANDOM FOREST CLASSIFIER \n")
random_forest_classifier = RandomForestClassifier(criterion='entropy',n_estimators=1700)

random_forest_classifier_report,random_forest_classifier_matrix=evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, random_forest_classifier, num_folds=10,model_name='random_forest_smote')


RANDOM FOREST CLASSIFIER 

Average Accuracy Train: 91.11853052519328
Accuracy Test: 91.29%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      4623
           1       0.93      0.89      0.91      4754

    accuracy                           0.91      9377
   macro avg       0.91      0.91      0.91      9377
weighted avg       0.91      0.91      0.91      9377

Confusion Matrix:
[[4308  315]
 [ 502 4252]]


# Naive bayes

In [41]:

from sklearn.naive_bayes import GaussianNB
print("NAIVE BAYES CLASSIFIER \n")

# Initialize the Naive Bayes Classifier
nb_classifier = GaussianNB(var_smoothing=0.001)

evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, nb_classifier, num_folds=10,model_name='naive_bayes_smote')


NAIVE BAYES CLASSIFIER 

Average Accuracy Train: 58.00825593175153
Accuracy Test: 58.14%
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.51      0.55      4623
           1       0.58      0.65      0.61      4754

    accuracy                           0.58      9377
   macro avg       0.58      0.58      0.58      9377
weighted avg       0.58      0.58      0.58      9377

Confusion Matrix:
[[2356 2267]
 [1658 3096]]


('              precision    recall  f1-score   support\n\n           0       0.59      0.51      0.55      4623\n           1       0.58      0.65      0.61      4754\n\n    accuracy                           0.58      9377\n   macro avg       0.58      0.58      0.58      9377\nweighted avg       0.58      0.58      0.58      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

# KNN

In [40]:

from sklearn.neighbors import KNeighborsClassifier



knn_classifier = KNeighborsClassifier()




print("KNN CLASSIFIER \n")
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, knn_classifier, num_folds=10,model_name='knn_classifier_smote')



KNN CLASSIFIER 

Average Accuracy Train: 80.93056500488757
Accuracy Test: 81.21%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.69      0.78      4623
           1       0.76      0.93      0.83      4754

    accuracy                           0.81      9377
   macro avg       0.83      0.81      0.81      9377
weighted avg       0.83      0.81      0.81      9377

Confusion Matrix:
[[3198 1425]
 [ 337 4417]]


('              precision    recall  f1-score   support\n\n           0       0.90      0.69      0.78      4623\n           1       0.76      0.93      0.83      4754\n\n    accuracy                           0.81      9377\n   macro avg       0.83      0.81      0.81      9377\nweighted avg       0.83      0.81      0.81      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

# Gradient Boost

In [33]:

from sklearn.ensemble import GradientBoostingClassifier


gb_classifier = GradientBoostingClassifier(subsample=1.0, n_estimators=950)
print("GRADIENT BOOSTING CLASSIFIER \n")
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, gb_classifier, num_folds=10,model_name='gb_classifier_smote')


GRADIENT BOOSTING CLASSIFIER 



Average Accuracy Train: 91.8917593530614
Accuracy Test: 91.96%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      4623
           1       0.98      0.86      0.92      4754

    accuracy                           0.92      9377
   macro avg       0.93      0.92      0.92      9377
weighted avg       0.93      0.92      0.92      9377

Confusion Matrix:
[[4537   86]
 [ 668 4086]]


('              precision    recall  f1-score   support\n\n           0       0.87      0.98      0.92      4623\n           1       0.98      0.86      0.92      4754\n\n    accuracy                           0.92      9377\n   macro avg       0.93      0.92      0.92      9377\nweighted avg       0.93      0.92      0.92      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

# multi class logistic regression

In [44]:
from sklearn.linear_model import LogisticRegression

logistic_regression_classifier = LogisticRegression(C=0.001,max_iter=100,multi_class='ovr',penalty='l1',solver='liblinear')

print("LOGISTIC REGRESSION CLASSIFIER \n")
logistic_regression_report, logistic_regression_matrix = evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, logistic_regression_classifier, num_folds=10,model_name='logistic_regression_smote')


LOGISTIC REGRESSION CLASSIFIER 

Average Accuracy Train: 55.44325459877365
Accuracy Test: 55.34%
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.54      0.55      4623
           1       0.56      0.56      0.56      4754

    accuracy                           0.55      9377
   macro avg       0.55      0.55      0.55      9377
weighted avg       0.55      0.55      0.55      9377

Confusion Matrix:
[[2518 2105]
 [2083 2671]]


# XGboost

In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

print("XGBoost CLASSIFIER \n")

xgb_classifier = xgb.XGBClassifier(subsample=0.8, reg_lambda=0.5, reg_alpha=1, n_estimators=800, max_depth=900, learning_rate=0.01, gamma=2, colsample_bytree=0.6)
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, xgb_classifier, num_folds=10,model_name='xgb_classifier_smote')


XGBoost CLASSIFIER 

Average Accuracy Train: 92.79564631653781
Accuracy Test: 92.66%
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      4623
           1       0.98      0.87      0.92      4754

    accuracy                           0.93      9377
   macro avg       0.93      0.93      0.93      9377
weighted avg       0.93      0.93      0.93      9377

Confusion Matrix:
[[4541   82]
 [ 606 4148]]


('              precision    recall  f1-score   support\n\n           0       0.88      0.98      0.93      4623\n           1       0.98      0.87      0.92      4754\n\n    accuracy                           0.93      9377\n   macro avg       0.93      0.93      0.93      9377\nweighted avg       0.93      0.93      0.93      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

# SVM

In [8]:
print("SVM CLASSIFIER \n")
from sklearn.svm import SVC
svm__linear_classifier = SVC(probability=True)

svm__linear_classifier_report,svm__linear_classifier_matrix=evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, svm__linear_classifier, num_folds=10,model_name='svm__smote_strat')


SVM CLASSIFIER 

Average Accuracy Train: 62.01038371989692
Accuracy Test: 62.40%
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.64      0.63      4623
           1       0.63      0.61      0.62      4754

    accuracy                           0.62      9377
   macro avg       0.62      0.62      0.62      9377
weighted avg       0.62      0.62      0.62      9377

Confusion Matrix:
[[2946 1677]
 [1849 2905]]


# Adaboost

In [9]:
from sklearn.ensemble import AdaBoostClassifier


ab_classifier = AdaBoostClassifier()
print("ADABOOST CLASSIFIER \n")
ab_classifier_report,ab_classifier_matrix=evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, ab_classifier, num_folds=10,model_name='ab_classifier_imbalanced')
#

ADABOOST CLASSIFIER 

Average Accuracy Train: 74.41142593086289
Accuracy Test: 74.31%
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      4623
           1       0.75      0.75      0.75      4754

    accuracy                           0.74      9377
   macro avg       0.74      0.74      0.74      9377
weighted avg       0.74      0.74      0.74      9377

Confusion Matrix:
[[3418 1205]
 [1204 3550]]
