In [1]:
import pandas as pd

def read_csv_to_dataframe(file_path):
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print("An error occurred:", e)
        return None
    
    

In [2]:
scaled_df = read_csv_to_dataframe("/home/nalin21478/ML-Flight-Delay-Prediction/Data/smoted_data.csv")

In [3]:
scaled_df=scaled_df.drop(['DEP_DELAY'],axis=1)

In [4]:
scaled_df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEST,CRS_ELAPSED_TIME,DISTANCE,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,Delayed
0,11,1,5,10,-0.848855,-0.711515,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
1,11,1,5,28,1.217334,1.354774,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
2,11,1,5,20,-0.372043,-0.224999,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,1
3,11,1,5,30,-0.480789,-0.365448,0.761435,0.184485,-0.038486,15,2.069328,2.821704,-0.727985,3,-2.33048,-1.390801,0
4,11,1,5,1,-0.723378,-0.572189,0.494669,0.002399,-0.038486,15,1.907431,2.563372,-0.555007,3,-2.33048,-1.390801,0


In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import joblib

def evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, classifier, num_folds=10, model_name=None):

    k_fold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=0)
    

    accuracies = []


    for train_indices, val_indices in k_fold.split(X_train, y_train):
        X_fold_train, X_fold_val = X_train[train_indices], X_train[val_indices]
        y_fold_train, y_fold_val = y_train[train_indices], y_train[val_indices]

        
        classifier.fit(X_fold_train, y_fold_train)

    
        y_val_pred = classifier.predict(X_fold_val)

     
        fold_accuracy = accuracy_score(y_fold_val, y_val_pred)
        accuracies.append(fold_accuracy)

   
    average_accuracy = sum(accuracies) / num_folds
    print(f'Average Accuracy Train: {average_accuracy*100}')

    
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy Test: {accuracy*100:.2f}%')
    
    
    class_report = classification_report(y_test, y_pred)
    print('Classification Report:')
    print(class_report)
    
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:')
    print(conf_matrix)
    if model_name is not None:
        joblib.dump(classifier, f'{model_name}.pkl') 
    return class_report,confusion_matrix





In [6]:
from sklearn.model_selection import train_test_split
Independent_features = scaled_df.iloc[:, :-1].values
dependent_feature = scaled_df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(
        Independent_features, dependent_feature, test_size=0.2, random_state=0
    )

In [7]:
from sklearn.tree import DecisionTreeClassifier
print("DECISION TREE\n")
decision_tree=DecisionTreeClassifier(criterion='entropy',splitter='best',class_weight='balanced')

evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, decision_tree, num_folds=10,model_name='decision_tree_entropy_smote_strat'
                               )

DECISION TREE

Average Accuracy Train: 86.69248307118102
Accuracy Test: 86.61%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      4623
           1       0.87      0.87      0.87      4754

    accuracy                           0.87      9377
   macro avg       0.87      0.87      0.87      9377
weighted avg       0.87      0.87      0.87      9377

Confusion Matrix:
[[4007  616]
 [ 640 4114]]


('              precision    recall  f1-score   support\n\n           0       0.86      0.87      0.86      4623\n           1       0.87      0.87      0.87      4754\n\n    accuracy                           0.87      9377\n   macro avg       0.87      0.87      0.87      9377\nweighted avg       0.87      0.87      0.87      9377\n',
 <function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>)

In [8]:
from sklearn.ensemble import RandomForestClassifier
print("RANDOM FOREST CLASSIFIER \n")
random_forest_classifier = RandomForestClassifier(criterion='entropy',n_estimators=1700)

random_forest_classifier_report,random_forest_classifier_matrix=evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, random_forest_classifier, num_folds=10,model_name='random_forest_smote_strat')


RANDOM FOREST CLASSIFIER 



In [None]:

from sklearn.naive_bayes import GaussianNB
print("NAIVE BAYES CLASSIFIER \n")

# Initialize the Naive Bayes Classifier
nb_classifier = GaussianNB(var_smoothing=0.001)

evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, nb_classifier, num_folds=10,model_name='naive_bayes_smote_strat')


: 

In [None]:

from sklearn.neighbors import KNeighborsClassifier



knn_classifier = KNeighborsClassifier()




print("KNN CLASSIFIER \n")
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, knn_classifier, num_folds=10,model_name='knn_classifier_smote_strat')



: 

In [None]:

from sklearn.ensemble import GradientBoostingClassifier


gb_classifier = GradientBoostingClassifier(subsample=1.0, n_estimators=950)
print("GRADIENT BOOSTING CLASSIFIER \n")
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, gb_classifier, num_folds=10,model_name='gb_classifier_smote_strat')


: 

In [None]:
from sklearn.linear_model import LogisticRegression

logistic_regression_classifier = LogisticRegression(C=0.001,max_iter=100,multi_class='ovr',penalty='l1',solver='liblinear')

print("LOGISTIC REGRESSION CLASSIFIER \n")
logistic_regression_report, logistic_regression_matrix = evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, logistic_regression_classifier, num_folds=10,model_name='logistic_regression_smote_strat')


: 

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

print("XGBoost CLASSIFIER \n")

xgb_classifier = xgb.XGBClassifier(random_state=0)
evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, xgb_classifier, num_folds=10,model_name='xgb_classifier_smote_strat')


: 

In [None]:
print("SVM CLASSIFIER \n")
from sklearn.svm import SVC
svm__linear_classifier = SVC(probability=True)

svm__linear_classifier_report,svm__linear_classifier_matrix=evaluate_classifier_with_kfold(X_train, y_train, X_test, y_test, svm__linear_classifier, num_folds=10,model_name='svm__smote_strat')


: 