In [14]:
# Import Required Libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE

In [2]:
# Read the preprocessed CSV file
dataset = pd.read_csv('/content/Clean_Dataset.csv')

In [3]:
# Inspect the DataFrame
print(dataset.head())

   Unnamed: 0   airline   flight source_city departure_time stops  \
0           0  SpiceJet  SG-8709       Delhi        Evening  zero   
1           1  SpiceJet  SG-8157       Delhi  Early_Morning  zero   
2           2   AirAsia   I5-764       Delhi  Early_Morning  zero   
3           3   Vistara   UK-995       Delhi        Morning  zero   
4           4   Vistara   UK-963       Delhi        Morning  zero   

    arrival_time destination_city    class  duration  days_left  price  
0          Night           Mumbai  Economy      2.17          1   5953  
1        Morning           Mumbai  Economy      2.33          1   5953  
2  Early_Morning           Mumbai  Economy      2.17          1   5956  
3      Afternoon           Mumbai  Economy      2.25          1   5955  
4        Morning           Mumbai  Economy      2.33          1   5955  


In [4]:
#  Classification Setup
# Create binary target: 1 if price > median, else 0
dataset['price_category'] = (dataset['price'] > dataset['price'].median()).astype(int)

In [5]:
# Define features and target for classification
X_class = dataset[['duration', 'days_left', 'airline', 'source_city',
                   'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']]
y_class = dataset['price_category']

In [6]:
# One-hot encode categorical features
X_class_encoded = pd.get_dummies(X_class, drop_first=True)

In [15]:
# ------------------ RFE ------------------
# Use Logistic Regression as estimator for RFE
base_model = LogisticRegression(max_iter=500, solver='liblinear')

# Select top 10 features (you can change n_features_to_select)
rfe = RFE(estimator=base_model, n_features_to_select=10)
rfe.fit(X_class_encoded, y_class)

# Keep only selected features
selected_features = X_class_encoded.columns[rfe.support_]
print("\nSelected Features using RFE:")
print(selected_features)

X_class_selected = X_class_encoded[selected_features]



Selected Features using RFE:
Index(['airline_Air_India', 'airline_GO_FIRST', 'airline_Indigo',
       'airline_SpiceJet', 'airline_Vistara', 'departure_time_Late_Night',
       'stops_two_or_more', 'stops_zero', 'arrival_time_Early_Morning',
       'class_Economy'],
      dtype='object')


In [9]:
# Split data into training and testing sets
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class_encoded, y_class, test_size=0.2, random_state=0)

# Feature scaling
scaler = StandardScaler()
X_train_c = scaler.fit_transform(X_train_c)
X_test_c = scaler.transform(X_test_c)

In [10]:
#  Train Classification Models
clf_log = LogisticRegression()
clf_rf = RandomForestClassifier(n_estimators=100, random_state=0)
clf_xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0)

clf_log.fit(X_train_c, y_train_c)
clf_rf.fit(X_train_c, y_train_c)
clf_xgb.fit(X_train_c, y_train_c)

In [11]:
#  Evaluate Models
def evaluate_classifier(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

evaluate_classifier(clf_log, X_test_c, y_test_c, "Logistic Regression")
evaluate_classifier(clf_rf, X_test_c, y_test_c, "Random Forest")
evaluate_classifier(clf_xgb, X_test_c, y_test_c, "XGBoost")


Logistic Regression Accuracy: 0.9103
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     30317
           1       0.93      0.89      0.91     29714

    accuracy                           0.91     60031
   macro avg       0.91      0.91      0.91     60031
weighted avg       0.91      0.91      0.91     60031


Random Forest Accuracy: 0.9692
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     30317
           1       0.97      0.96      0.97     29714

    accuracy                           0.97     60031
   macro avg       0.97      0.97      0.97     60031
weighted avg       0.97      0.97      0.97     60031


XGBoost Accuracy: 0.9371
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     30317
           1       0.97      0

In [12]:
#  Save Best Classifier
pickle.dump(clf_xgb, open("Finalized_Classifier_Model.sav", 'wb'))