In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [3]:
dataset_simplified = pd.read_csv('/content/Clean_Dataset.csv')

In [5]:
dependent_variable = dataset_simplified[['class']]

In [6]:
independent_variables = dataset_simplified.drop(['class', 'Unnamed: 0', 'flight'], axis=1)

In [7]:
independent_variables_encoded = pd.get_dummies(independent_variables, drop_first=True)

In [8]:
independent_variables_encoded['duration_x_days_left'] = independent_variables_encoded['duration'] * independent_variables_encoded['days_left']

In [10]:
if 'price' in independent_variables_encoded.columns: # Check if 'price' is still in features after dropping 'class'
    poly = PolynomialFeatures(degree=2, include_bias=False)
    # Ensure the columns exist before transforming
    cols_for_poly = ['duration', 'price']
    existing_cols_for_poly = [col for col in cols_for_poly if col in independent_variables_encoded.columns]
    if existing_cols_for_poly:
        # Drop rows with NaN values before applying polynomial features
        independent_variables_encoded_cleaned = independent_variables_encoded.dropna(subset=existing_cols_for_poly)
        poly_features = poly.fit_transform(independent_variables_encoded_cleaned[existing_cols_for_poly])

        # Create a DataFrame from the polynomial features
        poly_feature_names = poly.get_feature_names_out(existing_cols_for_poly)
        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=independent_variables_encoded_cleaned.index)

        # Concatenate the new polynomial features
        independent_variables_encoded = pd.concat([independent_variables_encoded_cleaned, poly_df], axis=1)
    else:
        print("Columns for polynomial features ('duration', 'price') not found after preprocessing.")
else:
    print("'price' column not found in independent variables after dropping 'class'. Skipping polynomial features for 'price'.")
    # If 'price' is not in the independent variables, we can still create polynomial features for 'duration' if it exists
    if 'duration' in independent_variables_encoded.columns:
         # Drop rows with NaN values before applying polynomial features
         independent_variables_encoded_cleaned = independent_variables_encoded.dropna(subset=['duration'])
         poly = PolynomialFeatures(degree=2, include_bias=False)
         poly_features = poly.fit_transform(independent_variables_encoded_cleaned[['duration']])
         poly_feature_names = poly.get_feature_names_out(['duration'])
         poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=independent_variables_encoded_cleaned.index)
         independent_variables_encoded = pd.concat([independent_variables_encoded_cleaned, poly_df], axis=1)

In [12]:
# Ensure the dependent variable has the same index as the independent variables after dropping rows
dependent_variable_aligned = dependent_variable.loc[independent_variables_encoded.index]

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(independent_variables_encoded, dependent_variable_aligned.values.ravel(), test_size=0.30, random_state=0)

In [15]:
from sklearn.feature_selection import RFE

# Initialize RFE with a Decision Tree classifier and the number of features to select
# You might want to experiment with the number of features to select
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10) # Example: selecting 10 features

# Fit RFE to the training data to select the most important features
rfe.fit(X_train_cls, y_train_cls)

# Get the selected features
selected_features_rfe = X_train_cls.columns[rfe.support_]

print("Selected features by RFE:")
print(list(selected_features_rfe))

# Transform the training and testing data to include only the selected features
X_train_cls_rfe = rfe.transform(X_train_cls)
X_test_cls_rfe = rfe.transform(X_test_cls)

Selected features by RFE:
['days_left', 'price', 'airline_Indigo', 'source_city_Chennai', 'source_city_Hyderabad', 'source_city_Kolkata', 'price', 'duration^2', 'duration price', 'price^2']


In [16]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

print("--- Decision Tree Classifier ---")
classifier_dt_simplified = DecisionTreeClassifier(random_state=0)
classifier_dt_simplified.fit(X_train_cls_rfe, y_train_cls)
y_pred_dt = classifier_dt_simplified.predict(X_test_cls_rfe)
accuracy_dt = accuracy_score(y_test_cls, y_pred_dt)
f1_dt = f1_score(y_test_cls, y_pred_dt, average='weighted') # Use weighted average for multiclass
recall_dt = recall_score(y_test_cls, y_pred_dt, average='weighted') # Use weighted average for multiclass
print("Accuracy:", accuracy_dt)
print("F1 Score:", f1_dt)
print("Recall:", recall_dt)
print("-" * 30)

# Logistic Regression
print("--- Logistic Regression ---")
classifier_lr = LogisticRegression(random_state=0, max_iter=1000)
classifier_lr.fit(X_train_cls_rfe, y_train_cls)
y_pred_lr = classifier_lr.predict(X_test_cls_rfe)
accuracy_lr = accuracy_score(y_test_cls, y_pred_lr)
f1_lr = f1_score(y_test_cls, y_pred_lr, average='weighted') # Use weighted average for multiclass
recall_lr = recall_score(y_test_cls, y_pred_lr, average='weighted') # Use weighted average for multiclass
print("Accuracy:", accuracy_lr)
print("F1 Score:", f1_lr)
print("Recall:", recall_lr)
print("-" * 30)

# Random Forest Classifier
print("--- Random Forest Classifier ---")
classifier_rf = RandomForestClassifier(n_estimators=100, random_state=0)
classifier_rf.fit(X_train_cls_rfe, y_train_cls)
y_pred_rf = classifier_rf.predict(X_test_cls_rfe)
accuracy_rf = accuracy_score(y_test_cls, y_pred_rf)
f1_rf = f1_score(y_test_cls, y_pred_rf, average='weighted') # Use weighted average for multiclass
recall_rf = recall_score(y_test_cls, y_pred_rf, average='weighted') # Use weighted average for multiclass
print("Accuracy:", accuracy_rf)
print("F1 Score:", f1_rf)
print("Recall:", recall_rf)
print("-" * 30)

# Support Vector Machine (SVM) Classifier
print("--- Support Vector Machine (SVM) Classifier ---")
# Note: SVMs can be computationally expensive on large datasets
classifier_svm = SVC(random_state=0)
classifier_svm.fit(X_train_cls_rfe, y_train_cls)
y_pred_svm = classifier_svm.predict(X_test_cls_rfe)
accuracy_svm = accuracy_score(y_test_cls, y_pred_svm)
f1_svm = f1_score(y_test_cls, y_pred_svm, average='weighted') # Use weighted average for multiclass
recall_svm = recall_score(y_test_cls, y_pred_svm, average='weighted') # Use weighted average for multiclass
print("Accuracy:", accuracy_svm)
print("F1 Score:", f1_svm)
print("Recall:", recall_svm)
print("-" * 30)

--- Decision Tree Classifier ---
Accuracy: 0.9998905024568511
F1 Score: 0.999890518682968
Recall: 0.9998905024568511
------------------------------
--- Logistic Regression ---
Accuracy: 0.9969066944060443
F1 Score: 0.9969090963099243
Recall: 0.9969066944060443
------------------------------
--- Random Forest Classifier ---
Accuracy: 0.9999178768426383
F1 Score: 0.9999178859708641
Recall: 0.9999178768426383
------------------------------
--- Support Vector Machine (SVM) Classifier ---
Accuracy: 0.9899399132231971
F1 Score: 0.9898970430914833
Recall: 0.9899399132231971
------------------------------
