In [17]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [18]:
#import data set
import os

# The CSV location can be overridden per machine by setting the
# AIRLINE_DATA_PATH environment variable; when it is unset, the original
# local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'AIRLINE_DATA_PATH',
    'D:\\Academic\\SLIIT\\Year 3\\Semester 1\\IT3051 - Fundamentals of Data Mining\\Group Project\\Py codes\\airline_data_feature_processed.csv',
)
df = pd.read_csv(DATA_PATH)

In [19]:
# Display basic info: dimensions, a preview of the rows, and the class counts of the target
print("Dataset Shape:", df.shape)
# sep="\n" puts each heading on its own line directly above the value it introduces
print("\nFirst few rows:", df.head(), sep="\n")
print("\nTarget variable distribution:", df['satisfaction'].value_counts(), sep="\n")

Dataset Shape: (103904, 23)

First few rows:
   Online boarding  Inflight wifi service  Type of Travel  Class  \
0                3                      3               1      2   
1                3                      3               0      0   
2                5                      2               0      0   
3                2                      2               0      0   
4                5                      3               0      0   

   AvgServiceScore  TotalServiceScore  Inflight entertainment  \
0         3.857143                 54                       5   
1         2.285714                 32                       1   
2         3.714286                 52                       5   
3         3.000000                 42                       2   
4         3.500000                 49                       3   

   Ease of Online booking  Flight Distance  Seat comfort  ...  \
0                       3              460             5  ...   
1                       3

In [20]:
# Separate the feature matrix from the label.
# Columns are selected by name instead of by position so that a reordered or
# extended CSV cannot silently change which column is treated as the target.
# NOTE(review): the original positional slice (first 22 columns vs. column 22)
# implies 'satisfaction' is the last of the 23 columns; under that layout the
# arrays produced here are identical to the original ones — confirm.
TARGET_COLUMN = 'satisfaction'
X = df.drop(columns=[TARGET_COLUMN]).values  # every non-target column is used as a feature
y = df[TARGET_COLUMN].values


In [21]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The split is seeded (random_state) and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [22]:
# Standardize features for better model performance.
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Summarize the split sizes and the feature count, one line each
print(
    f"Training set size: {len(X_train)} samples",
    f"Testing set size: {len(X_test)} samples",
    f"Number of features: {X_train.shape[1]}",
    sep="\n",
)

Training set size: 83123 samples
Testing set size: 20781 samples
Number of features: 22


In [23]:
#Model 1
from sklearn.linear_model import LogisticRegression

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MODEL 1: LOGISTIC REGRESSION", "=" * 50, sep="\n")

# Fit on the standardized training features (fit() returns the estimator itself)
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Predicted labels, plus the second predict_proba column as the score for ROC-AUC
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Report performance on the test split
print("\nAccuracy:", accuracy_score(y_test, y_pred_lr))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_lr))
print("\nClassification Report:", classification_report(y_test, y_pred_lr), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_lr), sep="\n")



MODEL 1: LOGISTIC REGRESSION

Accuracy: 0.8769549107357683
ROC-AUC Score: 0.9278575232962364

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     11776
           1       0.87      0.84      0.86      9005

    accuracy                           0.88     20781
   macro avg       0.88      0.87      0.87     20781
weighted avg       0.88      0.88      0.88     20781


Confusion Matrix:
[[10644  1132]
 [ 1425  7580]]


In [24]:
#Model 2
from sklearn.ensemble import RandomForestClassifier

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MODEL 2: RANDOM FOREST CLASSIFIER", "=" * 50, sep="\n")

# Fit on the unscaled training features (Random Forest doesn't require scaling);
# fit() returns the estimator itself
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predicted labels, plus the second predict_proba column as the score for ROC-AUC
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Report performance on the test split
print("\nAccuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_rf))
print("\nClassification Report:", classification_report(y_test, y_pred_rf), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")




MODEL 2: RANDOM FOREST CLASSIFIER

Accuracy: 0.9620807468360522
ROC-AUC Score: 0.9939722638615625

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     11776
           1       0.97      0.94      0.96      9005

    accuracy                           0.96     20781
   macro avg       0.96      0.96      0.96     20781
weighted avg       0.96      0.96      0.96     20781


Confusion Matrix:
[[11486   290]
 [  498  8507]]


In [25]:
#Model 3
from sklearn.svm import SVC

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MODEL 3: SUPPORT VECTOR MACHINE (SVM)", "=" * 50, sep="\n")

# RBF-kernel SVM; probability=True is required for the predict_proba call
# made in the evaluation cell.
svm_model = SVC(
    kernel='rbf',
    probability=True,
    random_state=42,
)
# Kept as a bare expression so the cell's displayed result (the fitted
# estimator returned by fit) stays the same.
svm_model.fit(X_train_scaled, y_train)




MODEL 3: SUPPORT VECTOR MACHINE (SVM)


In [26]:
#SVC
# Predicted labels, plus the second predict_proba column as the score for ROC-AUC
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_proba_svm = svm_model.predict_proba(X_test_scaled)[:, 1]

# Report performance on the test split
print("\nAccuracy:", accuracy_score(y_test, y_pred_svm))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_svm))
print("\nClassification Report:", classification_report(y_test, y_pred_svm), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_svm), sep="\n")



Accuracy: 0.9560656368798421
ROC-AUC Score: 0.989803219226034

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     11776
           1       0.96      0.94      0.95      9005

    accuracy                           0.96     20781
   macro avg       0.96      0.95      0.96     20781
weighted avg       0.96      0.96      0.96     20781


Confusion Matrix:
[[11413   363]
 [  550  8455]]


In [27]:
#Model 4
from sklearn.ensemble import GradientBoostingClassifier

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MODEL 4: GRADIENT BOOSTING CLASSIFIER", "=" * 50, sep="\n")

# Fit on the unscaled training features (fit() returns the estimator itself)
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels, plus the second predict_proba column as the score for ROC-AUC
y_pred_gb = gb_model.predict(X_test)
y_pred_proba_gb = gb_model.predict_proba(X_test)[:, 1]

# Report performance on the test split
print("\nAccuracy:", accuracy_score(y_test, y_pred_gb))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_gb))
print("\nClassification Report:", classification_report(y_test, y_pred_gb), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_gb), sep="\n")


MODEL 4: GRADIENT BOOSTING CLASSIFIER

Accuracy: 0.9414850103459891
ROC-AUC Score: 0.987864875982244

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     11776
           1       0.94      0.92      0.93      9005

    accuracy                           0.94     20781
   macro avg       0.94      0.94      0.94     20781
weighted avg       0.94      0.94      0.94     20781


Confusion Matrix:
[[11237   539]
 [  677  8328]]


In [28]:
#Model 5
from sklearn.neighbors import KNeighborsClassifier

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 50, "MODEL 5: K-NEAREST NEIGHBORS (KNN)", "=" * 50, sep="\n")

# Fit on the standardized training features (fit() returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predicted labels, plus the second predict_proba column as the score for ROC-AUC
y_pred_knn = knn_model.predict(X_test_scaled)
y_pred_proba_knn = knn_model.predict_proba(X_test_scaled)[:, 1]

# Report performance on the test split
print("\nAccuracy:", accuracy_score(y_test, y_pred_knn))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_knn))
print("\nClassification Report:", classification_report(y_test, y_pred_knn), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_knn), sep="\n")


MODEL 5: K-NEAREST NEIGHBORS (KNN)

Accuracy: 0.9310909003416582
ROC-AUC Score: 0.9719708904548802

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     11776
           1       0.95      0.89      0.92      9005

    accuracy                           0.93     20781
   macro avg       0.93      0.93      0.93     20781
weighted avg       0.93      0.93      0.93     20781


Confusion Matrix:
[[11344   432]
 [ 1000  8005]]


In [29]:
# Create comparison dataframe: one row per model, scored against y_test,
# ordered from highest to lowest accuracy
results = pd.DataFrame(
    [
        (model_name, accuracy_score(y_test, hard_preds), roc_auc_score(y_test, class1_scores))
        for model_name, hard_preds, class1_scores in (
            ('Logistic Regression', y_pred_lr, y_pred_proba_lr),
            ('Random Forest', y_pred_rf, y_pred_proba_rf),
            ('SVM', y_pred_svm, y_pred_proba_svm),
            ('Gradient Boosting', y_pred_gb, y_pred_proba_gb),
            ('KNN', y_pred_knn, y_pred_proba_knn),
        )
    ],
    columns=['Model', 'Accuracy', 'ROC-AUC'],
).sort_values('Accuracy', ascending=False)

print("\n", results.to_string(index=False))
# After sorting, the first row is the model with the highest accuracy
print(
    "\n" + "=" * 50,
    f"Best Model: {results.iloc[0]['Model']} with Accuracy: {results.iloc[0]['Accuracy']:.4f}",
    "=" * 50,
    sep="\n",
)


               Model  Accuracy  ROC-AUC
      Random Forest  0.962081 0.993972
                SVM  0.956066 0.989803
  Gradient Boosting  0.941485 0.987865
                KNN  0.931091 0.971971
Logistic Regression  0.876955 0.927858

Best Model: Random Forest with Accuracy: 0.9621
