# Modeling: Airline Passenger Satisfaction

In [None]:

# Load and basic preprocess
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

# Data
# UTF-8 decodes a pure-ASCII file exactly as 'ascii' would, but does not
# raise UnicodeDecodeError if a single non-ASCII byte appears in the export.
df = pd.read_csv('cleaned_airline_passenger_satisfaction.csv', encoding='utf-8')
if 'Arrival Delay in Minutes' in df.columns:
    # Missing arrival delays are imputed with 0 minutes.
    df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(0)

# Target: 1 when the label is "satisfied", 0 for every other label.
# Labels are stripped and lower-cased so stray whitespace or casing in the
# CSV cannot silently turn every row into the negative class.
labels = df['satisfaction'].astype(str).str.strip().str.lower()
df['target'] = (labels == 'satisfied').astype(int)
X = df.drop(columns=['satisfaction', 'target'])
y = df['target']

# Features
# 'number' covers every numeric dtype (int32, float32, nullable ints, ...),
# not only int64/float64.
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# ColumnTransformer drops any column that is not listed (remainder='drop' is
# the default), so make that visible instead of losing features silently.
used_cols = set(cat_cols) | set(num_cols)
unused_cols = [col for col in X.columns if col not in used_cols]
if unused_cols:
    import warnings
    warnings.warn(f'Columns ignored by the preprocessor: {unused_cols}')

# One-hot encode categoricals (unknown categories at predict time become
# all-zero rows); numeric columns are passed through unchanged.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols)
])

# Stratified 80/20 split with a fixed seed so the class balance and the split
# itself are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape)
print(X_test.shape)


In [None]:

# Logistic Regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

# The shared `pre` passes numeric columns through unscaled. A regularised
# linear model is sensitive to feature scale (the penalty acts on raw
# coefficient magnitudes and the solver converges slowly on badly scaled
# inputs), so this baseline gets its own preprocessor that standardises the
# numeric columns. Categorical handling is the same as in `pre`.
logit_pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols)
])

logit = Pipeline([
    ('pre', logit_pre),
    ('clf', LogisticRegression(max_iter=1000))
])
logit.fit(X_train, y_train)

# Hard predictions feed the classification report; the positive-class
# probability (column 1) feeds ROC AUC.
y_pred = logit.predict(X_test)
y_proba = logit.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_proba))

ConfusionMatrixDisplay.from_estimator(logit, X_test, y_test)
plt.title('Logistic Regression Confusion Matrix')
plt.show()

RocCurveDisplay.from_estimator(logit, X_test, y_test)
plt.title('Logistic Regression ROC')
plt.show()


In [None]:

# XGBoost model
from xgboost import XGBClassifier
from sklearn.base import clone
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

# Pipeline.fit fits its steps in place, so passing `pre` directly would make
# this pipeline share one mutable transformer object with every other
# pipeline built from it; refitting any of them would silently change the
# others. clone() gives this pipeline its own unfitted copy with the same
# configuration.
xgb = Pipeline([
    ('pre', clone(pre)),
    ('clf', XGBClassifier(n_estimators=300, max_depth=4, learning_rate=0.1, subsample=0.9, colsample_bytree=0.8, eval_metric='logloss', n_jobs=4))
])

xgb.fit(X_train, y_train)
# Hard predictions feed the classification report; the positive-class
# probability (column 1) feeds ROC AUC.
y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_proba))

ConfusionMatrixDisplay.from_estimator(xgb, X_test, y_test)
plt.title('XGBoost Confusion Matrix')
plt.show()

RocCurveDisplay.from_estimator(xgb, X_test, y_test)
plt.title('XGBoost ROC')
plt.show()


In [None]:

# LightGBM model
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

# Two details in the pipeline below:
# * clone(pre): Pipeline.fit fits its steps in place, so each pipeline gets
#   its own copy of the preprocessor instead of aliasing one shared object.
# * subsample_freq=1: LightGBM only performs row subsampling (bagging) when
#   the bagging frequency is non-zero. With the default of 0 the
#   subsample=0.9 setting is ignored, so it is enabled for every iteration.
lgbm = Pipeline([
    ('pre', clone(pre)),
    ('clf', LGBMClassifier(n_estimators=400, learning_rate=0.05, num_leaves=31, subsample=0.9, subsample_freq=1, colsample_bytree=0.8, random_state=42))
])

lgbm.fit(X_train, y_train)
# Hard predictions feed the classification report; the positive-class
# probability (column 1) feeds ROC AUC.
y_pred = lgbm.predict(X_test)
y_proba = lgbm.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_proba))

ConfusionMatrixDisplay.from_estimator(lgbm, X_test, y_test)
plt.title('LightGBM Confusion Matrix')
plt.show()

RocCurveDisplay.from_estimator(lgbm, X_test, y_test)
plt.title('LightGBM ROC')
plt.show()


In [None]:

# Save best model (choose by ROC AUC here) and feature list for app
import joblib
from sklearn.metrics import roc_auc_score

# Score each fitted pipeline on the held-out split using the positive-class
# probability, keeping (name, auc) pairs in the same order as `models`.
# NOTE(review): the winner is chosen on the same test split that is used for
# reporting, so the printed best AUC is optimistically biased — consider a
# separate validation split or cross-validation for model selection.
models = [('logit', logit), ('xgb', xgb), ('lgbm', lgbm)]
aucs = [
    (label, roc_auc_score(y_test, estimator.predict_proba(X_test)[:, 1]))
    for label, estimator in models
]

# max() returns the first entry with the highest AUC, so ties resolve to the
# model listed earliest.
best_name, best_auc = max(aucs, key=lambda pair: pair[1])
print(best_name)
print(best_auc)

# Persist the winning pipeline (preprocessing + classifier) as one artifact.
best_model = next(estimator for label, estimator in models if label == best_name)
joblib.dump(best_model, 'best_model.joblib')

# Save columns meta
# The app gets the categorical / numeric column lists the pipeline was
# trained on.
meta = dict(cat_cols=cat_cols, num_cols=num_cols)
joblib.dump(meta, 'model_meta.joblib')
