In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import numpy as np
# Read the training and test splits from CSV files in the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))


In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' and 'id' columns from both splits.
# errors='ignore' keeps the cell re-runnable: because `train`/`test` are
# reassigned here, a second execution would otherwise raise KeyError for
# columns that have already been dropped.
cols_to_drop = ['Unnamed: 0', 'id']
train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Names of the training columns that contain at least one missing value.
train_has_na = train.isna().any()
train_has_na[train_has_na].index.tolist()

In [None]:
# Names of the test columns that contain at least one missing value.
# The mask is computed from `test` itself; using the training mask here would
# only repeat the training result and could hide gaps unique to the test split.
test.columns[test.isna().any()].tolist()

In [None]:
def detect_outliers(column, threshold=3):
    """Report whether a numeric column contains any z-score outlier.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to check. Missing values are not removed here, so the
        caller should drop them first if they must not influence the result.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    bool
        True if at least one value has ``|z| > threshold``, otherwise False.
    """
    # `threshold` must be a plain scalar; a trailing comma after the value
    # would silently turn it into a one-element tuple.
    z_value = (column - column.mean()) / column.std()
    return (abs(z_value) > threshold).any()

In [None]:
# Check the arrival-delay column of each split for outliers, ignoring
# missing values so they do not affect the mean/std used by detect_outliers.
delay_columns = ['Arrival Delay in Minutes']


def _has_outliers(series):
    """Run detect_outliers on the non-missing values of one column."""
    return detect_outliers(series.dropna())


bool_train_outliers = train[delay_columns].apply(_has_outliers)
bool_test_outliers = test[delay_columns].apply(_has_outliers)
print(bool_train_outliers)
print(bool_test_outliers)

In [None]:
# Imputation using the median.
# The median is computed once from the training split and used to fill the
# gaps in BOTH splits: each frame keeps its own rows, and no statistic is
# derived from the test data.
arrival_delay_median = train['Arrival Delay in Minutes'].median()
train['Arrival Delay in Minutes'] = train['Arrival Delay in Minutes'].fillna(arrival_delay_median)
test['Arrival Delay in Minutes'] = test['Arrival Delay in Minutes'].fillna(arrival_delay_median)

In [None]:
# Column groups used by the preprocessing step: each group is routed to its
# own transformer (numeric, ordinal, nominal).
num_col = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

ord_col = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

nominal_col = [
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
]


In [None]:
# Encode the 'satisfaction' target as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['satisfaction'])
y_test = label_encoder.transform(test['satisfaction'])

In [None]:
# One transformer per column group.
ordinal_encoder = OrdinalEncoder()
nominal_encoder = OneHotEncoder(sparse_output=False, drop='first')
numeric_encoder = StandardScaler()

# Route each column group to its transformer. Columns not named in any group
# are left out of the result (ColumnTransformer's default remainder='drop').
column_steps = [
    ('numeric', numeric_encoder, num_col),
    ('ordinal', ordinal_encoder, ord_col),
    ('nominal', nominal_encoder, nominal_col),
]
preprocessor = ColumnTransformer(transformers=column_steps)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training frame and transform it in one pass.
train_matrix = pipeline.fit_transform(train)

# Output names look like '<transformer>__<column>'; keep only the part after
# the first '__' so the frame has readable column names.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
feature_names_adjusted = [name.split('__', 1)[-1] for name in feature_names]

X_train = pd.DataFrame(train_matrix, columns=feature_names_adjusted)
X_train.head()


In [None]:
# Apply the preprocessing that was already fitted on the training data.
# Only `transform` is used here: refitting on the test split would rescale it
# with its own statistics and could yield a different one-hot column layout
# than the one `feature_names_adjusted` describes.
X_test = pipeline.transform(test)
X_test = pd.DataFrame(X_test, columns=feature_names_adjusted)
X_test.head()

In [None]:
from sklearn.metrics import (
    roc_curve,
    roc_auc_score,
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Baseline XGBoost classifier trained on every preprocessed feature;
# random_state is fixed so repeated runs give the same model.
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

In [None]:
# Score the full-feature model on the test split. The report rows are
# labelled with the encoded class values found in y_test, shown as strings.
y_test_pred = xgb_classifier.predict(X_test)
class_labels = np.unique(y_test).astype(str)
model_report = classification_report(y_test, y_test_pred, target_names=class_labels)
print("Classification Report for XGBoost:\n", model_report)

In [None]:
# Rank the features of the fitted model from most to least important.
importances = xgb_classifier.feature_importances_
indices = np.argsort(importances)[::-1]

feature_names = X_train.columns
ranked_names = feature_names[indices]
ranked_scores = importances[indices]
df_importances = pd.DataFrame({
    'Feature': ranked_names,
    'Importance': ranked_scores,
})

# Bar chart of the ranked importances, one bar per feature.
n_features = X_train.shape[1]
bar_positions = list(range(n_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Feature importances for Xgboost model')
ax.bar(bar_positions, ranked_scores, align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
ax.set_xlim([-1, n_features])
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.show()

In [None]:
# Keep only the five highest-ranked feature columns in both splits.
top_5_indices = indices[:5]
X_train_top5, X_test_top5 = (
    features.iloc[:, top_5_indices] for features in (X_train, X_test)
)

In [None]:
# Train a second classifier on the five top-ranked features only
# (fit returns the estimator itself, so the call can be chained).
top_xgb_clf = xgb.XGBClassifier(random_state=42).fit(X_train_top5, y_train)

# Predictions for both splits; only the test predictions are reported below.
y_train_pred_top_xgb, y_test_pred_top_xgb = (
    top_xgb_clf.predict(features) for features in (X_train_top5, X_test_top5)
)
top_report = classification_report(y_test, y_test_pred_top_xgb)
print("Classification Report for  XGBoost with top predictors:\n", top_report)

In [None]:
import pickle

# Persist the full-feature classifier to 'xgb.pkl'. Only the classifier is
# written; the fitted preprocessing `pipeline` is not part of this file.
with open('xgb.pkl', mode='wb') as model_file:
    pickle.dump(xgb_classifier, model_file)