# Tabular Prototype

By Monday 6/30, make an attempt at formulating and "solving" your proposed problem.


### Problem Formulation

* Remove unneeded columns, for example:
    * duplicated
    * categorical features that were turned into one-hot.
    * features that identify specific rows, like ID number.
    * make sure your target is properly encoded also.
* Split training sample into train, validation, and test sub-samples.

In [None]:
# Drop unnecessary columns: 'Unnamed: 0' and 'id' only identify rows
# (presumably a saved index and a passenger id -- confirm against the raw CSV).
train.drop(['Unnamed: 0', 'id'], axis=1, inplace=True)
test.drop(['Unnamed: 0', 'id'], axis=1, inplace=True)

# Encode the target: satisfied = 1 | neutral or dissatisfied = 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train['satisfaction']: that chained in-place
# form raises a FutureWarning on recent pandas and no longer updates the
# DataFrame once Copy-on-Write is enabled (the default from pandas 3.0).
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}

# The explicit integer cast keeps the target numeric regardless of how the
# installed pandas version handles dtype downcasting in replace(), and fails
# loudly if a label outside the mapping is still present.
train['satisfaction'] = train['satisfaction'].replace(satisfaction_codes).astype(int)
test['satisfaction'] = test['satisfaction'].replace(satisfaction_codes).astype(int)

In [None]:
# Outlier check for the numerical variables: count them with the 1.5 * IQR
# rule, then show one box plot per variable.
import matplotlib.pyplot as plt
import seaborn as sns

numerical_columns = ['Age', 'Flight Distance','Arrival Delay in Minutes', 'Departure Delay in Minutes']


def _iqr_outlier_count(values):
    """Return how many entries of ``values`` fall outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    return ((values < q1 - whisker) | (values > q3 + whisker)).sum()


# Columns that are not present in train are skipped rather than raising.
outliers_summary = [
    [name, _iqr_outlier_count(train[name])]
    for name in numerical_columns
    if name in train.columns
]

outliers_table = pd.DataFrame(outliers_summary, columns=['Feature', 'Outliers Count'])
print(outliers_table)

# 2x2 grid of box plots; ax.flat walks the grid row by row, so the plots
# appear in the same order as numerical_columns.
fig, ax = plt.subplots(2, 2, figsize=(10, 8))
for axis, name in zip(ax.flat, numerical_columns):
    sns.boxplot(data=train, x=name, ax=axis)

plt.tight_layout()
plt.show()

In [None]:
# Feature engineering, applied identically to train and test:
# 'Delay Difference' = arrival delay minus departure delay.
for frame in (train, test):
    frame['Delay Difference'] = frame['Arrival Delay in Minutes'] - frame['Departure Delay in Minutes']

# Keep the target as the last column of train, after the new feature.
cols = train.columns.tolist()
cols.remove('satisfaction')
new_order = [*cols, 'satisfaction']
train = train[new_order]

# Drop highly correlated features to avoid multi-collinearity.
# 'Arrival Delay in Minutes' is only removed here, after it has been used
# to build 'Delay Difference' above.
correlated_features = ['Inflight wifi service', 'Arrival Delay in Minutes']
train.drop(columns=correlated_features, inplace=True)
test.drop(columns=correlated_features, inplace=True)

print(train.columns)

In [None]:
# Standardize the numerical variables with StandardScaler and append the
# results as new 'Scaled <name>' columns next to the originals.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Every numeric column of train except the target is scaled.
numeric_predictors = train.drop(columns=['satisfaction']).select_dtypes(include='number')
features_to_scale = numeric_predictors.columns.tolist()
scaled_columns = ['Scaled ' + col for col in features_to_scale]

# The scaler is fitted on train only; test is transformed with the
# statistics learned from train.
scaled_features_train = pd.DataFrame(
    scaler.fit_transform(train[features_to_scale]),
    columns=scaled_columns,
    index=train.index,
)
scaled_features_test_df = pd.DataFrame(
    scaler.transform(test[features_to_scale]),
    columns=scaled_columns,
    index=test.index,
)

# Original columns are kept; the scaled copies are added side by side.
train = pd.concat([train, scaled_features_train], axis=1)
test = pd.concat([test, scaled_features_test_df], axis=1)

# Show every column when previewing the first rows.
pd.set_option('display.max_columns', None)
for heading, frame in (("Train sample:", train), ("\nTest sample:", test)):
    print(heading)
    print(frame.head())

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# One-hot encode the categorical columns: every object/category column of train.
categorical_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()

# The encoder learns its categories from train only.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(train[categorical_cols])
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_cols)


def _one_hot_frame(frame):
    """Return the dense one-hot columns for ``frame``, aligned to its index."""
    dense = one_hot_encoder.transform(frame[categorical_cols]).toarray()
    return pd.DataFrame(dense, columns=encoded_feature_names, index=frame.index)


encoded_df = _one_hot_frame(train)
encoded_test_df = _one_hot_frame(test)

# Swap the original categorical columns for their encoded counterparts.
train_encoded = pd.concat([train.drop(columns=categorical_cols), encoded_df], axis=1)
test_encoded = pd.concat([test.drop(columns=categorical_cols), encoded_test_df], axis=1)

print(train_encoded.head())
print(test_encoded.head())

### Train ML Algorithm

* You only need one algorithm for now. You can do more if you like.
* For now, focus on making it work, rather than best result.
* Try to get a non-trivial result.

### **KNN and Logistic Regression**

In [None]:
# Tune and fit Logistic Regression and KNN with a grid search
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# These two models are trained on the 'Scaled ...' numeric columns plus the
# one-hot encoded categorical columns.
scaled_cols = [col for col in train_encoded.columns if col.startswith('Scaled ')]
categorical_encoded_cols = one_hot_encoder.get_feature_names_out(categorical_cols).tolist()
model_features = scaled_cols + categorical_encoded_cols

# Features and target from train, and from the external test set
X, y = train_encoded[model_features], train_encoded['satisfaction']
X_test_1, y_test_1 = test_encoded[model_features], test_encoded['satisfaction']

# Hold out 20% of train; X_test / y_test are this held-out split, while
# X_test_1 / y_test_1 are the separate external test file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Hyperparameter grids
lr_params = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],
    max_iter=[200],
)
knn_params = dict(
    n_neighbors=[3, 5, 7, 10],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Manhattan and Euclidean
)

# Base estimators
lr = LogisticRegression()
knn = KNeighborsClassifier()

# 3-fold grid search on the training split, using all available cores
search_options = dict(cv=3, n_jobs=-1, verbose=1)
lr_grid = GridSearchCV(lr, lr_params, **search_options).fit(X_train, y_train)
knn_grid = GridSearchCV(knn, knn_params, **search_options).fit(X_train, y_train)

# Best estimator found for each model
lr_best, knn_best = lr_grid.best_estimator_, knn_grid.best_estimator_

# Predictions on the held-out split
y_pred_lr = lr_best.predict(X_test)
y_pred_knn = knn_best.predict(X_test)

# Predictions on the external test set
y_test_pred_lr = lr_best.predict(X_test_1)
y_test_pred_knn = knn_best.predict(X_test_1)

### **XGBoost and Random Forest**

In [None]:
# Tune and fit two more models (Random Forest, XGBoost) with a grid search
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Rating features
rating_features = [
    'Departure/Arrival time convenient',
    'Ease of Online booking', 'Gate location', 'Food and drink',
    'Online boarding', 'Seat comfort', 'Inflight entertainment',
    'On-board service', 'Leg room service', 'Baggage handling',
    'Checkin service', 'Inflight service', 'Cleanliness'
]

# These two models are trained on the original (unscaled) numeric columns
# plus the one-hot encoded categorical columns.
original_num_cols = [*rating_features, 'Age', 'Flight Distance', 'Departure Delay in Minutes', 'Delay Difference']
categorical_encoded_cols = one_hot_encoder.get_feature_names_out(categorical_cols).tolist()
model_features = original_num_cols + categorical_encoded_cols

# Features and target from train, and from the external test set
X, y = train_encoded[model_features], train_encoded['satisfaction']
X_test_1, y_test_1 = test_encoded[model_features], test_encoded['satisfaction']

# Hold out 20% of train; X_test / y_test are this held-out split, while
# X_test_1 / y_test_1 are the separate external test file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Hyperparameter grids
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
    bootstrap=[True],
)
xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.01, 0.1],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
)

# Base estimators, seeded for reproducibility
rf_classifier = RandomForestClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)

# 3-fold grid search on the training split, using all available cores
search_options = dict(cv=3, n_jobs=-1, verbose=1)
rf_grid = GridSearchCV(rf_classifier, rf_params, **search_options).fit(X_train, y_train)
xgb_grid = GridSearchCV(xgb_classifier, xgb_params, **search_options).fit(X_train, y_train)

# Best estimator found for each model
rf_best, xgb_best = rf_grid.best_estimator_, xgb_grid.best_estimator_

# Predictions on the held-out split
y_pred_rf = rf_best.predict(X_test)
y_pred_xgb = xgb_best.predict(X_test)

# Predictions on the external test set
y_test_pred_rf = rf_best.predict(X_test_1)
y_test_pred_xgb = xgb_best.predict(X_test_1)

### Evaluate Performance on Validation Sample

* Compute the usual metric for your ML task.
* Compute the score for the kaggle challenge.

### **KNN and Logistic Regression**

In [None]:
# Report the tuned hyperparameters and the classification metrics of
# Logistic Regression and KNN on the held-out part of train.
# NOTE(review): y_test is a notebook global that each training cell rebinds.
# Both training cells split the same target with test_size=0.20 and
# random_state=42, so the labels presumably line up with these predictions --
# confirm if either split is ever changed.
validation_results = (
    ("Logistic Regression", lr_grid, y_pred_lr),
    ("KNN", knn_grid, y_pred_knn),
)
for model_name, grid, predictions in validation_results:
    print(f"Best {model_name} Hyperparameters:", grid.best_params_)
    print(f"Classification Report - {model_name}")
    print(classification_report(y_test, predictions))

In [None]:
# Classification metrics of Logistic Regression and KNN on the external test set
external_results = (
    ("\nClassification Report - Logistic Regression (External Test Set)", y_test_pred_lr),
    ("Classification Report - KNN (External Test Set)", y_test_pred_knn),
)
for heading, predictions in external_results:
    print(heading)
    print(classification_report(y_test_1, predictions))

### **XGBoost and Random Forest**

In [None]:
# Report the tuned hyperparameters and the classification metrics of
# XGBoost and Random Forest on the held-out part of train.
validation_results = (
    ("XGBoost", xgb_grid, y_pred_xgb),
    ("Random Forest", rf_grid, y_pred_rf),
)
for model_name, grid, predictions in validation_results:
    print(f"Best {model_name} Hyperparameters:", grid.best_params_)
    print(f"Classification Report - {model_name}")
    print(classification_report(y_test, predictions))

In [None]:
# Classification metrics of XGBoost and Random Forest on the external test set
external_results = (
    ("Classification Report - XGBoost (External Test Set)", y_test_pred_xgb),
    ("Classification Report - Random Forest (External Test Set)", y_test_pred_rf),
)
for heading, predictions in external_results:
    print(heading)
    print(classification_report(y_test_1, predictions))