In [1]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

import datetime

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier # Model building
import lightgbm as lgb # Feature selection

In [3]:
df_survey = pd.read_csv('/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/Surveydata_train.csv')
df_travel = pd.read_csv('/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/Traveldata_train.csv')

df_train = pd.merge(df_survey, df_travel, on='ID')

In [4]:
df_survey_test = pd.read_csv('/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/Surveydata_test.csv')
df_travel_test = pd.read_csv('/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/Traveldata_test.csv')

df_test = pd.merge(df_survey_test, df_travel_test, on='ID')

In [5]:
df_train = df_train.drop('ID', axis=1)
df_test = df_test.drop('ID', axis=1)

In [6]:
# Mapping categorical columns to numerical
mapping_quality = {'Extremely Poor': 1, 'Poor': 2, 'Needs Improvement': 3, 'Acceptable': 4, 'Good': 5, 'Excellent': 6}
mapping_platform = {'Very Inconvenient': 1, 'Inconvenient': 2, 'Needs Improvement': 3, 'Manageable': 4, 'Convenient': 5, 'Very Convenient': 6}

quality_cols = ['Seat_Comfort','Arrival_Time_Convenient','Catering','Onboard_Wifi_Service','Onboard_Entertainment','Online_Support','Ease_of_Online_Booking','Onboard_Service','Legroom','Baggage_Handling','CheckIn_Service','Cleanliness','Online_Boarding']

# Re-load the original dataframes to ensure the latest data is used
for col in quality_cols:
    df_train[col] = df_train[col].map(mapping_quality)
    df_test[col] = df_test[col].map(mapping_quality)

df_train['Platform_Location'] = df_train['Platform_Location'].map(mapping_platform)
df_test['Platform_Location'] = df_test['Platform_Location'].map(mapping_platform)

In [7]:
# Identify categorical columns (excluding those already mapped)
categorical_cols_train = df_train.select_dtypes(include='object').columns
categorical_cols_test = df_test.select_dtypes(include='object').columns

# Impute NaN values with 'Missing' in categorical columns
for col in categorical_cols_train:
    df_train[col].fillna('Missing', inplace=True)

for col in categorical_cols_test:
    df_test[col].fillna('Missing', inplace=True)

In [8]:
# Create X_train, y_train from df_train
X_train = df_train.drop('Overall_Experience', axis=1)
y_train = df_train['Overall_Experience']

In [9]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Seat_Comfort,94318.0,3.839182,1.392526,1.0,3.0,4.0,5.0,6.0
Arrival_Time_Convenient,85449.0,3.994991,1.52628,1.0,3.0,4.0,5.0,6.0
Catering,85638.0,3.853511,1.443945,1.0,3.0,4.0,5.0,6.0
Platform_Location,94349.0,3.990864,1.308233,1.0,3.0,4.0,5.0,6.0
Onboard_Wifi_Service,94349.0,4.248227,1.31952,1.0,3.0,4.0,5.0,6.0
Onboard_Entertainment,94361.0,4.38251,1.34619,1.0,3.0,5.0,5.0,6.0
Online_Support,94288.0,4.51925,1.308174,1.0,4.0,5.0,6.0,6.0
Ease_of_Online_Booking,94306.0,4.470108,1.305546,1.0,3.0,5.0,6.0,6.0
Onboard_Service,86778.0,4.470799,1.268574,1.0,4.0,5.0,5.0,6.0
Legroom,94289.0,4.482994,1.29226,1.0,3.0,5.0,6.0,6.0


In [10]:
# One hot encoding for categorical columns
X_train = pd.get_dummies(X_train, columns=categorical_cols_train)
df_test = pd.get_dummies(df_test, columns=categorical_cols_test)

In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Seat_Comfort                     94318 non-null  float64
 1   Arrival_Time_Convenient          85449 non-null  float64
 2   Catering                         85638 non-null  float64
 3   Platform_Location                94349 non-null  float64
 4   Onboard_Wifi_Service             94349 non-null  float64
 5   Onboard_Entertainment            94361 non-null  float64
 6   Online_Support                   94288 non-null  float64
 7   Ease_of_Online_Booking           94306 non-null  float64
 8   Onboard_Service                  86778 non-null  float64
 9   Legroom                          94289 non-null  float64
 10  Baggage_Handling                 94237 non-null  float64
 11  CheckIn_Service                  94302 non-null  float64
 12  Cleanliness       

# LGB Feature Selection

In [12]:
lgb_features = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_features.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 51593, number of negative: 42786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 976
[LightGBM] [Info] Number of data points in the train set: 94379, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546658 -> initscore=0.187175
[LightGBM] [Info] Start training from score 0.187175


In [13]:
# Get feature importances
feature_imp = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': lgb_features.feature_importances_
}).sort_values(by='Importance', ascending=False)

feature_imp

Unnamed: 0,Feature,Importance
0,Seat_Comfort,432
7,Ease_of_Online_Booking,178
5,Onboard_Entertainment,171
3,Platform_Location,163
10,Baggage_Handling,157
14,Age,144
12,Cleanliness,139
6,Online_Support,138
1,Arrival_Time_Convenient,133
11,CheckIn_Service,124


In [14]:
features_to_consider = feature_imp.head(26)["Feature"].tolist()

# Filter top N features (e.g., top 15 contributing features)
top_features = feature_imp.head(20)["Feature"].tolist()
other_features = features_to_consider[:13] + features_to_consider[20:]

X_train_top = X_train[top_features]
X_test_top = df_test[top_features]
X_train_support = X_train[other_features]
X_test_support = df_test[other_features]

In [15]:
X_train_top.head()

Unnamed: 0,Seat_Comfort,Ease_of_Online_Booking,Onboard_Entertainment,Platform_Location,Baggage_Handling,Age,Cleanliness,Online_Support,Arrival_Time_Convenient,CheckIn_Service,Travel_Class_Business,Type_Travel_Personal Travel,Travel_Distance,Legroom,Catering,Customer_Type_Disloyal Customer,Onboard_Service,Arrival_Delay_in_Mins,Online_Boarding,Type_Travel_Business Travel
0,3.0,3.0,3.0,6.0,3.0,52.0,3.0,4.0,6.0,5.0,True,False,272,4.0,6.0,False,3.0,5.0,2.0,False
1,2.0,5.0,2.0,3.0,2.0,48.0,5.0,5.0,6.0,3.0,False,True,2200,3.0,2.0,False,6.0,0.0,5.0,False
2,3.0,6.0,5.0,3.0,6.0,43.0,6.0,6.0,3.0,5.0,True,False,1061,6.0,3.0,False,6.0,119.0,6.0,True
3,4.0,4.0,3.0,3.0,4.0,44.0,4.0,4.0,3.0,5.0,True,False,780,4.0,,False,4.0,18.0,4.0,True
4,4.0,5.0,5.0,4.0,5.0,50.0,5.0,6.0,4.0,5.0,True,False,1981,5.0,4.0,False,5.0,0.0,5.0,True


In [16]:
X_train_support.head()

Unnamed: 0,Seat_Comfort,Ease_of_Online_Booking,Onboard_Entertainment,Platform_Location,Baggage_Handling,Age,Cleanliness,Online_Support,Arrival_Time_Convenient,CheckIn_Service,Travel_Class_Business,Type_Travel_Personal Travel,Travel_Distance,Customer_Type_Loyal Customer,Gender_Female,Onboard_Wifi_Service,Departure_Delay_in_Mins,Gender_Male,Type_Travel_Missing
0,3.0,3.0,3.0,6.0,3.0,52.0,3.0,4.0,6.0,5.0,True,False,272,True,True,5.0,0.0,False,True
1,2.0,5.0,2.0,3.0,2.0,48.0,5.0,5.0,6.0,3.0,False,True,2200,True,False,5.0,9.0,True,False
2,3.0,6.0,5.0,3.0,6.0,43.0,6.0,6.0,3.0,5.0,True,False,1061,True,True,3.0,77.0,False,False
3,4.0,4.0,3.0,3.0,4.0,44.0,4.0,4.0,3.0,5.0,True,False,780,True,True,4.0,13.0,False,False
4,4.0,5.0,5.0,4.0,5.0,50.0,5.0,6.0,4.0,5.0,True,False,1981,True,True,3.0,0.0,False,False


## XGB model

In [17]:
# Define the parameter distribution for XGBoost hyperparameter tuning
# Using distributions instead of a grid for RandomizedSearchCV
xgb_param_dist_support = {
    'n_estimators': [200, 250],  # Focus on reducing noise
    'max_depth': [6, 7],
    'learning_rate': [0.08, 0.1, 0.12],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.85, 0.9, 0.95],
    'gamma': [0.05, 0.08],
    'min_child_weight': [2, 3, 4]
}

# Create an XGBoost Classifier model with GPU support
xgb_model_support = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='gpu_hist')

# Perform RandomizedSearchCV for hyperparameter tuning with XGBoost
# n_iter controls the number of random combinations to try
xgb_random_search_support = RandomizedSearchCV(estimator=xgb_model_support, param_distributions=xgb_param_dist_support, n_iter=25, cv=3, n_jobs=-1, scoring='accuracy', random_state=42)
xgb_random_search_support.fit(X_train_support, y_train)

In [18]:
# Get the best parameters and the best XGBoost model
best_xgb_params_support = xgb_random_search_support.best_params_
best_xgb_model_support = xgb_random_search_support.best_estimator_

print("Best XGBoost Hyperparameters:", best_xgb_params_support)
#print("Best XGBoost model:", best_xgb_model_support)

Best XGBoost Hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 2, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.05, 'colsample_bytree': 0.95}


In [19]:

# Define the parameter distribution for XGBoost hyperparameter tuning
# Using distributions instead of a grid for RandomizedSearchCV
xgb_param_dist_top = {
    'n_estimators': [200, 250],  # center around best
    'max_depth': [10, 11, 12],
    'learning_rate': [0.08, 0.1, 0.12],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.85, 0.9, 0.95],
    'gamma': [0.05, 0.1, 0.15],
    'min_child_weight': [2, 3, 4]
}

# Create an XGBoost Classifier model with GPU support
xgb_model_top = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='gpu_hist')

# Perform RandomizedSearchCV for hyperparameter tuning with XGBoost
# n_iter controls the number of random combinations to try
xgb_random_search_top = RandomizedSearchCV(estimator=xgb_model_top, param_distributions=xgb_param_dist_top, n_iter=50, cv=3, n_jobs=-1, scoring='accuracy', random_state=42)
xgb_random_search_top.fit(X_train_top, y_train)

In [20]:
# Get the best parameters and the best XGBoost model
best_xgb_params_top = xgb_random_search_top.best_params_
best_xgb_model_top = xgb_random_search_top.best_estimator_

print("Best XGBoost Hyperparameters:", best_xgb_params_top)
#print("Best XGBoost model:", best_xgb_model_top)

Best XGBoost Hyperparameters: {'subsample': 0.9, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 12, 'learning_rate': 0.08, 'gamma': 0.15, 'colsample_bytree': 0.85}


In [21]:
# Function to predict and evaluate accuracy
def predict_and_evaluate(model, X_data, y_data=None):
    """
    Makes predictions using the provided model and evaluates accuracy if y_data is provided.

    Args:
        model: Trained machine learning model.
        X_data: Features to make predictions on.
        y_data: True labels for evaluation (optional).

    Returns:
        predictions: Array of predictions.
        accuracy: Accuracy score if y_data is provided, None otherwise.
    """
    predictions = model.predict(X_data)
    accuracy = None
    if y_data is not None:
        accuracy = accuracy_score(y_data, predictions)
    return predictions, accuracy


In [23]:
train_predictions_top = xgb_random_search_top.predict(X_train_top)
train_predictions_support = xgb_random_search_support.predict(X_train_support)

accuracy_top = accuracy_score(y_train, train_predictions_top)
accuracy_support = accuracy_score(y_train, train_predictions_support)

print(f"XGBoost Training Accuracy (top): {accuracy_top}")
print(f"XGBoost Training Accuracy (support): {accuracy_support}")

XGBoost Training Accuracy (top): 0.991904978861823
XGBoost Training Accuracy (support): 0.9666451223259411


In [44]:
prob1 = best_xgb_model_top.predict_proba(X_test_top)[:, 1]
prob2 = best_xgb_model_support.predict_proba(X_test_support)[:, 1]

In [60]:
# Weighted probability
final_probs = 0.65 * prob1 + 0.35 * prob2

# Convert to binary labels using threshold (0.5 here, adjustable)
final_labels = (final_probs >= 0.5).astype(int)

In [61]:
# Create submission dataframe
submission_df = pd.DataFrame({
    'ID': df_travel_test['ID'],
    'Overall_Experience': final_labels
})

submission_df.to_csv('xgb_pred.csv', index=False)

# Write to drive as well for backup
#timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
#submission_df.to_csv(f"/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/pred_{timestamp}.csv", index=False)

In [62]:
submission_df

Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1
...,...,...
35597,99935598,0
35598,99935599,1
35599,99935600,1
35600,99935601,1


In [None]:
# Example usage of the predict and evaluate function:
# Predict on training data
#train_predictions, train_accuracy = predict_and_evaluate(best_xgb_model, X_train, y_train)
#print(f"XGBoost Training Accuracy (using function): {train_accuracy}")

# To make predictions on the test data (df_test), first ensure it's preprocessed
# and columns are aligned with X_train as done previously.
# Example: Assuming X_test_aligned is already created
#X_test_aligned = df_test.reindex(columns=X_train.columns, fill_value=0)
#test_predictions, test_accuracy = predict_and_evaluate(best_xgb_model, X_test_aligned)

# Note: We can't calculate test accuracy here as y_test is not available.


In [None]:
# Create the final DataFrame with 'ID' and 'Overall_Experience'
#xgb_pred_df = pd.DataFrame({
#    'ID': df_travel_test['ID'],
#    'Overall_Experience': test_predictions
#})

#xgb_pred_df.to_csv('xgb_pred.csv', index=False)

#timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
#xgb_pred_df.to_csv(f"/content/drive/MyDrive/GreatLearning_DSBA/Hackathon_Bullet_Train/pred_{timestamp}.csv", index=False)

# Additional tasks

In [None]:
# Create a Random Forest Classifier model
#rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the training data
#rf_model.fit(X_train, y_train)

# Make predictions on the training data
#y_train_pred_rf = rf_model.predict(X_train)

# Evaluate the Random Forest model's accuracy on the training data
#accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
#print("Random Forest Training Accuracy:", accuracy_rf)

In [None]:
# Instantiate an XGBoost Classifier model with the best hyperparameters and GPU support
#best_xgb_model_selected = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='gpu_hist', **best_xgb_params)

# Train the XGBoost model on the selected features of the training data
#best_xgb_model_selected.fit(X_train_selected, y_train)