In [56]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score  
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from pprint import pprint


In [45]:
# Load datasets
df = pd.read_csv("Surveydata_train_(2).csv")
dftest = pd.read_csv("Surveydata_test_(2).csv")
dftravel = pd.read_csv('Traveldata_train_(2).csv')
dftraveltest = pd.read_csv('Traveldata_test_(2).csv')

In [46]:
train_data = pd.merge(df, dftravel, on='ID', how='inner')
test_data = pd.merge(dftest, dftraveltest, on='ID', how='inner')

In [26]:
train_data.isnull().sum()

ID                            0
Overall_Experience            0
Seat_Comfort                 61
Seat_Class                    0
Arrival_Time_Convenient    8930
Catering                   8741
Platform_Location            30
Onboard_Wifi_Service         30
Onboard_Entertainment        18
Online_Support               91
Ease_of_Online_Booking       73
Onboard_Service            7601
Legroom                      90
Baggage_Handling            142
CheckIn_Service              77
Cleanliness                   6
Online_Boarding               6
Gender                       77
Customer_Type              8951
Age                          33
Type_Travel                9226
Travel_Class                  0
Travel_Distance               0
Departure_Delay_in_Mins      57
Arrival_Delay_in_Mins       357
dtype: int64

In [28]:
train_data.loc[train_data['Seat_Comfort'].isnull(), 'Seat_Comfort'] = train_data['Seat_Comfort'].mode()[0]
train_data.loc[train_data['Arrival_Time_Convenient'].isnull(), 'Arrival_Time_Convenient'] = train_data['Arrival_Time_Convenient'].mode()[0]
train_data.loc[train_data['Catering'].isnull(), 'Catering'] = train_data['Catering'].mode()[0]
train_data.loc[train_data['Platform_Location'].isnull(), 'Platform_Location'] = train_data['Platform_Location'].mode()[0]
train_data.loc[train_data['Onboard_Wifi_Service'].isnull(), 'Onboard_Wifi_Service'] = train_data['Onboard_Wifi_Service'].mode()[0]
train_data.loc[train_data['Onboard_Entertainment'].isnull(), 'Onboard_Entertainment'] = train_data['Onboard_Entertainment'].mode()[0]
train_data.loc[train_data['Online_Support'].isnull(), 'Online_Support'] = train_data['Online_Support'].mode()[0]
train_data.loc[train_data['Ease_of_Online_Booking'].isnull(), 'Ease_of_Online_Booking'] = train_data['Ease_of_Online_Booking'].mode()[0]
train_data.loc[train_data['Onboard_Service'].isnull(), 'Onboard_Service'] = train_data['Onboard_Service'].mode()[0]
train_data.loc[train_data['Legroom'].isnull(), 'Legroom'] = train_data['Legroom'].mode()[0]
train_data.loc[train_data['Baggage_Handling'].isnull(), 'Baggage_Handling'] = train_data['Baggage_Handling'].mode()[0]
train_data.loc[train_data['CheckIn_Service'].isnull(), 'CheckIn_Service'] = train_data['CheckIn_Service'].mode()[0]
train_data.loc[train_data['Cleanliness'].isnull(), 'Cleanliness'] = train_data['Cleanliness'].mode()[0]
train_data.loc[train_data['Online_Boarding'].isnull(), 'Online_Boarding'] = train_data['Online_Boarding'].mode()[0]
train_data.loc[train_data['Gender'].isnull(), 'Gender'] = train_data['Gender'].mode()[0]
train_data.loc[train_data['Customer_Type'].isnull(), 'Customer_Type'] = train_data['Customer_Type'].mode()[0]
train_data.loc[train_data['Age'].isnull(), 'Age'] = train_data['Age'].mode()[0]
train_data.loc[train_data['Type_Travel'].isnull(), 'Type_Travel'] = train_data['Type_Travel'].mode()[0]
train_data.loc[train_data['Departure_Delay_in_Mins'].isnull(), 'Departure_Delay_in_Mins'] = train_data['Departure_Delay_in_Mins'].mode()[0]
train_data.loc[train_data['Arrival_Delay_in_Mins'].isnull(), 'Arrival_Delay_in_Mins'] = train_data['Arrival_Delay_in_Mins'].mode()[0]

In [47]:
X_train = train_data.drop(columns=["Overall_Experience"])
y_train = train_data["Overall_Experience"]

test_ID = test_data["ID"]
X_test = test_data.drop(columns=["ID"]) 

combined_data = pd.concat([X_train, X_test], keys=["train", "test"])
combined_data = pd.get_dummies(combined_data)

X_train_encoded = combined_data.loc["train"]
X_test_encoded = combined_data.loc["test"]

X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [30]:
# Initialize and train the decision tree model
model = DecisionTreeClassifier()  # Or DecisionTreeRegressor for regression
model.fit(X_train_encoded, y_train)

# Make predictions on the encoded test set
test_predictions = model.predict(X_test_encoded)

# Save predictions to CSV with ID and predicted outcome
output_df = pd.DataFrame({"ID": test_ID, "Overall_Experience": test_predictions})
output_df.to_csv("predictions.csv", index=False)


In [48]:
model = RandomForestClassifier(n_estimators=1500, random_state=42)  # Adjust n_estimators as needed
model.fit(X_train_encoded, y_train)

# Make predictions on the encoded test set
test_predictions = model.predict(X_test_encoded)

# Save predictions to CSV with ID and predicted outcome
output_df = pd.DataFrame({"ID": test_ID, "Overall_Experience": test_predictions})
output_df.to_csv("predictions_rf.csv", index=False)

In [50]:
print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 1500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [None]:
n_estimators = [int(x) for x in np.linspace(start=1000, stop=2500, num=16)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

pprint(random_grid)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)  # Use RandomForestRegressor if the target variable is continuous

# Random search of parameters, using 3-fold cross-validation
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the random search model
rf_random.fit(X_train_encoded, y_train)

# Make predictions on the encoded test set using the best estimator
test_predictions = rf_random.predict(X_test_encoded)

# Save predictions to CSV with ID and predicted outcome
output_df = pd.DataFrame({"ID": test_ID, "Overall_Experience": test_predictions})
output_df.to_csv("predictions_rf_random_search.csv", index=False)

# Print best parameters and accuracy score on training set
print("Best Parameters:", rf_random.best_params_)
train_predictions = rf_random.predict(X_train_encoded)
print("Training Accuracy Score:", accuracy_score(y_train, train_predictions))

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [1000,
                  1100,
                  1200,
                  1300,
                  1400,
                  1500,
                  1600,
                  1700,
                  1800,
                  1900,
                  2000,
                  2100,
                  2200,
                  2300,
                  2400,
                  2500]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
