In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import primary_cleaning

In [3]:
import features

In [4]:
import importlib

In [5]:
importlib.reload(primary_cleaning)

<module 'primary_cleaning' from 'C:\\Users\\ggowt\\ML_CC_Project-1\\primary_cleaning.py'>

In [6]:
importlib.reload(features)

<module 'features' from 'C:\\Users\\ggowt\\ML_CC_Project-1\\features.py'>

# Creating X_train and Y_train by pre-built functions

In [7]:
train_df = pd.read_csv("tour_logs_train.csv")

In [8]:
df = train_df.copy()

In [9]:
df = primary_cleaning.cleaning_data(df)
df = features.features_preparation(df)

In [10]:
df

Unnamed: 0,Day_of_Week,Volume_Level,Ticket_Price,Crowd_Size,Opener_Rating,Crowd_Energy,Venue_ID_V_Beta,Venue_ID_V_Delta,Venue_ID_V_Gamma,Weather_Cloudy,Weather_Rainy,Weather_Stormy,Show_DateTime_Evening,Show_DateTime_Late Night,Show_DateTime_Morning
0,3,5,52,464,2,64,0,0,0,0,1,0,1,0,0
1,3,2,66,388,5,66,1,0,0,0,0,0,0,1,0
2,5,1,51,679,1,42,0,1,0,0,1,0,0,0,0
3,0,10,64,454,1,0,1,0,0,0,1,0,0,0,1
4,4,8,47,654,5,67,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6,5,57,699,5,40,0,0,0,0,0,1,1,0,0
1996,3,7,26,588,5,65,1,0,0,0,0,1,0,1,0
1997,4,2,57,649,1,60,1,0,0,0,1,0,0,1,0
1998,1,9,53,503,1,150,1,0,0,0,1,0,0,1,0


In [11]:
df = df[(df["Crowd_Energy"] > 0) & (df["Crowd_Energy"]  <= 100) ]

In [12]:
Y_train = df["Crowd_Energy"]

In [13]:
Y_train

0       64
1       66
2       42
4       67
5       67
        ..
1994    61
1995    40
1996    65
1997    60
1999    60
Name: Crowd_Energy, Length: 1879, dtype: int64

In [14]:
X_train = df.drop(columns=["Crowd_Energy"])

In [15]:
X_train

Unnamed: 0,Day_of_Week,Volume_Level,Ticket_Price,Crowd_Size,Opener_Rating,Venue_ID_V_Beta,Venue_ID_V_Delta,Venue_ID_V_Gamma,Weather_Cloudy,Weather_Rainy,Weather_Stormy,Show_DateTime_Evening,Show_DateTime_Late Night,Show_DateTime_Morning
0,3,5,52,464,2,0,0,0,0,1,0,1,0,0
1,3,2,66,388,5,1,0,0,0,0,0,0,1,0
2,5,1,51,679,1,0,1,0,0,1,0,0,0,0
4,4,8,47,654,5,0,1,0,1,0,0,1,0,0
5,2,3,69,320,4,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,6,4,62,432,4,1,0,0,1,0,0,1,0,0
1995,6,5,57,699,5,0,0,0,0,0,1,1,0,0
1996,3,7,26,588,5,1,0,0,0,0,1,0,1,0
1997,4,2,57,649,1,1,0,0,0,1,0,0,1,0


## Choosing GBRegressor as model
### Justification
Gradient Boosting Regressor was selected as the final model because it  achieved the lowest RMSE among the evaluated models. Baseline linear regression produced higher error, indicating that the relationship between input features and crowd energy is non-linear. Gradient Boosting is well-suited for such tabular data as it captures complex feature by combining multiple decision trees

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

In [17]:
scores = cross_val_score(model,X_train,Y_train,scoring="neg_root_mean_squared_error",cv=10)

rmse = -scores.mean()
print("RMSE:", rmse)

RMSE: 13.364052077573508


## Hyperparameter tuning using GRIDSearchCV

In [18]:
gbr = GradientBoostingRegressor(random_state=42)

In [19]:
p_g = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4]
}

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
gds_cv = GridSearchCV(gbr,p_g,scoring="neg_root_mean_squared_error",cv=10,n_jobs=-1)

In [22]:
gds_cv.fit(X_train, Y_train)

0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_grid,"{'learning_rate': [0.01, 0.05, ...], 'max_depth': [2, 3, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [23]:
best_rmse = -gds_cv.best_score_
best_params = gds_cv.best_params_

print("Best CV RMSE:", best_rmse)
print("Best Parameters:", best_params)

Best CV RMSE: 13.312938652454733
Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}


# Predicting Test data

In [24]:
test_df = pd.read_csv("tour_logs_test_input.csv")

In [25]:
test_df.count()

Gig_ID                   500
Venue_ID                 500
Show_DateTime            500
Day_of_Week              500
Volume_Level             458
Ticket_Price             500
Crowd_Size               489
Opener_Rating            500
Weather                  500
Moon_Phase               500
Band_Outfit              500
Merch_Sales_Post_Show    500
dtype: int64

### Cleaning test  data

In [26]:
X_test = test_df.copy()

In [27]:
X_test = primary_cleaning.cleaning_data(X_test)
X_test = features.features_preparation(X_test)

In [28]:
X_test.count()

Day_of_Week                 500
Volume_Level                500
Ticket_Price                500
Crowd_Size                  500
Opener_Rating               500
Venue_ID_V_Beta             500
Venue_ID_V_Delta            500
Venue_ID_V_Gamma            500
Weather_Cloudy              500
Weather_Rainy               500
Weather_Stormy              500
Show_DateTime_Evening       500
Show_DateTime_Late Night    500
Show_DateTime_Morning       500
dtype: int64

### Training model and predicting

In [29]:
model.fit(X_train, Y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [30]:
test_predicts = model.predict(X_test)

In [37]:
test_predicts = test_predicts.round(2)

In [38]:
submission_df = pd.DataFrame({
    "Gig_ID": test_df["Gig_ID"],
    "Crowd_Energy": test_predicts
})

In [39]:
submission_df.to_csv("predictions.csv", index = False)