In [52]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
# Location of the Pollstar CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/Pollstar_all_genres.csv'
# Load the full dataset; later cells add and overwrite columns on this frame.
df = pd.read_csv(file_path)

## Fill the null values in State using Geopy:

In [54]:
from geopy.geocoders import Nominatim
def get_state(city, geolocator=None):
  """Look up the state for a city name via a geocoder.

  The state is taken as the third comma-separated component of the result's
  ``display_name`` (for a display name laid out as "City, County, State,
  Country" that is the state). This is a positional heuristic: a display
  name laid out differently yields whatever sits in third position.

  Args:
    city: City name to geocode. ``None``, NaN and blank strings are treated
      as missing and are not sent to the geocoder.
    geolocator: Optional object exposing ``geocode(query)``. When omitted, a
      new ``Nominatim(user_agent="ARND")`` client is created for this call;
      pass one in to reuse a single client across many lookups.

  Returns:
    The stripped third component of ``display_name``, or ``None`` when the
    city is missing, the geocoder finds no match, the result carries no
    ``display_name``, or the display name has fewer than three components.
  """
  # NaN is the only float that is not equal to itself.
  city_is_nan = isinstance(city, float) and city != city
  if city is None or city_is_nan or not str(city).strip():
    return None

  if geolocator is None:
    geolocator = Nominatim(user_agent="ARND")

  location = geolocator.geocode(city)
  # geocode() yields None when nothing matches; reading .raw would then raise.
  if location is None:
    return None

  display_name = location.raw.get('display_name')
  if not display_name:
    return None

  display_name_parts = display_name.split(',')
  if len(display_name_parts) > 2:
    return display_name_parts[2].strip()
  return None

In [55]:
# Fill missing states, geocoding each distinct city only once.
# get_state queries the Nominatim geocoder, so rows that share a city reuse a
# single lookup instead of triggering one request per row.
cities_missing_state = df.loc[df['State'].isnull(), 'City'].dropna().unique()
state_by_city = {city: get_state(city) for city in cities_missing_state}

# Only null State values are replaced; rows whose City is missing or could not
# be resolved keep a null State.
df['State'] = df['State'].fillna(df['City'].map(state_by_city))

## Keep only multi-artist rows and engineer features:

In [56]:
# Parse event dates and derive the day-of-year feature from them.
df['Event Date'] = pd.to_datetime(df['Event Date'])
df['day_of_year'] = df['Event Date'].dt.dayofyear

# Rows whose Headliner contains a double quote are the ones kept as multi-artist.
has_quote = df["Headliner"].str.contains('"', na=False)
multi_artist = df.loc[has_quote]

# Modelling columns: predictors plus the 'Avg. Gross USD' target.
feature_columns = [
    'State',
    'Ticket Price Min USD',
    'Ticket Price Max USD',
    'day_of_year',
    'Avg. Gross USD',
    'Avg. Event Capacity',
]

# Drop incomplete rows, then store State as a categorical column.
data = multi_artist[feature_columns].dropna().astype({'State': 'category'})
data

Unnamed: 0,State,Ticket Price Min USD,Ticket Price Max USD,day_of_year,Avg. Gross USD,Avg. Event Capacity
7,Pennsylvania,25.0,35.0,255,1240.00,220.0
25,Massachusetts,5.0,10.0,251,4285.00,1400.0
54,Massachusetts,22.0,35.0,249,9113.00,1600.0
58,Virginia,25.0,99.5,249,108716.75,3048.0
76,South Dakota,20.0,125.0,247,36846.00,1881.0
...,...,...,...,...,...,...
704599,Washington,8.0,8.0,9,3008.00,368.0
704616,New Hampshire,38.0,44.0,9,259742.00,6427.0
704655,Pennsylvania,10.0,15.0,7,18656.57,2711.0
704674,Washington,30.0,50.0,4,249551.00,10288.0


## Train-Test Split:

In [57]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
target_column = 'Avg. Gross USD'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

## XGBoost

In [58]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Initialize the XGBoost model.
# reg_alpha and random_state are the scikit-learn wrapper's parameter names for
# the L1 penalty and the RNG seed (previously passed as `alpha` and `seed`).
model = XGBRegressor(
    objective='reg:squarederror',
    max_depth=7,
    reg_alpha=10,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
    enable_categorical=True
)

# Define scoring metrics. With greater_is_better=False the scorer returns the
# negated error, so the signs are flipped back after cross-validation.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Run 5-fold cross-validation once and score every metric on the same fitted
# models, instead of refitting all folds separately for each metric.
cv_results = cross_validate(
    model, X, y, cv=5,
    scoring={'mse': mse_scorer, 'r2': r2_scorer, 'mae': mae_scorer},
)

# Per-fold scores; mse_scores keeps the scorer's negated sign.
mse_scores = cv_results['test_mse']
rmse_scores = np.sqrt(-mse_scores)
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_mae']  # Convert negative MAE back to positive

# Print results
print("5-Fold Cross-Validation Results:")
print("\nRMSE scores for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))
print("\nR-squared scores for each fold:", r2_scores)
print("Average R-squared (R2) Score:", np.mean(r2_scores))
print("\nMAE scores for each fold:", mae_scores)
print("Average MAE:", np.mean(mae_scores))

5-Fold Cross-Validation Results:

RMSE scores for each fold: [498386.07857358 746949.10138221 629837.17749798 339197.95942768
 264706.38366102]
Average RMSE: 495815.34010849416

R-squared scores for each fold: [0.52114005 0.40729118 0.47965147 0.62496228 0.68052001]
Average R-squared (R2) Score: 0.5427129976673671

MAE scores for each fold: [81491.50821748 70373.03977386 61879.74118719 52427.81022293
 62169.71209985]
Average MAE: 65668.36230026146


## LightGBM:

In [65]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Initialize the LightGBM model
model = LGBMRegressor(
    objective='regression',
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
    verbose=-1
)

# Define scoring metrics. With greater_is_better=False the scorer returns the
# negated error, so the signs are flipped back after cross-validation.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Run 5-fold cross-validation once and score every metric on the same fitted
# models, instead of refitting all folds separately for each metric.
cv_results = cross_validate(
    model, X, y, cv=5,
    scoring={'mse': mse_scorer, 'r2': r2_scorer, 'mae': mae_scorer},
)

# Per-fold scores; mse_scores keeps the scorer's negated sign.
mse_scores = cv_results['test_mse']
rmse_scores = np.sqrt(-mse_scores)
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_mae']

# Print results
print("5-Fold Cross-Validation Results:")
print("\nRMSE scores for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))
print("\nR-squared scores for each fold:", r2_scores)
print("Average R-squared (R2) Score:", np.mean(r2_scores))
print("\nMAE scores for each fold:", mae_scores)
print("Average MAE:", np.mean(mae_scores))

5-Fold Cross-Validation Results:

RMSE scores for each fold: [352076.1451668  525179.6686824  294590.90592243 204795.67399752
 287335.70879825]
Average RMSE: 332795.62051348074

R-squared scores for each fold: [0.76102641 0.7069949  0.88616484 0.86328693 0.6235615 ]
Average R-squared (R2) Score: 0.7682069150592056

MAE scores for each fold: [75792.79489352 66956.64446384 56948.23588024 51073.66091294
 64662.23732928]
Average MAE: 63086.71469596449


## Voting Regressor Ensemble (XGBoost + LightGBM):

In [78]:
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Initialize the base models
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
    enable_categorical=True
)

lgbm_model = LGBMRegressor(
    objective='regression',
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
    verbose=-1  # keep LightGBM's logging out of the cell output
)

# Create a Voting Regressor ensemble: predictions are a weighted average of the
# two base models, with 0.3 on XGBoost and 0.7 on LightGBM.
voting_model = VotingRegressor([('xgb', xgb_model), ('lgbm', lgbm_model)], weights=[0.3,0.7])

# Define scoring metrics. With greater_is_better=False the scorer returns the
# negated error, so the signs are flipped back after cross-validation.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Run 5-fold cross-validation once and score every metric on the same fitted
# ensembles, instead of refitting all folds separately for each metric.
cv_results = cross_validate(
    voting_model, X, y, cv=5,
    scoring={'mse': mse_scorer, 'r2': r2_scorer, 'mae': mae_scorer},
)

# Per-fold scores; mse_scores keeps the scorer's negated sign.
mse_scores = cv_results['test_mse']
rmse_scores = np.sqrt(-mse_scores)
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_mae']

# Print results
print("5-Fold Cross-Validation Results for Voting Regressor Ensemble:")
print("\nRMSE scores for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))
print("\nR-squared scores for each fold:", r2_scores)
print("Average R-squared (R2) Score:", np.mean(r2_scores))
print("\nMAE scores for each fold:", mae_scores)
print("Average MAE:", np.mean(mae_scores))

5-Fold Cross-Validation Results for Voting Regressor Ensemble:

RMSE scores for each fold: [355458.74571411 572858.31513141 291994.74904563 192430.52561164
 269025.12405067]
Average RMSE: 336353.49191069213

R-squared scores for each fold: [0.75641243 0.65137878 0.8881624  0.87929746 0.67001021]
Average R-squared (R2) Score: 0.7690522555880901

MAE scores for each fold: [74245.68496927 66142.3109717  56070.88775422 49407.83814095
 62636.8793098 ]
Average MAE: 61700.72022918755
