In [23]:
import pandas as pd
import json
import time
import numpy as np
from IPython.display import display
import ipywidgets as widgets
import holidays

In [24]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

import seaborn as sns
import matplotlib.pyplot as plt

In [25]:
from flaml import AutoML
import xgboost as xgb
from xgboost import XGBRegressor

In [28]:
# Read the combined all-genre event table from disk.
df = pd.read_csv('allgenre_combined_df.csv')

# Keep only rows that satisfy every inclusion rule below. No dropna() happens
# here, so rows with missing values in other columns remain available for
# training. (Comparisons against NaN are False, so a row missing Year or a
# ticket price is still excluded by the rules themselves.)
filtered_data = df.loc[
    df['Year'].ge(2020)                                              # 2020 onwards
    & ~df['Headliner'].str.contains('"', na=False)                   # no double quote in the headliner name
    & df['Genre'].ne('Family Entertainment')                         # exclude this genre
    & df['Ticket Price Min USD'].gt(0)                               # strictly positive minimum price
    & df['Ticket Price Min USD'].lt(df['Ticket Price Max USD'])      # min strictly below max
]

# Fully populated rows only (a NaN in any column removes the row); the test
# set is drawn exclusively from these.
filtered_data_no_na = filtered_data.dropna(axis=0, how='any')

# Hold out 30% of the complete rows as the test set; the other 70% is discarded
# here because those rows are picked up again from `filtered_data` below.
_, test_data = train_test_split(filtered_data_no_na, test_size=0.3, random_state=42)

# Training set = every filtered row whose index label is not in the test set,
# which keeps the two sets mutually exclusive.
train_data = filtered_data.drop(index=test_data.index)

# Report the resulting sizes.
print(f"Total filtered data size: {len(filtered_data)}")
print(f"Training set size (including NaNs): {len(train_data)}")
print(f"Testing set size (no NaNs): {len(test_data)}")

# ADJUST THE TRAINING SET AS YOU NEED
# Remember to check data availability




# APPLY ML MODEL, for example:
# Select the model inputs and the regression target from the training frame.
feature_columns = ['Avg. Event Capacity', 'Ticket Price Min USD','Ticket Price Max USD','headliner_monthly_listeners']
X_train = train_data[feature_columns]
y_train = train_data['Avg. Gross USD']

# Gradient-boosted tree regressor with a fixed seed so repeated runs agree.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error`` rather than through
    that function's ``squared=False`` keyword, which scikit-learn deprecated
    in 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scoring metrics.
# greater_is_better=False makes the scorer return the NEGATED RMSE (scikit-learn
# always maximises scores), so consumers must flip the sign back.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)  # RMSE
r2_scorer = make_scorer(r2_score)  # R²


# CROSS VALIDATION
# Five shuffled folds with a fixed seed, so the split is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate both metrics in a single pass: the model is fitted once per fold
# (5 fits) instead of once per fold per metric (10 fits). The folds and the
# model seed are fixed, so the scores match two separate cross_val_score runs.
cv_results = cross_validate(
    xgb_model,
    X_train,
    y_train,
    cv=kf,
    scoring={'rmse': rmse_scorer, 'r2': r2_scorer},
)

# RMSE: the scorer reports negated values, so flip the sign before formatting.
cv_rmse_scores = cv_results['test_rmse']
formatted_rmse_scores = [int(round(-score)) for score in cv_rmse_scores]  # Negate each score, round, and convert to integer
mean_cv_rmse = int(round(-np.mean(cv_rmse_scores)))  # Negate the mean, round, and convert to integer
print("Cross-validation RMSE scores:", formatted_rmse_scores)
print("Mean CV RMSE:", mean_cv_rmse)

# R²: cast to built-in float so the list prints as plain numbers on every
# NumPy version (NumPy 2 shows np.float64(...) for NumPy scalars in a list).
cv_r2_scores = cv_results['test_r2']
formatted_r2_scores = [round(float(score), 3) for score in cv_r2_scores]
print("Cross-validation R² scores:", formatted_r2_scores)
print("Mean CV R²:", round(float(np.mean(cv_r2_scores)), 3))

# TEST ON TESTING SET
# Build the hold-out inputs/target with the same feature columns used for training.
X_test = test_data[feature_columns]
y_test = test_data['Avg. Gross USD']

# Fit on the full training set, then predict the held-out rows.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Calculate R²
test_r2 = r2_score(y_test, y_pred)
print(f"Test R²: {test_r2:.3f}")

# Calculate RMSE as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on old and new releases alike.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {test_rmse:.2f}")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Total filtered data size: 79691
Training set size (including NaNs): 78682
Testing set size (no NaNs): 1009
Cross-validation RMSE scores: [201432, 171126, 181695, 174557, 202660]
Mean CV RMSE: 186294
Cross-validation R² scores: [0.917, 0.937, 0.938, 0.937, 0.936]
Mean CV R²: 0.933
Test R²: 0.928
Test RMSE: 177043.05
