In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

dataset_path = '/content/drive/MyDrive/combined_df.csv'
df = pd.read_csv(dataset_path)
train_df = df[
        (df['Year'] >= 2020) &
        (df['Headliner'].str.contains('"', na=False)) &
        (~df['Support'].isna()) &
        (df['Genre'] != 'Family Entertainment')
    ]
print("First 5 rows of the dataset:")
print(df.head())



  df = pd.read_csv(dataset_path)


First 5 rows of the dataset:
   Event Date                              Headliner  \
0  2024-09-18                                  Creed   
1  2024-09-14                                  Creed   
2  2024-09-13  Bruce Springsteen & The E Street Band   
3  2024-09-13                                  Creed   
4  2024-09-13                Billy Joel, Rod Stewart   

                          sp artist_name  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                                     sp artist_genre  sp followers  \
0  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
1  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
2  ['heartland rock', 'mellow gold', 'permanent w...     6567386.0   
3  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
4  ['album rock', '

In [None]:
print("\nSummary statistics of the dataset:")
print(df.describe())


Summary statistics of the dataset:
       sp followers  sp popularity  yt View Count  yt Subscriber Count  \
count  3.245600e+04   32456.000000   3.210200e+04         3.210200e+04   
mean   2.498679e+06      51.025357   7.947109e+08         1.368386e+06   
std    9.005023e+06      20.372168   3.018964e+09         5.633076e+06   
min    0.000000e+00       0.000000   0.000000e+00         0.000000e+00   
25%    6.217250e+04      40.000000   2.319628e+06         7.340000e+03   
50%    3.271130e+05      53.000000   3.686581e+07         7.660000e+04   
75%    1.490683e+06      65.000000   3.129296e+08         4.710000e+05   
max    1.236853e+08     100.000000   6.104600e+10         3.210000e+08   

       yt Video Count  Total population  Under 5 years population  \
count    32102.000000      3.249400e+04              32494.000000   
mean       255.025263      8.631460e+05              50734.760448   
std       1892.338983      1.569479e+06              95171.317352   
min          0.000000

In [3]:
# Display missing value counts for each column
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
print("Missing Values and Percentage:")
print(pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage}).sort_values(by='Percentage', ascending=False))


Missing Values and Percentage:
                                 Missing Values  Percentage
yt Description                           682831   96.495758
yt Channel ID                            675526   95.463436
yt Title                                 675526   95.463436
yt name                                  675526   95.463436
yt View Count                            675526   95.463436
yt Subscriber Count                      675526   95.463436
yt Published At                          675526   95.463436
yt Video Count                           675526   95.463436
sp popularity                            675172   95.413409
sp artist_name                           675172   95.413409
sp followers                             675172   95.413409
sp artist_genre                          675172   95.413409
Total population                         675134   95.408039
10 to 14 years population                675134   95.408039
15 to 19 years population                675134   95.408039
Under 5 y

Handling Missing Values

In [4]:
from geopy.geocoders import Nominatim

# Define a function to infer 'State' based on 'City'
def fill_state(city):
    if pd.isnull(city) or city == "Unknown":
        return "Unknown"
    geolocator = Nominatim(user_agent="geoapiExercises")
    try:
        # Attempt to fetch the location information using geopy
        location = geolocator.geocode(city, timeout=10)
        if location:
            # Extract state information from the address
            return location.address.split(',')[-3].strip()
        else:
            return "Unknown"
    except:
        # If geocoding fails, return 'Unknown'
        return "Unknown"

# Handle missing values in the 'State' column using the 'fill_state' function
df['State'] = df.apply(lambda x: fill_state(x['City']) if pd.isnull(x['State']) else x['State'], axis=1)

# Print to check remaining missing values in the 'State' column
print(f"Remaining missing values in 'State': {df['State'].isnull().sum()}")

# Fill missing values in categorical columns with 'Missing'
categorical_cols = ['Support', 'Market', 'Genre', 'Promoter', 'Company Type', 'Venue']
for col in categorical_cols:
    df[col] = df[col].fillna("Missing")

# Fill missing values in numerical columns with mean
df['Ticket Price Avg. USD'] = df['Ticket Price Avg. USD'].fillna(df['Ticket Price Avg. USD'].mean())
df['Avg. Event Capacity'] = df['Avg. Event Capacity'].fillna(df['Avg. Event Capacity'].mean())

# Check if any missing values remain in the dataset
print("Remaining missing values in the dataset:")
print(df.isnull().sum())



Remaining missing values in 'State': 0
Remaining missing values in the dataset:
Event Date                              0
Headliner                               0
sp artist_name                     675172
sp artist_genre                    675172
sp followers                       675172
sp popularity                      675172
yt name                            675526
yt Channel ID                      675526
yt Title                           675526
yt Description                     682831
yt Published At                    675526
yt View Count                      675526
yt Subscriber Count                675526
yt Video Count                     675526
Total population                   675134
Under 5 years population           675134
5 to 9 years population            675134
10 to 14 years population          675134
15 to 19 years population          675134
20 to 24 years population          675134
25 to 34 years population          675134
35 to 44 years population          675

In [5]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Convert 'Event Date' to datetime format and extract features
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Feature engineering: extract useful time-related features
df['Year'] = df['Event Date'].dt.year
df['Month'] = df['Event Date'].dt.month
df['Day'] = df['Event Date'].dt.day
df['Day_of_Week'] = df['Event Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday
df['Day_of_Year'] = df['Event Date'].dt.dayofyear
df['Is_Weekend'] = (df['Day_of_Week'] >= 5).astype(int)  # 1 for Saturday/Sunday

# Create interaction features between price and capacity
df['Price_Range'] = df['Ticket Price Max USD'] - df['Ticket Price Min USD']

# Scale/normalize numerical features
scaler = StandardScaler()

# Select numerical columns for normalization
numerical_features = [
    'Ticket Price Avg. USD',
    'Avg. Event Capacity',
    'Price_Range',
]

# Normalize the numerical features
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Drop the original 'Event Date' column after processing
df = df.drop(columns=['Event Date'])
df = df.drop(columns=['day_of_week'])
# Final check: View the processed data
print(df.head())


                               Headliner  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                          sp artist_name  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                                     sp artist_genre  sp followers  \
0  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
1  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
2  ['heartland rock', 'mellow gold', 'permanent w...     6567386.0   
3  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
4  ['album rock', 'classic rock', 'mellow gold', ...     6312751.0   

   sp popularity                                y

Distribution-balanced Stratified Cross-Validation

In [None]:
import numpy as np

def dbscv(x: np.array, y: np.array, params: dict) -> (list, list, list, list):
    """
    Distribution-balanced stratified cross-validation (DBSCV) splitting method.

    Parameters:
    - x: np.array, feature matrix of shape (N, M)
    - y: np.array, labels array of shape (N,)
    - params: dict, should contain 'K' (number of folds)

    Returns:
    - x_fit: list of np.array, training feature sets for each fold
    - x_val: list of np.array, validation feature sets for each fold
    - y_fit: list of np.array, training label sets for each fold
    - y_val: list of np.array, validation label sets for each fold
    """
    k = params.get('K', 5)  # Number of folds, default is 5
    N, M = x.shape  # N: number of samples, M: number of features
    classes = np.unique(y)  # Unique class labels

    # Initialize reference feature vector X0 (zeros for continuous attributes)
    X0 = np.zeros(M)

    # Initialize folds: T[0] corresponds to fold 1
    T = [[] for _ in range(k)]

    # List to hold remaining samples after main distribution
    L_r = []

    # For each class, construct the sorted list L_i
    for c in classes:
        # Indices of samples with class label c
        S_i = np.where(y == c)[0].tolist()
        Li = []
        last_sample = X0

        # Step (2): Sort the cases of each class
        while S_i:
            # Extract samples of class c
            samples = x[S_i, :]
            # Compute Euclidean distances to the last sample
            distances = np.linalg.norm(samples - last_sample, axis=1)
            # Find the sample with the minimum distance
            min_idx = np.argmin(distances)
            sample_index = S_i[min_idx]
            # Add to the sorted list Li
            Li.append(sample_index)
            # Update last_sample and remove the selected sample from S_i
            last_sample = x[sample_index]
            S_i.pop(min_idx)

        # Step (3): Partition each Li into k folds
        idx = 0
        while idx + k <= len(Li):
            for j in range(k):
                index = Li[idx + j]
                T[j].append(index)
            idx += k

        # Collect remaining samples to L_r
        Li_remain = Li[idx:]
        if Li_remain:
            L_r.extend(Li_remain)

    # Distribute remaining samples in L_r into folds T_j
    for i, index in enumerate(L_r):
        T[i % k].append(index)

    # Prepare training and validation sets for each fold
    x_fit = []
    x_val = []
    y_fit = []
    y_val = []

    indices_all = np.arange(N)

    for j in range(k):
        val_indices = np.array(T[j])
        train_indices = np.setdiff1d(indices_all, val_indices)

        x_val.append(x[val_indices])
        y_val.append(y[val_indices])

        x_fit.append(x[train_indices])
        y_fit.append(y[train_indices])

    return x_fit, x_val, y_fit, y_val


Prepare Dataset

In [38]:
from sklearn.preprocessing import LabelEncoder
categorical_features = ['Headliner', 'Support', 'Market', 'Genre', 'Promoter', 'Company Type', 'Venue', 'City', 'State']
numerical_features = ['Avg. Gross USD', 'Ticket Price Avg. USD', 'Avg. Event Capacity',
                      'Price_Range', 'Year', 'Month', 'Day',
                      'Day_of_Year', 'Is_Weekend']
# Encode categorical features with LabelEncoder
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save the encoder for later use

# Concatenate encoded categorical features with numerical features
final_df = df[numerical_features + categorical_features]

# Ensure no missing values remain in the final dataset
final_df = final_df.fillna(0)

# Features (X) and target (y)
X = final_df.drop(columns=['Avg. Gross USD']).values  # Drop the target column from features
y = final_df['Avg. Gross USD'].values                # Target variable



In [39]:
from sklearn.model_selection import train_test_split

# Stratified split to maintain distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,       # 20% for final testing
    random_state=42,     # Seed for reproducibility
    shuffle=True         # Shuffle before splitting
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (566102, 17)
Test set shape: (141526, 17)


**Optuna XGBoost**

In [8]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [40]:
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.base import RegressorMixin

class XGBWrapper(XGBRegressor, RegressorMixin):
    pass
# ===============================
# Example: Optuna for XGBoost
# Minimizing MAE via cross-validation
# ===============================

def objective(trial):
    """
    Objective function for Optuna.
    We define the hyperparameter search space
    and train an XGBoost model using those hyperparameters.

    The function returns the cross-validated MAE (Mean Absolute Error).
    """
    # 1) Define the hyperparameter search space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.1),
        # Feel free to add more hyperparameters if needed
    }

    # 2) Initialize the XGBoost model
    model = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        **param
    )

    # 3) Create a 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # 4) Perform cross-validation using negative MAE
    #    (since sklearn uses "maximize" by default, we negate MAE)
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=kf,
        n_jobs=-1
    )

    # 5) Convert negative MAE to positive, then compute average
    avg_mae = -scores.mean()

    # 6) Return the metric that needs to be minimized
    return avg_mae


# Create a study object.
# "direction='minimize'" means we want to minimize the objective.
study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))

# Start optimization.
# "n_trials=20" means we will try 20 different hyperparameter sets.
study.optimize(objective, n_trials=20, show_progress_bar=True)

print("\n[Optuna] Best trial object:")
print(study.best_trial)

print("\n[Optuna] Best params found:")
print(study.best_params)

# ===============================
# Train an XGBoost model with the best hyperparameters
# ===============================

best_params = study.best_params
optuna_xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **best_params
)

# Fit the model using the entire training set
optuna_xgb.fit(X_train, y_train)

# ===============================
# Evaluate on the test set
# ===============================
y_pred_optuna = optuna_xgb.predict(X_test)

mae_optuna = mean_absolute_error(y_test, y_pred_optuna)
mse_optuna = mean_squared_error(y_test, y_pred_optuna)
r2_optuna  = r2_score(y_test, y_pred_optuna)

print("\n[Optuna XGBoost] Test set performance:")
print(f"MAE: {mae_optuna:.4f}")
print(f"MSE: {mse_optuna:.4f}")
print(f"R² : {r2_optuna:.4f}")

[I 2025-02-14 06:17:36,725] A new study created in memory with name: no-name-41fbb064-dffb-4e86-901d-b2e1a34627c8


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-02-14 06:17:53,299] Trial 0 finished with value: 22700.820290350177 and parameters: {'n_estimators': 144, 'max_depth': 10, 'learning_rate': 0.06504856968981275, 'subsample': 0.8, 'colsample_bytree': 0.6}. Best is trial 0 with value: 22700.820290350177.
[I 2025-02-14 06:17:57,898] Trial 1 finished with value: 29487.54818589813 and parameters: {'n_estimators': 89, 'max_depth': 3, 'learning_rate': 0.13983740016490973, 'subsample': 0.9, 'colsample_bytree': 0.9}. Best is trial 0 with value: 22700.820290350177.
[I 2025-02-14 06:18:05,176] Trial 2 finished with value: 23906.321706625313 and parameters: {'n_estimators': 55, 'max_depth': 10, 'learning_rate': 0.11536162338241392, 'subsample': 0.7, 'colsample_bytree': 0.6}. Best is trial 0 with value: 22700.820290350177.
[I 2025-02-14 06:18:10,472] Trial 3 finished with value: 43183.11504582285 and parameters: {'n_estimators': 96, 'max_depth': 5, 'learning_rate': 0.0199473547030745, 'subsample': 0.8, 'colsample_bytree': 0.7}. Best is tria

**Stacking Model**

In [41]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
import numpy as np

# Initialize containers for out-of-fold predictions
meta_features_train = np.zeros((X_train.shape[0], 2))  # RF and XGB columns
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("\nGenerating meta-features through 5-fold cross-validation:")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\nProcessing Fold {fold+1}/5")

    # Split training data into training/validation subsets
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # ----- Random Forest Training -----
    rf = RandomForestRegressor(
        n_estimators=150,
        max_depth=10,
        min_samples_split=5,
        n_jobs=-1,              # Use all cores
        random_state=42
    )
    rf.fit(X_tr, y_tr)
    meta_features_train[val_idx, 0] = rf.predict(X_val)

    # ----- XGBoost Training -----
    xgb = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        random_state=42
    )
    xgb.fit(X_tr, y_tr)
    meta_features_train[val_idx, 1] = xgb.predict(X_val)


Generating meta-features through 5-fold cross-validation:

Processing Fold 1/5

Processing Fold 2/5

Processing Fold 3/5

Processing Fold 4/5

Processing Fold 5/5


In [42]:
# ==================================
#  Meta-Model Training (ElasticNet)
# ==================================
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train meta-model
meta_model = ElasticNet(
    alpha=0.001,        # Regularization strength
    l1_ratio=0.7,       # Balance between L1/L2
    max_iter=10000,     # Ensure convergence
    random_state=42
)
meta_model.fit(meta_features_train, y_train)

# Evaluate on training meta-features
train_pred = meta_model.predict(meta_features_train)
print("\nMeta-model training performance:")
print(f"- MSE: {mean_squared_error(y_train, train_pred):.4f}")
print(f"- R²: {r2_score(y_train, train_pred):.4f}")


Meta-model training performance:
- MSE: 10444581972.6508
- R²: 0.9460


In [43]:
# ===================================
#  Final Evaluation on Test Set
# ===================================
# Retrain base models on full training data
print("\nRetraining base models on full training set...")

# Random Forest
final_rf = RandomForestRegressor(
    n_estimators=150,
    max_depth=10,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42
).fit(X_train, y_train)

# XGBoost
final_xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    random_state=42
).fit(X_train, y_train)

# Generate test set meta-features
meta_features_test = np.column_stack([
    final_rf.predict(X_test),
    final_xgb.predict(X_test)
])

# Final prediction and evaluation
test_pred = meta_model.predict(meta_features_test)
print("\nFinal test set performance:")
print(f"- MSE: {mean_squared_error(y_test, test_pred):.4f}")
print(f"- R²: {r2_score(y_test, test_pred):.4f}")


Retraining base models on full training set...

Final test set performance:
- MSE: 6565166562.3304
- R²: 0.9619


In [None]:
# Parameters for DBSCV
params = {'K': 5}  # Number of folds

# Apply DBSCV to the dataset
x_fit, x_val, y_fit, y_val = dbscv(X_train, y_train, params)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train and evaluate using DBSCV splits
rf_scores = []  # To store evaluation metrics for each fold

for i in range(len(x_fit)):
    # Train the model on training set
    rf_model.fit(x_fit[i], y_fit[i])

    # Predict on validation set
    predictions = rf_model.predict(x_val[i])

    # Calculate metrics
    mse = mean_squared_error(y_val[i], predictions)
    r2 = r2_score(y_val[i], predictions)
    rf_scores.append((mse, r2))

# Print evaluation results for each fold
for fold, (mse, r2) in enumerate(rf_scores, 1):
    print(f"Random Forest - Fold {fold}: MSE = {mse:.4f}, R^2 = {r2:.4f}")

Random Forest - Fold 1: MSE = 0.0447, R^2 = 0.9537
Random Forest - Fold 2: MSE = 0.0201, R^2 = 0.9802
Random Forest - Fold 3: MSE = 0.0694, R^2 = 0.9397
Random Forest - Fold 4: MSE = 0.0056, R^2 = 0.9939
Random Forest - Fold 5: MSE = 0.0031, R^2 = 0.9968


In [None]:
from xgboost import XGBRegressor

# Initialize XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, objective='reg:squarederror')

# Train and evaluate using DBSCV splits
xgb_scores = []  # To store evaluation metrics for each fold

for i in range(len(x_fit)):
    # Train the model on training set
    xgb_model.fit(x_fit[i], y_fit[i])

    # Predict on validation set
    predictions = xgb_model.predict(x_val[i])

    # Calculate metrics
    mse = mean_squared_error(y_val[i], predictions)
    r2 = r2_score(y_val[i], predictions)
    xgb_scores.append((mse, r2))

# Print evaluation results for each fold
for fold, (mse, r2) in enumerate(xgb_scores, 1):
    print(f"XGBoost - Fold {fold}: MSE = {mse:.4f}, R^2 = {r2:.4f}")


XGBoost - Fold 1: MSE = 16559011190.2853, R^2 = 0.9162
XGBoost - Fold 2: MSE = 39192167325.5710, R^2 = 0.8278
XGBoost - Fold 3: MSE = 11240037901.6352, R^2 = 0.9360
XGBoost - Fold 4: MSE = 11700924396.8403, R^2 = 0.9350
XGBoost - Fold 5: MSE = 16185369833.5226, R^2 = 0.9135


In [None]:
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.2159


In [None]:
def dbscv2(x: np.array, y: np.array, params: dict) -> list:
    k = params.get('K', 5)
    N, M = x.shape
    classes = np.unique(y)
    X0 = np.zeros(M)
    T = [[] for _ in range(k)]
    L_r = []

    for c in classes:
        S_i = np.where(y == c)[0].tolist()
        Li = []
        last_sample = X0

        while S_i:
            samples = x[S_i]
            distances = np.linalg.norm(samples - last_sample, axis=1)
            min_idx = np.argmin(distances)
            sample_idx = S_i.pop(min_idx)
            Li.append(sample_idx)
            last_sample = x[sample_idx]

        idx = 0
        while idx + k <= len(Li):
            for j in range(k):
                T[j].append(Li[idx + j])
            idx += k
        L_r.extend(Li[idx:])

    for i, idx in enumerate(L_r):
        T[i % k].append(idx)

    cv_folds = []
    all_indices = np.arange(N)
    for j in range(k):
        val_indices = np.array(T[j])
        train_indices = np.setdiff1d(all_indices, val_indices)
        cv_folds.append((train_indices, val_indices))

    return cv_folds

In [9]:
!pip install --upgrade scikit-learn xgboost lightgbm dask[dataframe]

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting dask[dataframe]
  Downloading dask-2025.2.0-py3-none-any.whl.metadata (3.8 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dask-2025.2.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost, dask
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.3
    Uninstalling xgboost-2.1.3:
      Successfully uninstalled xgboost-2.1.3
  Attempting uninstall: dask
    Found existing installation: dask 2024.10.0
    Uninstalling dask-2024.10.0:
      Successfully uninstalled dask-2024.10.0
Successfully installed dask-2025.2.0 xgboost-2.1.4


**GridSearchCV for XGBoost and LightGBM**

In [44]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Define the parameter grids
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1]
}

param_grid_lgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1]
}

# Initialize the models
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
lgb = LGBMRegressor(random_state=42)

# Perform GridSearchCV for XGBoost
grid_xgb = GridSearchCV(
    xgb,
    param_grid_xgb,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    error_score='raise'
)
grid_xgb.fit(X_train, y_train)

grid_lgb = GridSearchCV(
    lgb,
    param_grid_lgb,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    error_score='raise'
)
grid_lgb.fit(X_train, y_train)

# Get the best estimators
best_xgb = grid_xgb.best_estimator_
best_lgb = grid_lgb.best_estimator_

# Print the best parameters
print("Best XGBoost parameters:", grid_xgb.best_params_)
print("Best LightGBM parameters:", grid_lgb.best_params_)

# Evaluate the best XGBoost model on the test set
y_pred_xgb = best_xgb.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Evaluate the best LightGBM model on the test set
y_pred_lgb = best_lgb.predict(X_test)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Print the evaluation results
print("\nBest XGBoost model:")
print(f"- MAE: {mae_xgb:.4f}")
print(f"- R²: {r2_xgb:.4f}")

print("\nBest LightGBM model:")
print(f"- MAE: {mae_lgb:.4f}")
print(f"- R²: {r2_lgb:.4f}")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2767
[LightGBM] [Info] Number of data points in the train set: 566102, number of used features: 17
[LightGBM] [Info] Start training from score 118358.566623
Best XGBoost parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}
Best LightGBM parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}





Best XGBoost model:
- MAE: 22217.5524
- R²: 0.9313

Best LightGBM model:
- MAE: 23452.1467
- R²: 0.9273


In [51]:
import pickle
model_paths = {
    "optuna_xgb.pkl": "/content/optuna_xgb.pkl",
    "meta_model.pkl": "/content/meta_model.pkl",
    "grid_lgb.pkl": "/content/grid_lgb.pkl",
    "grid_xgb.pkl": "/content/grid_xgb.pkl",
    "final_rf.pkl": "/content/final_rf.pkl",
    "final_xgb.pkl": "/content/final_xgb.pkl"
}
# Define the trained models to be saved
model_objects = {
    "optuna_xgb.pkl": optuna_xgb,  # Optuna-tuned XGBoost model
    "meta_model.pkl": meta_model,  # Stacking meta-model (ElasticNet)
    "grid_lgb.pkl": grid_lgb.best_estimator_,  # Best LightGBM model from GridSearchCV
    "grid_xgb.pkl": grid_xgb.best_estimator_ ,  # Best XGBoost model from GridSearchCV
    "final_rf.pkl": final_rf,  # Final Random Forest model
    "final_xgb.pkl": final_xgb   # Final XGBoost model
}

for filename, model in model_objects.items():
    with open(model_paths[filename], "wb") as file:
        pickle.dump(model, file)

# Print and return the model file paths
print("✅ Models have been successfully saved at the following locations:")
for name, path in model_paths.items():
    print(f"{name}: {path}")


✅ Models have been successfully saved at the following locations:
optuna_xgb.pkl: /content/optuna_xgb.pkl
meta_model.pkl: /content/meta_model.pkl
grid_lgb.pkl: /content/grid_lgb.pkl
grid_xgb.pkl: /content/grid_xgb.pkl
final_rf.pkl: /content/final_rf.pkl
final_xgb.pkl: /content/final_xgb.pkl


In [24]:
#Load test dataset
test_file_path = '/test_all_genre.csv'
test_df = pd.read_csv(test_file_path)
print("First 5 rows of the dataset:")
print(test_df.head())

First 5 rows of the dataset:
   Event Date  Number of Shows  \
0  2025-01-11                1   
1  2025-01-10                1   
2  2025-01-10                1   
3  2025-01-06                1   
4  2024-12-31                1   

                                           Headliner                  Support  \
0                                      Buddha Trixie  Sports Coach, Herr God.   
1                                       Bonnie Hayes                      NaN   
2  The Drifters, Cornell Gunter's Coasters, The P...                      NaN   
3                     Gary Lucas & Gods And Monsters                      NaN   
4                                 Straight No Chaser                      NaN   

                           Venue             City       State        Country  \
0     McMenamins Mission Theater         Portland      Oregon  United States   
1          Sweetwater Music Hall      Mill Valley  California  United States   
2  Bergen Performing Arts Center       

In [25]:
# Display missing values before processing
missing_values_before = test_df.isnull().sum()
missing_percentage_before = (missing_values_before / len(test_df)) * 100
print("🔍 Missing Values Before Processing:")
print(pd.DataFrame({'Missing Values': missing_values_before, 'Percentage': missing_percentage_before}).sort_values(by='Percentage', ascending=False))


🔍 Missing Values Before Processing:
                                 Missing Values  Percentage
yt Description                             2729   71.759138
yt Title                                   2502   65.790166
yt name                                    2502   65.790166
yt Channel ID                              2502   65.790166
yt View Count                              2502   65.790166
yt Subscriber Count                        2502   65.790166
yt Video Count                             2502   65.790166
yt Published At                            2502   65.790166
Total population                           2501   65.763871
sp popularity                              2501   65.763871
sp artist_genre                            2501   65.763871
sp followers                               2501   65.763871
15 to 19 years population                  2501   65.763871
10 to 14 years population                  2501   65.763871
5 to 9 years population                    2501   65.763871
Unde

In [29]:
# Convert 'Event Date' to datetime format
test_df['Event Date'] = pd.to_datetime(test_df['Event Date'])

# Extract time-based features
test_df['Year'] = test_df['Event Date'].dt.year
test_df['Month'] = test_df['Event Date'].dt.month
test_df['Day'] = test_df['Event Date'].dt.day
test_df['Day_of_Year'] = test_df['Event Date'].dt.dayofyear
test_df['Is_Weekend'] = (test_df['Event Date'].dt.dayofweek >= 5).astype(int)  # 1 for Sat/Sun

# Create 'Price_Range' feature
test_df['Price_Range'] = test_df['Ticket Price Max USD'] - test_df['Ticket Price Min USD']



In [30]:
# Fill missing values in categorical features with "Missing"
categorical_features = ['Headliner', 'Support', 'Market', 'Genre', 'Promoter', 'Company Type', 'Venue', 'City', 'State']
for col in categorical_features:
    test_df[col] = test_df[col].fillna("Missing")

# Fill missing values in numerical features with the column mean
numerical_features = ['Ticket Price Avg. USD', 'Avg. Event Capacity', 'Price_Range', 'Year', 'Month', 'Day', 'Day_of_Year', 'Is_Weekend']
for col in numerical_features:
    test_df[col] = test_df[col].fillna(test_df[col].mean())


In [31]:
# Apply StandardScaler (ensure same transformation as training)
scaler = StandardScaler()
test_df[numerical_features] = scaler.fit_transform(test_df[numerical_features])


In [33]:
for col in categorical_features:
    if col in label_encoders:
        le = label_encoders[col]  # Use the pre-fitted encoder from training
    else:
        le = LabelEncoder()
        le.fit(test_df[col].astype(str))  # Fit on test data if missing

    # Transform the test dataset, mapping unseen labels to -1
    test_df[col] = test_df[col].astype(str).apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

    # Store the encoder
    label_encoders[col] = le


In [35]:
# Define final feature set used in training
final_features = numerical_features + categorical_features
X_test_final = test_df[final_features].values
# Save the processed test dataset
processed_test_file = "/content/processed_test_dataset.csv"
test_df.to_csv(processed_test_file, index=False)

print(f"✅ Processed test dataset saved at: {processed_test_file}")

✅ Processed test dataset saved at: /content/processed_test_dataset.csv


In [52]:
# Load trained models
models = {}
for name, path in model_paths.items():
    with open(path, "rb") as file:
        models[name] = pickle.load(file)

# Load the processed test dataset
test_file_path = "/content/processed_test_dataset.csv"
test_df = pd.read_csv(test_file_path)

# Extract feature set for prediction
X_test_final = test_df[numerical_features + categorical_features].values

# Generate predictions for each individual model
predictions = {
    "grid_lgb": models["grid_lgb"].predict(X_test_final),
    "grid_xgb": models["grid_xgb"].predict(X_test_final),
    "optuna_xgb": models["optuna_xgb"].predict(X_test_final),
    "final_rf": models["final_rf"].predict(X_test_final),
    "final_xgb": models["final_xgb"].predict(X_test_final)
}

# Generate meta-features for stacking model (using final_rf and final_xgb)
meta_features_test = np.column_stack([
    predictions["final_rf"],  # Correct RandomForest predictions
    predictions["final_xgb"]  # Correct XGBoost predictions
])

# Use meta-model to predict final output (stacking)
predictions["meta_model"] = models["meta_model"].predict(meta_features_test)

# Generate submission CSV files for each model
submission_files = {}
for name, preds in predictions.items():
    submission_df = pd.DataFrame({"id": test_df.index, "Average Gross": preds})
    submission_path = f"/mnt/data/submission_{name}.csv"
    submission_df.to_csv(submission_path, index=False)
    submission_files[name] = submission_path

# Return paths to submission files
submission_files


KeyError: "['Avg. Gross USD'] not in index"