In [31]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
file_path = '/content/drive/MyDrive/Colab Notebooks/Pollstar_all_genres.csv'
df = pd.read_csv(file_path)
df = df[
    (df['Genre'] != 'Family Entertainment') &
    (df['Ticket Price Min USD'] > 0) &
    (df['Ticket Price Min USD'] < df['Ticket Price Max USD'])
]
df = df.dropna()
df

Unnamed: 0,Event Date,Number of Shows,Headliner,Support,Venue,City,State,Country,Market,Company Type,Currency,Promoter,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min USD,Ticket Price Max USD,Ticket Price Avg. USD
0,2024-09-18,1,Creed,"3 Doors Down, Finger Eleven",Coastal Credit Union Music Park at Walnut Creek,Raleigh,North Carolina,United States,Raleigh-Durham (Fayetteville),Amphitheatre,US DOLLAR,Live Nation,Pop / Rock,20295.0,1228939.0,20295.0,100%,39.50,225.00,60.55
1,2024-09-14,1,Creed,"3 Doors Down, Finger Eleven",The Cynthia Woods Mitchell Pavilion,The Woodlands,Texas,United States,Houston,Amphitheatre,US DOLLAR,Live Nation,Pop / Rock,16308.0,1374174.0,16308.0,100%,39.50,225.00,84.26
3,2024-09-13,1,Creed,"3 Doors Down, Finger Eleven",Frost Bank Center,San Antonio,Texas,United States,San Antonio,Arena,US DOLLAR,Live Nation,Pop / Rock,14995.0,1402969.0,14995.0,100%,39.50,225.00,93.56
6,2024-09-11,1,Creed,"3 Doors Down, Finger Eleven",Dos Equis Pavilion,Dallas,Texas,United States,Dallas-Ft. Worth,Amphitheatre,US DOLLAR,Live Nation,Pop / Rock,19303.0,1512310.0,19303.0,100%,39.50,159.50,78.35
8,2024-09-11,1,$uicideboy$,"Denzel Curry, Pouya, EKKSTACY, Shakewell, Haarper",Paycom Center,Oklahoma City,Oklahoma,United States,Oklahoma City,Arena,US DOLLAR,AEG Presents,Rap / HipHop,9505.0,956748.0,11329.0,83%,34.50,195.50,100.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704672,1999-01-05,2,Black Sabbath,"Pantera, Incubus, System Of A Down",Forum,Inglewood,California,United States,Los Angeles,Arena,Bahraini Dinar,"Avalon Attractions, Jennifer Perry Presents, R...",Pop / Rock,14981.0,670227.5,14981.0,100%,25.00,65.00,44.74
704686,1999-01-02,1,Shania Twain,Leahy,MGM Grand Garden Arena,Las Vegas,Nevada,United States,Las Vegas,Arena,US DOLLAR,Evening Star Productions,Country,11169.0,872210.0,13694.0,81%,40.00,100.00,78.09
704697,1999-01-02,1,Pat McGee,"Kyle Davis, Vertical Horizon",Irving Plaza,New York,New York,United States,New York,Club,US DOLLAR,Ron Delsener Presents,Pop / Rock,1101.0,15334.0,1101.0,100%,13.50,15.00,13.93
704698,1999-01-02,1,Black Sabbath,"Pantera, Incubus",Thomas & Mack Center,Las Vegas,Nevada,United States,Las Vegas,Arena,US DOLLAR,"Bill Graham Presents, Evening Star Productions",Pop / Rock,10804.0,447470.0,13366.0,80%,30.00,65.00,41.42


In [33]:
cat_cols = ["Headliner","Support","Venue","City","State","Country","Market",
            "Company Type","Currency","Promoter","Genre"]
for col in cat_cols:
    n_unique = df[col].nunique()
    sample_vals = df[col].unique()[:5]  # just show first 5 unique examples
    print(f"{col}: {n_unique} unique values. Example values: {sample_vals}")

Headliner: 28719 unique values. Example values: ['Creed' '$uicideboy$' 'Meghan Trainor' 'Cody Johnson' 'Chad Perrone']
Support: 87186 unique values. Example values: ['3 Doors Down, Finger Eleven'
 'Denzel Curry, Pouya, EKKSTACY, Shakewell, Haarper'
 'Daughtry, Finger Eleven' 'Paul Russell' 'Wade Bowen, Ashland Craft']
Venue: 5359 unique values. Example values: ['Coastal Credit Union Music Park at Walnut Creek'
 'The Cynthia Woods Mitchell Pavilion' 'Frost Bank Center'
 'Dos Equis Pavilion' 'Paycom Center']
City: 1320 unique values. Example values: ['Raleigh' 'The Woodlands' 'San Antonio' 'Dallas' 'Oklahoma City']
State: 51 unique values. Example values: ['North Carolina' 'Texas' 'Oklahoma' 'Arkansas' 'Tennessee']
Country: 1 unique values. Example values: ['United States']
Market: 208 unique values. Example values: ['Raleigh-Durham (Fayetteville)\xa0' 'Houston' 'San Antonio'
 'Dallas-Ft. Worth' 'Oklahoma City']
Company Type: 41 unique values. Example values: ['Amphitheatre' 'Arena' 'Clu

In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

target_col = "Avg. Gross USD"

#------------------------------------------------------------------------------
# 2) BASIC CLEANING/PREPROCESSING
#------------------------------------------------------------------------------
# Drop rows with missing target
df = df.dropna(subset=[target_col])

# Example: label-encode some categorical columns.
label_encoders = {}

# Fit and store label encoders using the original training data
cat_cols = ["Headliner","Support","Venue","City","State","Market",
            "Company Type","Currency","Promoter","Genre"]
for col in cat_cols:
    df[col] = df[col].astype(str)
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])  # Fit on df and transform
    label_encoders[col] = encoder  # Store the fitted encoder

#------------------------------------------------------------------------------
# 3) SEPARATE FEATURES AND TARGET
#------------------------------------------------------------------------------
X = df.drop(columns=[target_col, "Event Date", "Country", "Avg. Tickets Sold", "Avg. Capacity Sold", "Ticket Price Avg. USD"])
y = df[target_col].values

#------------------------------------------------------------------------------
# 4) TRAIN-TEST SPLIT
#------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#------------------------------------------------------------------------------
# 5) SCALE THE FEATURES
#------------------------------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

#------------------------------------------------------------------------------
# 6) RESHAPE FOR 1D CNN
#------------------------------------------------------------------------------
# In PyTorch, we'll shape data as (batch_size, num_features, 1).
# But inside the CNN, we’ll do a permutation to (batch_size, 1, num_features)
num_features = X_train_scaled.shape[1]

# Convert to numpy float32 (for PyTorch compatibility)
X_train_scaled = X_train_scaled.astype(np.float32)
X_test_scaled  = X_test_scaled.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test  = y_test.astype(np.float32)

In [35]:
import torch
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create Dataset objects
train_dataset = TabularDataset(X_train_scaled, y_train)
test_dataset  = TabularDataset(X_test_scaled,  y_test)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=64, shuffle=False)

In [36]:
import torch.nn as nn
import torch.nn.functional as F

class CNNRegressor(nn.Module):
    def __init__(self, num_features):
        super(CNNRegressor, self).__init__()

        # Convolution layers
        self.conv1 = nn.Conv1d(in_channels=1,  out_channels=32, kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2)

        # After 2 convs (kernel_size=2 each), the "length" dimension is (num_features-2).
        # The channel dimension is 64. So the flattened size = 64 * (num_features - 2).
        flattened_size = 64 * (num_features - 2)

        # Fully connected layers
        self.fc1 = nn.Linear(flattened_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)

        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        # x shape = (batch_size, num_features) in our dataset
        #  → we reshape to (batch_size, num_features, 1)
        #  → but for Conv1D, we want (batch_size, channels=1, length=num_features)
        # Let's just insert a channel dimension and reorder:
        x = x.unsqueeze(-1)             # (batch_size, num_features, 1)
        x = x.permute(0, 2, 1)         # (batch_size, 1, num_features)

        x = F.relu(self.conv1(x))      # (batch_size, 32, num_features - 1)
        x = F.relu(self.conv2(x))      # (batch_size, 64, num_features - 2)

        x = x.view(x.size(0), -1)      # flatten to (batch_size, 64*(num_features-2))

        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.out(x)  # shape = (batch_size, 1)
        return x

In [37]:
import torch.optim as optim

# Instantiate the model
model = CNNRegressor(num_features=num_features)

# Choose device (CPU or GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Training hyperparameters
epochs = 100

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for batch_X, batch_y in train_loader:
        # Move data to device
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Forward pass
        preds = model(batch_X).squeeze()  # shape: (batch_size,)
        loss  = criterion(preds, batch_y)

        # Backward + update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{epochs}], MSE Loss: {epoch_loss:.4f}")

Epoch [1/100], MSE Loss: 209939298315.1517
Epoch [2/100], MSE Loss: 129796781735.2755
Epoch [3/100], MSE Loss: 121490693065.4154
Epoch [4/100], MSE Loss: 108631259169.6507
Epoch [5/100], MSE Loss: 107885611147.0050
Epoch [6/100], MSE Loss: 103105089178.7543
Epoch [7/100], MSE Loss: 102891868296.4616
Epoch [8/100], MSE Loss: 98338221297.4245
Epoch [9/100], MSE Loss: 97376627807.6698
Epoch [10/100], MSE Loss: 84883946332.5885
Epoch [11/100], MSE Loss: 86948140168.7551
Epoch [12/100], MSE Loss: 90026927111.1410
Epoch [13/100], MSE Loss: 85437449885.4444
Epoch [14/100], MSE Loss: 84254255143.3244
Epoch [15/100], MSE Loss: 85395226186.0512
Epoch [16/100], MSE Loss: 79004857004.6068
Epoch [17/100], MSE Loss: 83111231169.2961
Epoch [18/100], MSE Loss: 77895661050.6687
Epoch [19/100], MSE Loss: 77319373950.1903
Epoch [20/100], MSE Loss: 74183672376.6389
Epoch [21/100], MSE Loss: 76506973808.9354
Epoch [22/100], MSE Loss: 72753213028.6099
Epoch [23/100], MSE Loss: 66285551876.6465
Epoch [24/100

In [38]:
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        preds = model(batch_X).squeeze()  # (batch_size,)
        all_preds.append(preds.cpu().numpy())
        all_targets.append(batch_y.numpy())

# Concatenate
all_preds = np.concatenate(all_preds)
all_targets = np.concatenate(all_targets)

mae = mean_absolute_error(all_targets, all_preds)
mse = mean_squared_error(all_targets, all_preds)
rmse = np.sqrt(mse)
r2 = r2_score(all_targets, all_preds)

print(f"Test MAE  : {mae:.2f}")
print(f"Test RMSE : {rmse:.2f}")
print(f"Test R^2  : {r2:.4f}")

Test MAE  : 51959.92
Test RMSE : 335824.41
Test R^2  : 0.7281


In [39]:
df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/final_combined_data.csv')
df2 = df2.dropna()
dollar_columns = ['Avg. Gross USD', 'Ticket Price Min USD', 'Ticket Price Max USD', 'Ticket Price Avg. USD']

# Convert dollar columns to numeric (removing '$' and commas)
for col in dollar_columns:
    df2[col] = df2[col].replace('[\$,]', '', regex=True).astype(float).astype(int)
df2 = df2[
    (df2['Genre'] != 'Family Entertainment') &
    (df2['Ticket Price Min USD'] > 0) &
    (df2['Ticket Price Min USD'] < df2['Ticket Price Max USD'])
]
df2

Unnamed: 0,Event Date,Number of Shows,Headliner,Support,Venue,City,State,Country,Market,Company Type,Currency,Promoter,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min USD,Ticket Price Max USD,Ticket Price Avg. USD
0,2025-01-11,1,Buddha Trixie,"Sports Coach, Herr God.",McMenamins Mission Theater,Portland,Oregon,United States,"Portland, OR",Auditorium / Theatre,US DOLLAR,McMenamins Presents,Pop / Rock,131,2454,290,45%,18,20,18
7,2024-12-31,1,Billy Joel,Jason Bonham’s Led Zeppelin Evening,UBS Arena,Belmont Park,New York,United States,New York,Arena,US DOLLAR,(In-House Promotion),Pop / Rock,16597,3780358,16597,100%,69,499,227
12,2024-12-27,1,Friko,"OK Cool, The Courts",Thalia Hall,Chicago,Illinois,United States,Chicago,Club,US DOLLAR,(In-House Promotion),Pop / Rock,941,22141,941,100%,23,40,23
13,2024-12-27,1,Adam Ezra Group,Sirsy,Towne Crier Cafe,Beacon,New York,United States,New York,Club,US DOLLAR,(In-House Promotion),Pop / Rock,156,5540,180,86%,35,40,35
21,2024-12-18,1,Suki Waterhouse,Bully,The Salt Shed,Chicago,Illinois,United States,Chicago,Auditorium / Theatre,US DOLLAR,"(In-House Promotion), 16 On Center",Pop / Rock,3326,141424,3326,100%,37,99,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5489,2024-09-01,1,Gogol Bordello,"Puzzled Panther, Crazy And The Brains",The Orange Peel,Asheville,North Carolina,United States,Charlotte,Club,US DOLLAR,(In-House Promotion),Pop / Rock,1050,33000,1050,100%,32,35,31
5490,2024-09-01,1,Aaron Frazer,The Tailspins,The Crescent Ballroom,Phoenix,Arizona,United States,Phoenix (Prescott),Club,US DOLLAR,Live Nation,Pop / Rock,510,13425,513,99%,25,38,26
5492,2024-09-01,1,Tems,Naomi Sharon,The Anthem,Washington,District of Columbia,United States,"Washington, D.C. (Hagerstown)",Auditorium / Theatre,US DOLLAR,Live Nation,Pop / Rock,6000,291440,6000,100%,45,75,48
5493,2024-09-01,1,The Beach Boys,John Stamos,The Rady Shell at Jacobs Park,San Diego,California,United States,San Diego,Amphitheatre,US DOLLAR,(In-House Promotion),Pop / Rock,4237,432680,4366,97%,10,225,102


In [40]:
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

def transform_with_unknowns(encoder, series, unknown_label="UNK"):
    """
    Replaces any category not seen in encoder.classes_ with `unknown_label`.
    Then uses the (already-fitted) encoder to transform the series.
    """
    known_labels = set(encoder.classes_)
    safe_values = [val if val in known_labels else unknown_label for val in series]
    return encoder.transform(safe_values)

# Suppose these were your categorical columns during training
cat_cols = [
    "Headliner","Support","Venue","City","State","Country","Market",
    "Company Type","Currency","Promoter","Genre"
]

# Make a copy so we don't alter df2 in-place
df2_processed = df2.copy()

# 1) Apply the SAME label encoders (i.e., from training phase)
for col in cat_cols:
    df2_processed[col] = df2_processed[col].astype(str)
    try:
        # First try the normal transform
        df2_processed[col] = label_encoders[col].transform(df2_processed[col])
    except ValueError as e:
        # Catch the "unseen labels" error
        if "previously unseen labels" in str(e):
            # Replace unseen with "UNK" (or however you want to handle)
            df2_processed[col] = transform_with_unknowns(label_encoders[col],
                                                         df2_processed[col],
                                                         unknown_label="UNK")
        else:
            # It's some other error, so re-raise
            raise e

# 2) Drop columns that were not used as features in training
target_col = "Avg. Gross USD"
drop_cols = [
    col for col in [
        target_col, "Event Date", "Country", "Avg. Tickets Sold",
        "Avg. Capacity Sold", "Ticket Price Avg. USD"
    ]
    if col in df2_processed.columns
]
df2_processed.drop(columns=drop_cols, inplace=True)

# 3) Scale numeric features using the SAME scaler
X2 = scaler.transform(df2_processed.values.astype(np.float32))

# Convert to PyTorch tensor
X2_tensor = torch.from_numpy(X2).to(device)

ValueError: y contains previously unseen labels: 'UNK'