In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer, root_mean_squared_error

In [12]:
df = pd.read_csv("Pollstar_all_genres.csv")

df = df[
    (df['Genre'] != 'Family Entertainment') &
    (df['Ticket Price Min USD'] > 0) &
    (df['Ticket Price Min USD'] < df['Ticket Price Max USD'])
].dropna()

# add time info
df['Event Date'] = pd.to_datetime(df['Event Date'])
df['Year'] = df['Event Date'].dt.year
df['Month'] = df['Event Date'].dt.month
df['Weekday'] = df['Event Date'].dt.weekday
df['Is_Weekend'] = df['Weekday'].apply(lambda x: 1 if x >= 5 else 0)

# Remove the extremely high values (top 0.5%)
threshold = df["Avg. Gross USD"].quantile(0.995)
df = df[df["Avg. Gross USD"] < threshold]

target = "Avg. Gross USD"
drop_cols = ["Event Date", "Country", "Avg. Tickets Sold", "Avg. Capacity Sold", "Ticket Price Avg. USD"]
df = df.drop(columns=drop_cols)

cat_cols = df.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in df.columns if col not in cat_cols + [target]]

# categorical variables Label encoding 
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Target variable log1p conversion
y = np.log1p(df[target].values.astype(np.float32))

# Standardization
scaler = StandardScaler()
X_num = scaler.fit_transform(df[num_cols].values.astype(np.float32))

# classification features keep same
X_cat = df[cat_cols].values.astype(np.int64)

X_num_train, X_num_test, X_cat_train, X_cat_test, y_train, y_test = train_test_split(
    X_num, X_cat, y, test_size=0.2, random_state=42
)

# transfer to tensor
X_num_train = torch.tensor(X_num_train)
X_cat_train = torch.tensor(X_cat_train)
y_train = torch.tensor(y_train).unsqueeze(1)

X_num_test = torch.tensor(X_num_test)
X_cat_test = torch.tensor(X_cat_test)
y_test = torch.tensor(y_test).unsqueeze(1)

train_dataset = TensorDataset(X_num_train, X_cat_train, y_train)
test_dataset = TensorDataset(X_num_test, X_cat_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512)

class TabTransformer(nn.Module):
    def __init__(self, num_numeric, cat_dims, embed_dim=16):
        super().__init__()
        self.cat_embeds = nn.ModuleList([
            nn.Embedding(dim, embed_dim) for dim in cat_dims
        ])
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_dim, nhead=4, dim_feedforward=128, dropout=0.1, activation='relu', batch_first=True
            ), num_layers=2
        )
        self.numeric_proj = nn.Linear(num_numeric, embed_dim)
        self.regressor = nn.Sequential(
            nn.Linear(embed_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x_num, x_cat):
        embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embeds)]
        x_cat_embed = torch.stack(embeds, dim=1)  # (B, T, D)
        x_cat_trans = self.transformer(x_cat_embed).mean(dim=1)  # (B, D)

        x_num_proj = self.numeric_proj(x_num)
        x_combined = torch.cat([x_num_proj, x_cat_trans], dim=1)
        return self.regressor(x_combined)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = TabTransformer(
    num_numeric=X_num.shape[1],
    cat_dims=[df[col].nunique() for col in cat_cols]
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(1, 101):
    model.train()
    losses = []
    for x_num, x_cat, y in train_loader:
        x_num, x_cat, y = x_num.to(device), x_cat.to(device), y.to(device)
        preds = model(x_num, x_cat)
        loss = loss_fn(preds, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"Epoch {epoch}/100, Loss: {np.mean(losses):.4f}")

model.eval()
with torch.no_grad():
    preds_list = []
    y_list = []
    for x_num, x_cat, y in test_loader:
        x_num, x_cat = x_num.to(device), x_cat.to(device)
        preds = model(x_num, x_cat).cpu().numpy()
        preds_list.append(preds)
        y_list.append(y.numpy())

preds = np.expm1(np.vstack(preds_list))
y_true = np.expm1(np.vstack(y_list))

print("\n TabTransformer Evaluation:")
print("MAE:", mean_absolute_error(y_true, preds))
print("RMSE:", root_mean_squared_error(y_true, preds))
print("R²:", r2_score(y_true, preds))

Epoch 1/100, Loss: 10.9685
Epoch 2/100, Loss: 0.9818
Epoch 3/100, Loss: 0.4806
Epoch 4/100, Loss: 0.3289
Epoch 5/100, Loss: 0.2821
Epoch 6/100, Loss: 0.2568
Epoch 7/100, Loss: 0.2360
Epoch 8/100, Loss: 0.2210
Epoch 9/100, Loss: 0.2139
Epoch 10/100, Loss: 0.1980
Epoch 11/100, Loss: 0.1872
Epoch 12/100, Loss: 0.1684
Epoch 13/100, Loss: 0.1653
Epoch 14/100, Loss: 0.1492
Epoch 15/100, Loss: 0.1394
Epoch 16/100, Loss: 0.1507
Epoch 17/100, Loss: 0.1368
Epoch 18/100, Loss: 0.1365
Epoch 19/100, Loss: 0.1245
Epoch 20/100, Loss: 0.1236
Epoch 21/100, Loss: 0.1109
Epoch 22/100, Loss: 0.0993
Epoch 23/100, Loss: 0.0934
Epoch 24/100, Loss: 0.0922
Epoch 25/100, Loss: 0.0943
Epoch 26/100, Loss: 0.0875
Epoch 27/100, Loss: 0.0885
Epoch 28/100, Loss: 0.0791
Epoch 29/100, Loss: 0.0792
Epoch 30/100, Loss: 0.0742
Epoch 31/100, Loss: 0.0808
Epoch 32/100, Loss: 0.0733
Epoch 33/100, Loss: 0.0657
Epoch 34/100, Loss: 0.0619
Epoch 35/100, Loss: 0.0616
Epoch 36/100, Loss: 0.0646
Epoch 37/100, Loss: 0.0587
Epoch 38/