##### Install CatBoost

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 10.2 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.8


# **Regular CatBoost**

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the combined dataset from Drive; later cells reuse `df`.
df = pd.read_csv("/content/drive/MyDrive/A Round Ent/combined_df.csv")

# Keep numeric columns with no missing values, then remove the listed
# ticket/capacity/price columns when they exist (errors='ignore' skips absent ones).
df = (
    df.select_dtypes(['number'])
    .dropna()
    .drop(columns=["Avg. Tickets Sold", "Avg. Capacity Sold", "Ticket Price Avg. USD"], errors='ignore')
    .dropna(subset=["Avg. Gross USD"])
)

# Target is "Avg. Gross USD"; every remaining column is a feature.
y = df["Avg. Gross USD"]
X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
X_scaled = StandardScaler().fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_model = CatBoostRegressor(verbose=0, random_state=42)

# One entry per fold for each metric.
mae_scores = []
rmse_scores = []
r2_scores = []

for fit_rows, eval_rows in kf.split(X_scaled):
    y_true = y.iloc[eval_rows]

    cat_model.fit(X_scaled[fit_rows], y.iloc[fit_rows])
    y_pred = cat_model.predict(X_scaled[eval_rows])

    mae_scores.append(mean_absolute_error(y_true, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2_scores.append(r2_score(y_true, y_pred))

# Report the mean of each metric across the five folds.
print("Average MAE (CatBoost):", np.mean(mae_scores))
print("Average RMSE (CatBoost):", np.mean(rmse_scores))
print("Average R² (CatBoost):", np.mean(r2_scores))

  df = pd.read_csv("/content/drive/MyDrive/A Round Ent/combined_df.csv")


Average MAE (CatBoost): 44705.08564106424
Average RMSE (CatBoost): 143837.66405476135
Average R² (CatBoost): 0.9407529576260487


# **Ensemble: CatBoost, RandomForest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Rebuild the feature matrix and target from the cleaned frame.
y = df["Avg. Gross USD"]
X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
X_scaled = StandardScaler().fit_transform(X)

# Two base learners whose predictions are averaged with equal weight.
cat_params = {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
cat_model = CatBoostRegressor(**cat_params, verbose=0, random_state=42)
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = []
rmse_scores = []
r2_scores = []

for fit_rows, eval_rows in kf.split(X_scaled):
    X_train, y_train = X_scaled[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X_scaled[eval_rows], y.iloc[eval_rows]

    # Both models are refitted on this fold's training rows.
    cat_model.fit(X_train, y_train)
    rf_model.fit(X_train, y_train)

    # Unweighted mean of the two prediction vectors.
    ensemble_preds = 0.5 * (cat_model.predict(X_test) + rf_model.predict(X_test))

    mae_scores.append(mean_absolute_error(y_test, ensemble_preds))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, ensemble_preds)))
    r2_scores.append(r2_score(y_test, ensemble_preds))

print("Average MAE (Ensemble):", np.mean(mae_scores))
print("Average RMSE (Ensemble):", np.mean(rmse_scores))
print("Average R² (Ensemble):", np.mean(r2_scores))

Average MAE (Ensemble): 44767.931115351974
Average RMSE (Ensemble): 145760.7985403009
Average R² (Ensemble): 0.939873014414432


# **Ensemble: CatBoost, GradientBoost, RandomForest**

In [None]:
# Imported here so the cell runs top-to-bottom on a fresh kernel: without it,
# GradientBoostingRegressor is only imported by a later cell and this cell
# fails with a NameError.
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
y = df["Avg. Gross USD"]
X_scaled = StandardScaler().fit_transform(X)

cat = CatBoostRegressor(depth=6, iterations=500, l2_leaf_reg=5, learning_rate=0.1, verbose=0, random_state=42)
rf = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)

# Blend weights for the three models.
CAT_WEIGHT, RF_WEIGHT, GB_WEIGHT = 0.6, 0.2, 0.2

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores, rmse_scores, r2_scores = [], [], []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit every base model on this fold's training rows.
    cat.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    gb.fit(X_train, y_train)

    # Weighted average of the three prediction vectors.
    y_pred = (
        CAT_WEIGHT * cat.predict(X_test)
        + RF_WEIGHT * rf.predict(X_test)
        + GB_WEIGHT * gb.predict(X_test)
    )

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_scores.append(r2_score(y_test, y_pred))

print("Weighted Average Ensemble Results:")
print("MAE:", np.mean(mae_scores))
print("RMSE:", np.mean(rmse_scores))
print("R²:", np.mean(r2_scores))

Weighted Average Ensemble Results:
MAE: 45075.63469559883
RMSE: 142275.95747059822
R²: 0.9427272127820097


# **Ensemble: CatBoost, GradientBoost, Voting Regressor, Parameter Tuning**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import KFold, GridSearchCV

y = df["Avg. Gross USD"]
X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
X_scaled = StandardScaler().fit_transform(X)

# Hyper-parameter search for CatBoost (2 x 2 x 2 x 1 = 8 candidates, 3-fold CV).
# NOTE(review): the search is fitted on every row, including rows that later
# serve as test folds below, so the reported ensemble metrics may be optimistic.
param_grid = {
    'depth': [6, 8],
    'learning_rate': [0.05, 0.1],
    'l2_leaf_reg': [3, 5],
    'iterations': [500]
}
cat_model = CatBoostRegressor(verbose=0, random_state=42)
grid_search = GridSearchCV(
    cat_model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_scaled, y)
best_cat = grid_search.best_estimator_

# Voting ensemble of the tuned CatBoost model and a default gradient booster.
gbr = GradientBoostingRegressor(random_state=42)
ensemble = VotingRegressor([('cat', best_cat), ('gbr', gbr)])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []
rmse_scores = []
r2_scores = []

for fit_rows, eval_rows in kf.split(X_scaled):
    y_true = y.iloc[eval_rows]

    ensemble.fit(X_scaled[fit_rows], y.iloc[fit_rows])
    y_pred = ensemble.predict(X_scaled[eval_rows])

    mae_scores.append(mean_absolute_error(y_true, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2_scores.append(r2_score(y_true, y_pred))

print("Best CatBoost Params:", grid_search.best_params_)
print("Average MAE (Ensemble):", np.mean(mae_scores))
print("Average RMSE (Ensemble):", np.mean(rmse_scores))
print("Average R² (Ensemble):", np.mean(r2_scores))

  df = pd.read_csv("/content/drive/MyDrive/A Round Ent/combined_df.csv")


Best CatBoost Params: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Average MAE (Ensemble): 48103.96145619062
Average RMSE (Ensemble): 143105.75343200954
Average R² (Ensemble): 0.9423975526548748


In [None]:
from sklearn.linear_model import Ridge
# GradientBoostingRegressor is imported here as well so this cell does not
# depend on an earlier cell having been run.
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor

X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
y = df["Avg. Gross USD"]
X_scaled = StandardScaler().fit_transform(X)

# Base models
cat_params = {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
cat_model = CatBoostRegressor(**cat_params, verbose=0, random_state=42)
ridge_model = Ridge(alpha=1.0, random_state=42)

# Meta-model that combines the base models' predictions
meta_model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)

# Stacking regressor: passthrough=True also feeds the original features to the
# meta-model, and cv=5 sets the internal folds used to build its training inputs.
stacking_model = StackingRegressor(
    estimators=[
        ('catboost', cat_model),
        ('ridge', ridge_model)
    ],
    final_estimator=meta_model,
    passthrough=True,
    cv=5
)

# Evaluate using KFold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores, rmse_scores, r2_scores = [], [], []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    stacking_model.fit(X_train, y_train)
    y_pred = stacking_model.predict(X_test)

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_scores.append(r2_score(y_test, y_pred))

# Print average metrics (one statement per line; the RMSE and R² prints were
# previously fused on a single line, which is a SyntaxError).
print("Average MAE (Stacked Ensemble):", np.mean(mae_scores))
print("Average RMSE (Stacked Ensemble):", np.mean(rmse_scores))
print("Average R² (Stacked Ensemble):", np.mean(r2_scores))

Average MAE (Stacked Ensemble): 50859.8929662181
Average RMSE (Stacked Ensemble): 159338.51671106013
Average R² (Stacked Ensemble): 0.9284766371698417


# **Ensemble: CatBoost, GradientBoost, Voting Regressor, RandomForest**

In [None]:
# Rebuild the feature matrix and target from the cleaned frame.
y = df["Avg. Gross USD"]
X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
X_scaled = StandardScaler().fit_transform(X)

# Three base regressors combined by an (unweighted) VotingRegressor.
cat_params = {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
cat_model = CatBoostRegressor(**cat_params, verbose=0, random_state=42)
rf_model = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)

members = [
    ('cat', cat_model),
    ('rf', rf_model),
    ('gb', gb_model),
]
voting_ensemble = VotingRegressor(members)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores = []
rmse_scores = []
r2_scores = []

for fit_rows, eval_rows in kf.split(X_scaled):
    y_test = y.iloc[eval_rows]

    # Fit on the training rows of this fold, score on the held-out rows.
    voting_ensemble.fit(X_scaled[fit_rows], y.iloc[fit_rows])
    y_pred = voting_ensemble.predict(X_scaled[eval_rows])

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_scores.append(r2_score(y_test, y_pred))

print("Average MAE (Voting Ensemble):", np.mean(mae_scores))
print("Average RMSE (Voting Ensemble):", np.mean(rmse_scores))
print("Average R² (Voting Ensemble):", np.mean(r2_scores))

Average MAE (Voting Ensemble): 45670.25084503471
Average RMSE (Voting Ensemble): 144232.56207094147
Average R² (Voting Ensemble): 0.941346230077782


# **Bagging**

In [None]:
from sklearn.ensemble import BaggingRegressor

X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
y = df["Avg. Gross USD"]
X_scaled = StandardScaler().fit_transform(X)

cat_params = {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
base_cat = CatBoostRegressor(**cat_params, verbose=0, random_state=42)

# Pass the CatBoost model explicitly. Without `estimator=`, BaggingRegressor
# bags its default decision tree and `base_cat` is never used.
bagging_model = BaggingRegressor(estimator=base_cat, n_estimators=10, random_state=42, n_jobs=-1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores, rmse_scores, r2_scores = [], [], []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Each fit trains 10 bootstrap copies of the CatBoost model.
    bagging_model.fit(X_train, y_train)
    y_pred = bagging_model.predict(X_test)

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_scores.append(r2_score(y_test, y_pred))

print("Bagging Regressor Results:")
print("MAE:", np.mean(mae_scores))
print("RMSE:", np.mean(rmse_scores))
print("R²:", np.mean(r2_scores))

Bagging Regressor Results:
MAE: 48968.188651607066
RMSE: 165306.668473105
R²: 0.9231272649646118


# **Ensemble + Meta Models: CatBoost, GradientBoost, RandomForest**

In [None]:
from sklearn.model_selection import cross_val_predict

X = pd.get_dummies(df.drop(columns=["Avg. Gross USD"]), drop_first=True)
y = df["Avg. Gross USD"]
X_scaled = StandardScaler().fit_transform(X)

cat = CatBoostRegressor(depth=6, iterations=500, l2_leaf_reg=5, learning_rate=0.1, verbose=0, random_state=42)
rf = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=42)
meta_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Inner splitter used only inside each training fold to produce out-of-fold
# base-model predictions for the meta-model.
inner_kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores, rmse_scores, r2_scores = [], [], []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    # Train the meta-model on out-of-fold base predictions from the TRAINING
    # rows only. Fitting it on the validation fold and then scoring it on the
    # same rows (as this cell did before) reports training-set metrics.
    train_meta_features = np.column_stack((
        cross_val_predict(cat, X_train, y_train, cv=inner_kf),
        cross_val_predict(rf, X_train, y_train, cv=inner_kf),
    ))
    meta_model.fit(train_meta_features, y_train)

    # Refit the base models on the full training fold for validation-time predictions.
    cat.fit(X_train, y_train)
    rf.fit(X_train, y_train)

    # Meta features for the held-out rows, which the meta-model has not seen.
    cat_preds = cat.predict(X_val)
    rf_preds = rf.predict(X_val)
    val_meta_features = np.column_stack((cat_preds, rf_preds))

    final_preds = meta_model.predict(val_meta_features)

    mae_scores.append(mean_absolute_error(y_val, final_preds))
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, final_preds)))
    r2_scores.append(r2_score(y_val, final_preds))

print("Blending Ensemble with CV Results:")
print("Average MAE:", np.mean(mae_scores))
print("Average RMSE:", np.mean(rmse_scores))
print("Average R²:", np.mean(r2_scores))

Blending Ensemble with CV Results:
Average MAE: 30408.678646922705
Average RMSE: 70089.12977572375
Average R²: 0.9861268718908203
