In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used by the loading
# cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the combined events/artist/demographics CSV on the mounted Drive
dataset_path = '/content/drive/MyDrive/combined_df.csv'

# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning on this file
# and gives each column one consistent dtype.
df = pd.read_csv(dataset_path, low_memory=False)

# Subset of rows from 2020 onwards whose Headliner contains a double-quote
# character, that have a Support act, and that are not 'Family Entertainment'.
# NOTE(review): train_df is not referenced by any later cell visible here - confirm
# whether it is still needed.
train_df = df[
        (df['Year'] >= 2020) &
        (df['Headliner'].str.contains('"', na=False)) &
        (~df['Support'].isna()) &
        (df['Genre'] != 'Family Entertainment')
    ]
print("First 5 rows of the dataset:")
print(df.head())



  df = pd.read_csv(dataset_path)


First 5 rows of the dataset:
   Event Date                              Headliner  \
0  2024-09-18                                  Creed   
1  2024-09-14                                  Creed   
2  2024-09-13  Bruce Springsteen & The E Street Band   
3  2024-09-13                                  Creed   
4  2024-09-13                Billy Joel, Rod Stewart   

                          sp artist_name  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                                     sp artist_genre  sp followers  \
0  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
1  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
2  ['heartland rock', 'mellow gold', 'permanent w...     6567386.0   
3  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
4  ['album rock', '

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns
summary_stats = df.describe()
print("\nSummary statistics of the dataset:")
print(summary_stats)


Summary statistics of the dataset:
       sp followers  sp popularity  yt View Count  yt Subscriber Count  \
count  3.245600e+04   32456.000000   3.210200e+04         3.210200e+04   
mean   2.498679e+06      51.025357   7.947109e+08         1.368386e+06   
std    9.005023e+06      20.372168   3.018964e+09         5.633076e+06   
min    0.000000e+00       0.000000   0.000000e+00         0.000000e+00   
25%    6.217250e+04      40.000000   2.319628e+06         7.340000e+03   
50%    3.271130e+05      53.000000   3.686581e+07         7.660000e+04   
75%    1.490683e+06      65.000000   3.129296e+08         4.710000e+05   
max    1.236853e+08     100.000000   6.104600e+10         3.210000e+08   

       yt Video Count  Total population  Under 5 years population  \
count    32102.000000      3.249400e+04              32494.000000   
mean       255.025263      8.631460e+05              50734.760448   
std       1892.338983      1.569479e+06              95171.317352   
min          0.000000

In [None]:
# Per-column null counts, plus the same counts as a percentage of all rows
n_rows = len(df)
missing_values = df.isnull().sum()
missing_percentage = (missing_values / n_rows) * 100

# One table with both measures, worst-affected columns first
missing_report = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentage}
).sort_values(by='Percentage', ascending=False)

print("Missing Values and Percentage:")
print(missing_report)


Missing Values and Percentage:
                                 Missing Values  Percentage
yt Description                           682831   96.495758
yt Published At                          675526   95.463436
yt name                                  675526   95.463436
yt Channel ID                            675526   95.463436
yt Title                                 675526   95.463436
yt View Count                            675526   95.463436
yt Subscriber Count                      675526   95.463436
yt Video Count                           675526   95.463436
sp artist_name                           675172   95.413409
sp artist_genre                          675172   95.413409
sp followers                             675172   95.413409
sp popularity                            675172   95.413409
65 to 74 years population                675134   95.408039
55 to 59 years population                675134   95.408039
60 to 64 years population                675134   95.408039
75 to 84 

Handling Missing Values

In [None]:
from geopy.geocoders import Nominatim

# Define a function to infer 'State' based on 'City'

# Successful lookups, keyed by city, so each distinct city is geocoded at most once.
state_cache = {}
# Geocoder client, created on first use and then shared by every call.
state_geolocator = None


def fill_state(city):
    """Infer a state name for ``city`` via the Nominatim geocoder.

    Parameters:
    - city: city name taken from the 'City' column; may be null or "Unknown".

    Returns:
    - str: the third-from-last comma-separated component of the geocoded
      address, or "Unknown" when the city is null/"Unknown", nothing is found,
      the address has fewer than three components, or the lookup fails.

    Failed lookups are not cached, so a later call can retry them.
    """
    global state_geolocator

    if pd.isnull(city) or city == "Unknown":
        return "Unknown"
    if city in state_cache:
        return state_cache[city]

    try:
        if state_geolocator is None:
            # NOTE(review): "geoapiExercises" is a widely copied sample user agent;
            # consider an application-specific one - confirm against Nominatim policy.
            state_geolocator = Nominatim(user_agent="geoapiExercises")
        # Attempt to fetch the location information using geopy
        location = state_geolocator.geocode(city, timeout=10)
    except Exception:
        # Best effort: a failed lookup yields 'Unknown'. Catching Exception rather
        # than using a bare except keeps Ctrl-C / kernel interrupt working.
        return "Unknown"

    state = "Unknown"
    if location:
        # Heuristic: assumes the address ends in "..., <state>, <x>, <country>"
        # - TODO confirm, since Nominatim address layouts vary.
        parts = [part.strip() for part in location.address.split(',')]
        if len(parts) >= 3:
            state = parts[-3]

    state_cache[city] = state
    return state

# Handle missing values in the 'State' column using the 'fill_state' function.
# fill_state is called once per distinct city among the rows that lack a state,
# rather than once per row, so the geocoder is not queried repeatedly for the
# same city.
state_missing = df['State'].isnull()
cities_without_state = df.loc[state_missing, 'City']
state_by_city = {city: fill_state(city) for city in cities_without_state.dropna().unique()}
# Rows whose city is itself null map to NaN here and become "Unknown", which is
# what fill_state returns for a null city.
df.loc[state_missing, 'State'] = cities_without_state.map(state_by_city).fillna("Unknown").values

# Print to check remaining missing values in the 'State' column
print(f"Remaining missing values in 'State': {df['State'].isnull().sum()}")

# Fill missing values in categorical columns with 'Missing'
categorical_cols = ['Support', 'Market', 'Genre', 'Promoter', 'Company Type', 'Venue']
for col in categorical_cols:
    df[col] = df[col].fillna("Missing")

# Fill missing values in numerical columns with mean
df['Ticket Price Avg. USD'] = df['Ticket Price Avg. USD'].fillna(df['Ticket Price Avg. USD'].mean())
df['Avg. Event Capacity'] = df['Avg. Event Capacity'].fillna(df['Avg. Event Capacity'].mean())

# Check if any missing values remain in the dataset
print("Remaining missing values in the dataset:")
print(df.isnull().sum())



Remaining missing values in 'State': 0
Remaining missing values in the dataset:
Event Date                              0
Headliner                               0
sp artist_name                     675172
sp artist_genre                    675172
sp followers                       675172
sp popularity                      675172
yt name                            675526
yt Channel ID                      675526
yt Title                           675526
yt Description                     682831
yt Published At                    675526
yt View Count                      675526
yt Subscriber Count                675526
yt Video Count                     675526
Total population                   675134
Under 5 years population           675134
5 to 9 years population            675134
10 to 14 years population          675134
15 to 19 years population          675134
20 to 24 years population          675134
25 to 34 years population          675134
35 to 44 years population          675

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Convert 'Event Date' to datetime format and extract features
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Feature engineering: extract useful time-related features
df['Year'] = df['Event Date'].dt.year
df['Month'] = df['Event Date'].dt.month
df['Day'] = df['Event Date'].dt.day
df['Day_of_Week'] = df['Event Date'].dt.dayofweek  # 0 = Monday, 6 = Sunday
df['Day_of_Year'] = df['Event Date'].dt.dayofyear
df['Is_Weekend'] = (df['Day_of_Week'] >= 5).astype(int)  # 1 for Saturday/Sunday

# Create interaction features between price and capacity
df['Price_Range'] = df['Ticket Price Max USD'] - df['Ticket Price Min USD']
df['Gross_Per_Capacity'] = df['Avg. Gross USD'] / df['Avg. Event Capacity']

# Handle infinite or NaN values created during feature engineering
# (a zero capacity produces +/-inf, a missing operand produces NaN)
df['Gross_Per_Capacity'] = df['Gross_Per_Capacity'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Scale/normalize numerical features
scaler = StandardScaler()

# Select numerical columns for normalization.
# 'Avg. Gross USD' is standardized here as well, so any error metric computed
# on it afterwards is in standardized units rather than dollars.
numerical_features = [
    'Avg. Gross USD',
    'Ticket Price Avg. USD',
    'Avg. Event Capacity',
    'Price_Range',
    'Gross_Per_Capacity'
]

# Normalize the numerical features
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Drop the original 'Event Date' column after processing, together with a
# lowercase 'day_of_week' column if the CSV supplied one. This cell itself only
# creates 'Day_of_Week', so errors='ignore' prevents a KeyError when the
# lowercase column is absent.
df = df.drop(columns=['Event Date', 'day_of_week'], errors='ignore')
# Final check: View the processed data
print(df.head())


                               Headliner  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                          sp artist_name  \
0                                  Creed   
1                                  Creed   
2  Bruce Springsteen & The E Street Band   
3                                  Creed   
4                Billy Joel, Rod Stewart   

                                     sp artist_genre  sp followers  \
0  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
1  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
2  ['heartland rock', 'mellow gold', 'permanent w...     6567386.0   
3  ['alternative metal', 'nu metal', 'post-grunge...     3527070.0   
4  ['album rock', 'classic rock', 'mellow gold', ...     6312751.0   

   sp popularity                                y

Distribution-balanced Stratified Cross-Validation

In [None]:
import numpy as np

def dbscv(x: np.ndarray, y: np.ndarray, params: dict) -> tuple:
    """
    Distribution-balanced stratified cross-validation (DBSCV) splitting method.

    Within each class the samples are chained greedily by nearest Euclidean
    neighbour (starting from the all-zeros vector) and then dealt out to the
    folds in turn, so neighbouring samples land in different folds.

    Classes are the distinct values of ``y`` (``np.unique``). With a continuous
    target almost every sample is its own class, in which case the samples are
    simply dealt round-robin in ascending order of ``y``.

    Parameters:
    - x: np.ndarray, feature matrix of shape (N, M)
    - y: np.ndarray, labels array of shape (N,)
    - params: dict, should contain 'K' (number of folds, default 5)

    Returns:
    - x_fit: list of np.ndarray, training feature sets for each fold
    - x_val: list of np.ndarray, validation feature sets for each fold
    - y_fit: list of np.ndarray, training label sets for each fold
    - y_val: list of np.ndarray, validation label sets for each fold

    Raises:
    - ValueError: if 'K' is not a positive integer (K <= 0 would otherwise
      make the partition loop run forever).
    """
    k = params.get('K', 5)  # Number of folds, default is 5
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
        raise ValueError(f"params['K'] must be a positive integer, got {k!r}")

    N, M = x.shape  # N: number of samples, M: number of features
    classes = np.unique(y)  # Unique class labels

    # Reference feature vector X0 (zeros) that starts every class chain
    X0 = np.zeros(M)

    # Validation indices per fold: T[0] corresponds to fold 1
    T = [[] for _ in range(k)]

    # Samples left over once each class has been split into complete rounds of k
    L_r = []

    for c in classes:
        # Indices of the samples of class c that are not yet in the chain
        remaining = np.flatnonzero(y == c)
        Li = []
        last_sample = X0

        # Step (2): order the class by repeatedly taking the sample closest to
        # the previously chosen one (ties go to the lowest original index).
        while remaining.size:
            distances = np.linalg.norm(x[remaining] - last_sample, axis=1)
            pos = int(np.argmin(distances))
            sample_index = int(remaining[pos])
            Li.append(sample_index)
            last_sample = x[sample_index]
            remaining = np.delete(remaining, pos)

        # Step (3): deal complete rounds of k consecutive samples to folds 0..k-1
        n_full = len(Li) - len(Li) % k
        for pos, sample_index in enumerate(Li[:n_full]):
            T[pos % k].append(sample_index)

        # The incomplete final round is distributed after all classes are done
        L_r.extend(Li[n_full:])

    # Distribute remaining samples in L_r into folds T_j
    for i, index in enumerate(L_r):
        T[i % k].append(index)

    # Prepare training and validation sets for each fold
    x_fit = []
    x_val = []
    y_fit = []
    y_val = []

    indices_all = np.arange(N)

    for j in range(k):
        # dtype=int matters for an empty fold: np.array([]) is float and cannot
        # be used as an index array.
        val_indices = np.array(T[j], dtype=int)
        train_indices = np.setdiff1d(indices_all, val_indices)

        x_val.append(x[val_indices])
        y_val.append(y[val_indices])

        x_fit.append(x[train_indices])
        y_fit.append(y[train_indices])

    return x_fit, x_val, y_fit, y_val


Prepare Dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
categorical_features = ['Headliner', 'Support', 'Market', 'Genre', 'Promoter', 'Company Type', 'Venue', 'City', 'State']
numerical_features = ['Avg. Gross USD', 'Ticket Price Avg. USD', 'Avg. Event Capacity',
                      'Price_Range', 'Gross_Per_Capacity', 'Year', 'Month', 'Day',
                      'Day_of_Year', 'Is_Weekend']

# Column the models are trained to predict
target_col = 'Avg. Gross USD'

# 'Gross_Per_Capacity' is computed as 'Avg. Gross USD' / 'Avg. Event Capacity',
# i.e. directly from the target. Leaving it in X lets a model recover the target
# from its inputs (target leakage) and inflates every validation score, so it is
# excluded from the feature matrix.
leakage_features = ['Gross_Per_Capacity']

# Encode categorical features with LabelEncoder (integer codes, one encoder per
# column; values are cast to str first so mixed types encode consistently)
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save the encoder for later use

# Concatenate encoded categorical features with numerical features
final_df = df[numerical_features + categorical_features]

# Ensure no missing values remain in the final dataset
final_df = final_df.fillna(0)

# Features (X) and target (y)
X = final_df.drop(columns=[target_col] + leakage_features).values  # Drop target and target-derived columns
y = final_df[target_col].values                                    # Target variable



In [None]:
# DBSCV configuration: 'K' is the number of folds
params = dict(K=5)

# Split X / y into per-fold training and validation arrays.
# Each returned name is a list holding one array per fold.
x_fit, x_val, y_fit, y_val = dbscv(X, y, params)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random Forest regressor that is re-fitted from scratch on every DBSCV fold
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# One (MSE, R^2) tuple per fold, in fold order
rf_scores = []

for i, (train_x, train_y, val_x, val_y) in enumerate(zip(x_fit, y_fit, x_val, y_val)):
    # Fit on the fold's training portion, then score its held-out portion
    rf_model.fit(train_x, train_y)
    predictions = rf_model.predict(val_x)

    mse = mean_squared_error(val_y, predictions)
    r2 = r2_score(val_y, predictions)
    rf_scores.append((mse, r2))

# Report the metrics fold by fold (folds are numbered from 1)
for fold, fold_scores in enumerate(rf_scores, start=1):
    mse, r2 = fold_scores
    print(f"Random Forest - Fold {fold}: MSE = {mse:.4f}, R^2 = {r2:.4f}")

Random Forest - Fold 1: MSE = 0.0447, R^2 = 0.9537
Random Forest - Fold 2: MSE = 0.0201, R^2 = 0.9802
Random Forest - Fold 3: MSE = 0.0694, R^2 = 0.9397
Random Forest - Fold 4: MSE = 0.0056, R^2 = 0.9939
Random Forest - Fold 5: MSE = 0.0031, R^2 = 0.9968


In [None]:
from xgboost import XGBRegressor

# XGBoost regressor that is re-fitted from scratch on every DBSCV fold
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    objective='reg:squarederror',
)

# One (MSE, R^2) tuple per fold, in fold order
xgb_scores = []

for i, (train_x, train_y, val_x, val_y) in enumerate(zip(x_fit, y_fit, x_val, y_val)):
    # Fit on the fold's training portion, then score its held-out portion
    xgb_model.fit(train_x, train_y)
    predictions = xgb_model.predict(val_x)

    mse = mean_squared_error(val_y, predictions)
    r2 = r2_score(val_y, predictions)
    xgb_scores.append((mse, r2))

# Report the metrics fold by fold (folds are numbered from 1).
# `mse` / `r2` keep the last fold's values once this loop finishes.
for fold, fold_scores in enumerate(xgb_scores, start=1):
    mse, r2 = fold_scores
    print(f"XGBoost - Fold {fold}: MSE = {mse:.4f}, R^2 = {r2:.4f}")


XGBoost - Fold 1: MSE = 0.0512, R^2 = 0.9469
XGBoost - Fold 2: MSE = 0.1158, R^2 = 0.8857
XGBoost - Fold 3: MSE = 0.1835, R^2 = 0.8404
XGBoost - Fold 4: MSE = 0.0723, R^2 = 0.9223
XGBoost - Fold 5: MSE = 0.0466, R^2 = 0.9506


In [None]:
# RMSE of the XGBoost model, averaged over all DBSCV folds.
# This is computed from xgb_scores rather than from the loop variable `mse`,
# which only holds the MSE of whichever fold was printed last.
xgb_rmse_per_fold = [np.sqrt(fold_mse) for fold_mse, _ in xgb_scores]
rmse = float(np.mean(xgb_rmse_per_fold))
print(f"XGBoost - Mean Root Mean Squared Error (RMSE) across folds: {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.2159


In [None]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Define the parameter grid for Random Forest (3 * 3 * 3 * 3 = 81 candidates)
rf_param_grid = {
    'n_estimators': [100, 200, 300],       # Number of trees
    'max_depth': [10, 20, None],          # Depth of each tree
    'min_samples_split': [2, 5, 10],      # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4]         # Minimum samples in a leaf node
}

# Custom scoring metrics. MSE is negated (greater_is_better=False) because
# GridSearchCV always maximizes its scores.
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}

# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# GridSearchCV's `cv` must yield (train_indices, test_indices) pairs, not the
# data arrays themselves. Every sample sits in exactly one DBSCV validation
# fold, so stacking the validation folds reproduces the full dataset (in fold
# order), and a fold id per row lets PredefinedSplit replay the DBSCV folds.
X_cv = np.vstack(x_val)
y_cv = np.concatenate(y_val)
fold_ids = np.repeat(np.arange(len(y_val)), [len(fold) for fold in y_val])
dbscv_cv = PredefinedSplit(test_fold=fold_ids)

# Perform GridSearchCV
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring=scoring,
    refit='R2',  # Optimize for R2 score
    cv=dbscv_cv,  # Use DBSCV splits
    verbose=2,
    n_jobs=-1  # Use all available CPU cores
)

# Fit on the fold-ordered copy of the data so row positions match fold_ids
rf_grid_search.fit(X_cv, y_cv)

# Best parameters and scores
print("Best Parameters for Random Forest:", rf_grid_search.best_params_)
print("Best R^2 Score for Random Forest:", rf_grid_search.best_score_)



In [None]:
!pip install --upgrade scikit-learn xgboost




In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Define the parameter grid for XGBoost (3 ** 6 = 729 candidates)
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize XGBoost
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# MSE is negated (greater_is_better=False) because GridSearchCV always
# maximizes its scores.
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}

# Tune on the DBSCV folds, the same folds used for the per-fold evaluation,
# instead of an unrelated unshuffled 5-fold split. Every sample sits in exactly
# one DBSCV validation fold, so stacking the validation folds reproduces the
# full dataset (in fold order), and a fold id per row lets PredefinedSplit
# replay those folds.
X_cv = np.vstack(x_val)
y_cv = np.concatenate(y_val)
fold_ids = np.repeat(np.arange(len(y_val)), [len(fold) for fold in y_val])
dbscv_cv = PredefinedSplit(test_fold=fold_ids)

# Perform GridSearchCV
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring=scoring,
    refit='R2',  # Optimize for R^2 score
    cv=dbscv_cv,  # Use DBSCV splits
    verbose=2,
    n_jobs=-1
)

# Fit on the fold-ordered copy of the data so row positions match fold_ids
xgb_grid_search.fit(X_cv, y_cv)

# Best parameters and scores
print("Best Parameters for XGBoost:", xgb_grid_search.best_params_)
print("Best R^2 Score for XGBoost:", xgb_grid_search.best_score_)


AttributeError: 'super' object has no attribute '__sklearn_tags__'