In [31]:
import pandas as pd
import numpy as np
from numpy import hstack
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [33]:

#1. read dataset
df = pd.read_csv("Spotify Youtube Dataset.csv")

#display first examples
df.head()

#Dropping features from the dataset that add no predictive value to the model
df = df.drop(columns=[
    'Track', 'Artist', 'Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Channel', 'Description', 'Album', 'Album_type'])

#Sanity check and dropping duplicates
df = df.drop_duplicates()

#Check for missing data
print("NULL data: ",df.isnull().sum())

# Drop rows with missing target features
df = df.dropna(subset=['Views'])

# Fill numeric columns with median
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_wo_views = [c for c in numeric_cols if c != 'Views']
df[numeric_cols_wo_views] = df[numeric_cols_wo_views].fillna(df[numeric_cols_wo_views].median())

# Fill categorical/boolean columns with mode
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Create popularity classes based on quantiles
df['popularity_class'] = pd.qcut(df['Views'], q=3, labels=['Low', 'Medium', 'High'])

# Drop the original 'views' column (to avoid leakage of target)
df = df.drop(columns=['Views'])

NULL data:  Unnamed: 0            0
Danceability          2
Energy                2
Key                   2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Views               470
Likes               541
Comments            569
Licensed            470
official_video      470
Stream              576
dtype: int64


  df[col] = df[col].fillna(df[col].mode()[0])


In [35]:

# 2. Train/test split
RANDOM_STATE = 42
target_col = 'popularity_class'

X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Identify numeric & categorical
numeric_features = X.select_dtypes(include=['float64', 'int64', 'bool']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Scale numeric
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numeric_features])
X_test_num = scaler.transform(X_test[numeric_features])

# Keep categorical (if any)
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

# Combine
X_train_final = hstack([X_train_num, X_train_cat])
X_test_final = hstack([X_test_num, X_test_cat])

In [38]:
# 3. Baseline Random Forest
baseline_rf = RandomForestClassifier(random_state=RANDOM_STATE)
baseline_rf.fit(X_train_final, y_train)

y_pred_rf_base = baseline_rf.predict(X_test_final)

acc_rf_b = accuracy_score(y_test, y_pred_rf_base)
prec_rf_b, rec_rf_b, f1_rf_b, _ = precision_recall_fscore_support(
    y_test, y_pred_rf_base, average='macro', zero_division=0
)

print("******** BASELINE Random Forest ********")
print(f"Accuracy: {acc_rf_b:.4f}")
print(f"Precision: {prec_rf_b:.4f}")
print(f"Recall: {rec_rf_b:.4f}")
print(f"F1 (macro): {f1_rf_b:.4f}\n")
print("Classification report:\n", classification_report(y_test, y_pred_rf_base, zero_division=0))

cm_rf_b = confusion_matrix(y_test, y_pred_rf_base, labels=sorted(y.unique()))
print("Confusion matrix:\n", cm_rf_b)

******** BASELINE Random Forest ********
Accuracy: 0.8941
Precision: 0.8965
Recall: 0.8941
F1 (macro): 0.8949

Classification report:
               precision    recall  f1-score   support

        High       0.93      0.90      0.91      1350
         Low       0.94      0.91      0.92      1350
      Medium       0.82      0.87      0.85      1350

    accuracy                           0.89      4050
   macro avg       0.90      0.89      0.89      4050
weighted avg       0.90      0.89      0.89      4050

Confusion matrix:
 [[1218    0  132]
 [   0 1224  126]
 [  95   76 1179]]


In [53]:
# 4. Tuned Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': [None, 'balanced']
}

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X_train_final, y_train)

print("Best Params:", grid_rf.best_params_)
print("Best CV f1_macro:", grid_rf.best_score_)

best_rf = grid_rf.best_estimator_
y_pred_rf_tuned = best_rf.predict(X_test_final)

acc_rf_t = accuracy_score(y_test, y_pred_rf_tuned)
prec_rf_t, rec_rf_t, f1_rf_t, _ = precision_recall_fscore_support(
    y_test, y_pred_rf_tuned, average='macro', zero_division=0
)

print("\n******** TUNED Random Forest ********")
print(f"Accuracy: {acc_rf_t:.4f}")
print(f"Precision: {prec_rf_t:.4f}")
print(f"Recall: {rec_rf_t:.4f}")
print(f"F1 (macro): {f1_rf_t:.4f}\n")
print("Classification report:\n", classification_report(y_test, y_pred_rf_tuned, zero_division=0))

cm_rf_t = confusion_matrix(y_test, y_pred_rf_tuned, labels=sorted(y.unique()))
print("Confusion matrix:\n", cm_rf_t)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Params: {'class_weight': None, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV f1_macro: 0.8839022767088589

******** TUNED Random Forest ********
Accuracy: 0.8943
Precision: 0.8969
Recall: 0.8943
F1 (macro): 0.8951

Classification report:
               precision    recall  f1-score   support

        High       0.93      0.90      0.92      1350
         Low       0.94      0.91      0.92      1350
      Medium       0.82      0.87      0.85      1350

    accuracy                           0.89      4050
   macro avg       0.90      0.89      0.90      4050
weighted avg       0.90      0.89      0.90      4050

Confusion matrix:
 [[1215    0  135]
 [   0 1226  124]
 [  89   80 1181]]


In [54]:
# 5. Hypothesis test: Remove engagement features
df_no_engagement = df.drop(columns=['Likes', 'Comments', 'Stream'], errors='ignore')

X2 = df_no_engagement.drop(columns=[target_col])
y2 = df_no_engagement[target_col]

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=RANDOM_STATE
)

num_cols2 = X2_train.select_dtypes(include=['float64', 'int64', 'bool']).columns.tolist()
cat_cols2 = X2_train.select_dtypes(include=['object', 'category']).columns.tolist()

scaler2 = StandardScaler()
X2_train_num = scaler2.fit_transform(X2_train[num_cols2])
X2_test_num = scaler2.transform(X2_test[num_cols2])

from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

# One-hot encode categorical features
ohe2 = OneHotEncoder(handle_unknown="ignore", sparse_output=True)


X2_train_cat = ohe2.fit_transform(X2_train[cat_cols2])
X2_test_cat = ohe2.transform(X2_test[cat_cols2])

# Combine numeric + categorical (convert to dense for RandomForest)
X2_train_final = hstack([X2_train_num, X2_train_cat]).toarray()
X2_test_final = hstack([X2_test_num, X2_test_cat]).toarray()


rf2 = RandomForestClassifier(random_state=RANDOM_STATE)
grid_rf2 = GridSearchCV(
    estimator=rf2,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_rf2.fit(X2_train_final, y2_train)

print("\nBest Params (no engagement):", grid_rf2.best_params_)
print("Best CV f1_macro (no engagement):", grid_rf2.best_score_)

best_rf2 = grid_rf2.best_estimator_
y2_pred = best_rf2.predict(X2_test_final)

acc_rf2 = accuracy_score(y2_test, y2_pred)
prec_rf2, rec_rf2, f1_rf2, _ = precision_recall_fscore_support(
    y2_test, y2_pred, average='macro', zero_division=0
)

print("\n***** TUNED RF (NO ENGAGEMENT FEATURES) *****")
print(f"Accuracy: {acc_rf2:.4f}")
print(f"Precision (macro): {prec_rf2:.4f}")
print(f"Recall (macro): {rec_rf2:.4f}")
print(f"F1 (macro): {f1_rf2:.4f}\n")
print("Classification report:\n", classification_report(y2_test, y2_pred, zero_division=0))

cm_rf2 = confusion_matrix(y2_test, y2_pred, labels=sorted(y2.unique()))
print(f"Confusion matrix:\n", cm_rf2)


Fitting 3 folds for each of 48 candidates, totalling 144 fits

Best Params (no engagement): {'class_weight': None, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV f1_macro (no engagement): 0.5577104636210594

***** TUNED RF (NO ENGAGEMENT FEATURES) *****
Accuracy: 0.5906
Precision (macro): 0.5852
Recall (macro): 0.5906
F1 (macro): 0.5858

Classification report:
               precision    recall  f1-score   support

        High       0.60      0.68      0.64      1350
         Low       0.64      0.67      0.65      1350
      Medium       0.51      0.43      0.47      1350

    accuracy                           0.59      4050
   macro avg       0.59      0.59      0.59      4050
weighted avg       0.59      0.59      0.59      4050

Confusion matrix:
 [[913 140 297]
 [201 901 248]
 [404 368 578]]


In [55]:
# 6. Comparison summary
print("\nCHANGE IN METRICS AFTER FEATURE REMOVAL (with vs. without engagement):")
print(f"Accuracy Change: {acc_rf2 - acc_rf_t:+.4f}")
print(f"Precision Change: {prec_rf2 - prec_rf_t:+.4f}")
print(f"Recall Change: {rec_rf2 - rec_rf_t:+.4f}")
print(f"F1 Change: {f1_rf2 - f1_rf_t:+.4f}")


CHANGE IN METRICS AFTER FEATURE REMOVAL (with vs. without engagement):
Accuracy Change: -0.3037
Precision Change: -0.3117
Recall Change: -0.3037
F1 Change: -0.3094
