In [None]:
import numpy as np
from numpy import hstack
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    confusion_matrix, roc_auc_score)

In [1]:
#read dataset
df = pd.read_csv("Spotify_Youtube.csv")

#display first examples
df.head()

#Dropping features from the dataset that add no predictive value to the model
df = df.drop(columns=[
    'Track', 'Artist', 'Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Channel', 'Description', 'Album', 'Album_type'])

#Sanity check and dropping duplicates
df = df.drop_duplicates()

#Check for missing data
print("NULL data: ",df.isnull().sum())

# Drop rows with missing target features
df = df.dropna(subset=['Views'])

# Fill numeric columns with median
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_wo_views = [c for c in numeric_cols if c != 'Views']
df[numeric_cols_wo_views] = df[numeric_cols_wo_views].fillna(df[numeric_cols_wo_views].median())

# Fill categorical/boolean columns with mode
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Create popularity classes based on quantiles
df['popularity_class'] = pd.qcut(df['Views'], q=3, labels=['Low', 'Medium', 'High'])

# Drop the original 'views' column (to avoid leakage of target)
df = df.drop(columns=['Views'])

NULL data:  Unnamed: 0            0
Danceability          2
Energy                2
Key                   2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Views               470
Likes               541
Comments            569
Licensed            470
official_video      470
Stream              576
dtype: int64


  df[col] = df[col].fillna(df[col].mode()[0])


In [8]:
RANDOM_STATE = 42

#Feature and Target split 
target_col = 'popularity_class'
X = df.drop(columns=[target_col])
y = df[target_col]

#identifying numerical features, bool = numerical as Pandas stores True=1 and False=0
numeric_features = X.select_dtypes(include=['float64', 'int64','bool']).columns.tolist()
#Identifying any categorical features
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()

# 2) Train/test split (stratified to preserve Low/Medium/High ratios)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

#Scaling numeric features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numeric_features])
X_test_num = scaler.transform(X_test[numeric_features])
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

#Recombining numerical + categorical data
X_train_final = hstack([X_train_num, X_train_cat])
X_test_final = hstack([X_test_num, X_test_cat])

print("Final train shape:", X_train_final.shape)
print("Final test shape:", X_test_final.shape)

#Performing Baseline Logistic Regression
baseline_lr = LogisticRegression(max_iter=1000,solver='lbfgs',random_state=RANDOM_STATE)
baseline_lr.fit(X_train_final, y_train)
y_pred_base = baseline_lr.predict(X_test_final)

acc_b  = accuracy_score(y_test, y_pred_base)
prec_b, rec_b, f1_b, _ = precision_recall_fscore_support(y_test, y_pred_base, average='macro', zero_division=0)

print("********BASELINE Logistic Regression (no tuning)********")
print(f"Accuracy: {acc_b:.4f}")
print(f"Precision: {prec_b:.4f}")
print(f"Recall: {rec_b:.4f}")
print(f"F1: {f1_b:.4f}\n")
print("Classification report:\n", classification_report(y_test, y_pred_base, zero_division=0))

cm_b = confusion_matrix(y_test, y_pred_base, labels=sorted(y.unique()))
print("Confusion matrix (rows=true, cols=pred):\n", cm_b)

#Performing logistic regression with GridSearchCV
log_reg = LogisticRegression(max_iter=1000,solver='lbfgs',random_state=RANDOM_STATE)

#tuning parameters
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'newton-cg', 'saga'],
    'class_weight': [None, 'balanced'],
    'penalty': ['l2']
}

grid = GridSearchCV(
    estimator=baseline_lr,
    param_grid=param_grid,
    scoring='f1_macro',
    n_jobs=-1,
    refit=True,
    verbose=1
)

grid.fit(X_train_final, y_train)

print("Best params:", grid.best_params_)
print("Best CV f1_macro:", grid.best_score_)

best_lr = grid.best_estimator_
y_pred_tuned = best_lr.predict(X_test_final)

acc_t  = accuracy_score(y_test, y_pred_tuned)
prec_t, rec_t, f1_t, _ = precision_recall_fscore_support(y_test, y_pred_tuned, average='macro', zero_division=0)

print("********TUNED (GridSearchCV)********")
print(f"Accuracy: {acc_t:.4f}")
print(f"Precision (macro): {prec_t:.4f}")
print(f"Recall (macro): {rec_t:.4f}")
print(f"F1 (macro): {f1_t:.4f}\n")
print("Classification report:\n", classification_report(y_test, y_pred_tuned, zero_division=0))

cm_t = confusion_matrix(y_test, y_pred_tuned, labels=sorted(y.unique()))
print("Confusion matrix (rows=true, cols=pred):\n", cm_t)

Final train shape: (16198, 17)
Final test shape: (4050, 17)
********BASELINE Logistic Regression (no tuning)********
Accuracy: 0.8432
Precision: 0.8466
Recall: 0.8432
F1: 0.8425

Classification report:
               precision    recall  f1-score   support

        High       0.94      0.83      0.88      1350
         Low       0.83      0.95      0.89      1350
      Medium       0.77      0.75      0.76      1350

    accuracy                           0.84      4050
   macro avg       0.85      0.84      0.84      4050
weighted avg       0.85      0.84      0.84      4050

Confusion matrix (rows=true, cols=pred):
 [[1122    1  227]
 [   0 1281   69]
 [  76  262 1012]]
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best params: {'C': 10, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}
Best CV f1_macro: 0.8516163727237446
********TUNED (GridSearchCV)********
Accuracy: 0.8573
Precision (macro): 0.8598
Recall (macro): 0.8573
F1 (macro): 0.8572

Classificati

In [10]:
#Hypothesis - Engagement metrics such as Likes, Comments, and Streams do not impact the popularity of a track.

In [12]:
RANDOM_STATE = 42
target_col = 'popularity_class'

#Dropping engagement features as per hypothesis
df_no_engagement = df.drop(columns=['Likes', 'Comments', 'Stream'], errors='ignore').copy()

#Train and Test split
X2 = df_no_engagement.drop(columns=[target_col])
y2 = df_no_engagement[target_col]

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.20, stratify=y2, random_state=RANDOM_STATE)

#Identify Numeric column types
num_cols2 = X2_train.select_dtypes(include=['float64', 'int64','bool']).columns.tolist()
cat_cols2 = X2_train.select_dtypes(include=['object','category']).columns.tolist()

#Scaling numeric features
scaler2 = StandardScaler()
X2_train_num = scaler2.fit_transform(X2_train[num_cols2])
X2_test_num  = scaler2.transform(X2_test[num_cols2])
X2_train_cat = X2_train[categorical_features]
X2_test_cat = X2_test[categorical_features]

#Recombining numerical + categorical data
X2_train_final = hstack([X2_train_num, X2_train_cat])
X2_test_final  = hstack([X2_test_num,  X2_test_cat])

#Using GridSearchCV again to tune parameters as removing features may have changed the best params
lr2 = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)

param_grid2 = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'newton-cg', 'saga'],
    'class_weight': [None, 'balanced'],
    'penalty': ['l2']
}

grid2 = GridSearchCV(
    estimator=lr2,
    param_grid=param_grid2,
    scoring='f1_macro',
    n_jobs=-1,
    refit=True,
    verbose=1
)

grid2.fit(X2_train_final, y2_train)

print("Best params (no engagement):", grid2.best_params_)
print("Best CV f1_macro (no engagement):", grid2.best_score_)

best_lr2 = grid2.best_estimator_
y2_pred = best_lr2.predict(X2_test_final)

acc2 = accuracy_score(y2_test, y2_pred)
prec2, rec2, f12, _ = precision_recall_fscore_support(y2_test, y2_pred, average='macro', zero_division=0)

print("\n*****TUNED (NO ENGAGEMENT FEATURES) *****")
print(f"Accuracy: {acc2:.4f}")
print(f"Precision (macro): {prec2:.4f}")
print(f"Recall (macro): {rec2:.4f}")
print(f"F1 (macro): {f12:.4f}\n")
print("Classification report:\n", classification_report(y2_test, y2_pred, zero_division=0))

cm2 = confusion_matrix(y2_test, y2_pred, labels=sorted(y2.unique()))
print("Confusion matrix (rows=true, cols=pred):\n", cm2)

#Comparison between previous model and post-hypothesis model
print("CHANGE IN METRICS AFTER FEATURE REMOVAL (WITH engagement) ===")
print(f"Accuracy Change: {acc2 - acc_t:+.4f}")
print(f"Precision Change: {prec2 - prec_t:+.4f}")
print(f"Recall Change: {rec2 - rec_t:+.4f}")
print(f"F1 Change: {f12 - f1_t:+.4f}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best params (no engagement): {'C': 10, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV f1_macro (no engagement): 0.4775303364395006

*****TUNED (NO ENGAGEMENT FEATURES) *****
Accuracy: 0.5044
Precision (macro): 0.4981
Recall (macro): 0.5044
F1 (macro): 0.4803

Classification report:
               precision    recall  f1-score   support

        High       0.47      0.73      0.57      1350
         Low       0.61      0.57      0.59      1350
      Medium       0.41      0.21      0.28      1350

    accuracy                           0.50      4050
   macro avg       0.50      0.50      0.48      4050
weighted avg       0.50      0.50      0.48      4050

Confusion matrix (rows=true, cols=pred):
 [[990 140 220]
 [396 772 182]
 [722 347 281]]
CHANGE IN METRICS AFTER FEATURE REMOVAL (WITH engagement) ===
Accuracy Change: -0.3528
Precision Change: -0.3618
Recall Change: -0.3528
F1 Change: -0.3769


In [14]:
#Observations: 
#After feature removal, there is a huge drop in accuracy (from 85% to 50%), 
#F1 score also drops (from 0.85 to 0.48), confusion matrix shows many misclassifications for the Medium class.
#Hence, engagement metrics such as Likes, Comments and Streams are highly predictive of popularity classes.
#Without engagement metrics, Logistic Regression fails in distinguish classes, performing only slightly bettee than random guess, i.e, 33%
#Hence, hypothesis is refused. Engagement metrics seems to be very important for model's ability to classify popularity of a track accurately.