In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast
from tqdm import tqdm


def extract_track(row): 
    if "http" in row:
        return row.split("=")[-1]
    else: return row

def clean_lyrics(row):
    return [x for x in row.replace("\r", "").strip().split("\n") if x != ""]

def get_us_songs(row): 
    return "US" in row

In [None]:
def read_raw_billboard(path):
    data2 = pd.read_csv(path)
    billboard = data2[['year', 'title', 'artist', 'peak_pos', 'weeks', 'spotify_link',
            'genre', 'broad_genre', 'energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
            'instrumentalness', 'time_signature', 'danceability', 'key',
            'duration_ms', 'loudness', 'valence', 'mode', 'lyrics',]]
    billboard = billboard.replace("unknown", np.nan)
    billboard = billboard.dropna().sort_values("weeks", ascending=False).drop_duplicates(subset="title").reset_index(drop=True)
    billboard["spotify_link"] = billboard["spotify_link"].apply(extract_track) # Making the columns the same 
    billboard = billboard.rename(columns = {"spotify_link" : "uri", "title" : 'name'})
    billboard['artist_popularity'] = 0
    billboard['followers'] = 0
    return billboard

billboard = read_raw_billboard("../billboard_2000_2018_spotify_lyrics.csv")

In [10]:
# Reading SPD and adding labels
def construct_spd(path = "../SpotGenTrack/Data Sources/spotify_tracks.csv"):
    spd = pd.read_csv(path, index_col=0)
    spd = spd[spd.available_markets.apply(get_us_songs)]
    spd['artists_id'] = spd['artists_id'].apply(ast.literal_eval)
    m = pd.read_csv("../SpotGenTrack/Data Sources/spotify_artists.csv", index_col=0)
    artist_popularity = list()
    followers = list()
    spd.reset_index(drop=True, inplace=True)
    for i in tqdm(range(spd.shape[0])): 
        row = m[m.id == spd.loc[i, "artists_id"][0]]

        artist_popularity.append(row.artist_popularity.values[0])
        followers.append(row.followers.values[0])

    spd['artist_popularity'] = artist_popularity
    spd['followers'] = followers
    spd["peak_pos"] = 0
    spd["weeks"] = 0 
    
    return spd

spd = construct_spd()
# Reading the billboard data
data = pd.read_pickle("../subset_billboard_2000_2018_spotify_lyrics.csv")
billboard = data[['year', 'name', 'main_artist', 'artist', 'peak_pos', 'weeks', 'uri',
       'genre', 'broad_genre', 'energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
       'instrumentalness', 'time_signature', 'danceability', 'key',
       'duration_ms', 'loudness', 'valence', 'mode', 'lyrics', 'artist_popularity', 'followers',]]
#billboard = billboard.replace("unknown", np.nan)
#billboard = billboard.dropna().sort_values("weeks", ascending=False).drop_duplicates(subset="title").reset_index(drop=True)
#billboard["spotify_link"] = billboard["spotify_link"].apply(extract_track) # Making the columns the same 
#billboard = billboard.rename(columns = {"spotify_link" : "uri", "title" : 'name'})
# billboard = billboard.rename(columns = {"spotify_link" : "uri", "title" : 'name'})
# billboard['artist_popularity'] = 0
# billboard['followers'] = 0

common = np.intersect1d(billboard.columns, spd.columns)
spd_common = spd[spd.uri.isin(common)]
spd = spd[~spd.uri.isin(common)]
combined = pd.concat([spd[common], billboard[common]]).reset_index(drop=True)

combined['lyrics'] = combined['lyrics'].apply(clean_lyrics)
combined = combined.astype({"key" : "int32", "mode" : "int16", "time_signature" : "int16"})
combined["popularity"] = combined.weeks * combined.peak_pos
combined["hit"] = combined["weeks"] >= 1
combined

100%|██████████| 90118/90118 [05:31<00:00, 271.53it/s]


Unnamed: 0,acousticness,artist_popularity,danceability,duration_ms,energy,followers,instrumentalness,key,liveness,loudness,...,name,peak_pos,speechiness,tempo,time_signature,uri,valence,weeks,popularity,hit
0,0.294,28,0.698,235584.0,0.606,425,0.000003,10,0.151,-7.447,...,Blood,0,0.0262,115.018,4,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.622,0,0,False
1,0.863,36,0.719,656960.0,0.308,2965,0.0,6,0.253,-10.34,...,The Ugly Duckling,0,0.922,115.075,3,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.589,0,0,False
2,0.763,10,0.719,316578.0,0.126,158,0.0,3,0.113,-20.254,...,The Crime At Pickets Mill,0,0.938,112.822,3,spotify:track:6aCe9zzoZmCojX7bbgKKtf,0.533,0,0,False
3,0.971,62,0.367,183653.0,0.349,201820,0.296,11,0.633,-7.74,...,Already Gone,0,0.0268,81.85,4,spotify:track:4PrAZpH9Ic7S47E78BN6E4,0.192,0,0,False
4,0.824,36,0.688,29240.0,0.304,2965,0.0,10,0.142,-9.96,...,Three Blind Mice,0,0.531,77.056,3,spotify:track:1WJzRtI1ABzV3TPIeJZVvi,0.414,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95732,0.0465,81,0.693,292160,0.881,9261204,0,5,0.0817,-5.813,...,Taboo,97,0.0461,125.027,4,spotify:track:4cyYC67XY3weSVmSLdKLP8,0.852,1,97,True
95733,0.00268,71,0.494,199493,0.762,7064635,0.00538,10,0.472,-2.885,...,I Won't Back Down,57,0.0441,114.1,4,spotify:track:6XscPNlnKw0mnVYE7kvWRj,0.653,1,57,True
95734,0.00698,89,0.675,190977,0.842,26026314,0,11,0.349,-2.698,...,Little Bad Girl,70,0.0337,127.019,4,spotify:track:6JH56gZC7EJDcoxabVcWVL,0.617,1,70,True
95735,0.0217,89,0.263,258874,0.315,44311416,0.163,8,0.109,-10.797,...,Moving To Mars,90,0.03,142.657,4,spotify:track:0kuv7BqWNDprDao3Tb5flN,0.196,1,90,True


In [17]:
combined.to_pickle("../Combined_data_artists_included.pkl")

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

combined = pd.read_pickle("../Combined_data_artists_included_eng.pkl").reset_index(drop=True)
subset = combined[['acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 
       'speechiness', 'tempo', 'valence', 'key', 'artist_popularity', 'followers','hit', 'popularity', 'scaled_popularity']]# 'weeks', 'peak_pos',]]

print(f"Available Features: {combined.columns.values}\n")
print(f"Features being used: {subset.columns.values}")
#scaler = MinMaxScaler(feature_range=(0, 100))
#subset.loc[subset.hit, ["popularity"]] = scaler.fit_transform(subset.loc[subset.hit]["popularity"].values.reshape(-1,1))


# Separate the majority and minority classes
majority_class = subset[subset['hit'] == 0]
minority_class = subset[subset['hit'] == 1]

# Undersample the majority class to match the number of samples in the minority class
n_samples = len(minority_class)
majority_class_downsampled = resample(majority_class, 
                                      replace=False,
                                      n_samples=n_samples,
                                      random_state=0)

# Combine the downsampled majority class with the minority class
data_downsampled = pd.concat([majority_class_downsampled, minority_class])

# Shuffle the rows of the downsampled dataset
data_downsampled = data_downsampled.sample(frac=1, random_state=0)

# Split
X_train, X_test, y_train, y_test = train_test_split(data_downsampled.drop(columns=["hit", "popularity", "scaled_popularity"]), data_downsampled["scaled_popularity"], test_size=0.2, random_state=0)

# Normalise
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Available Features: ['acousticness' 'artist_popularity' 'danceability' 'duration_ms' 'energy'
 'followers' 'instrumentalness' 'key' 'liveness' 'loudness' 'lyrics'
 'mode' 'name' 'peak_pos' 'speechiness' 'tempo' 'time_signature' 'uri'
 'valence' 'weeks' 'popularity' 'hit' 'scaled_popularity']

Features being used: ['acousticness' 'danceability' 'duration_ms' 'energy' 'instrumentalness'
 'liveness' 'loudness' 'speechiness' 'tempo' 'valence' 'key'
 'artist_popularity' 'followers' 'hit' 'popularity' 'scaled_popularity']


# Classification

In [3]:
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred))

svm = SVC(kernel='rbf', random_state=0)
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_preds))

lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_preds))

RandomForest Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.92      0.93      1096
        True       0.92      0.95      0.94      1091

    accuracy                           0.94      2187
   macro avg       0.94      0.94      0.94      2187
weighted avg       0.94      0.94      0.94      2187

SVM Classification Report:
              precision    recall  f1-score   support

       False       0.87      0.83      0.85      1096
        True       0.84      0.88      0.86      1091

    accuracy                           0.85      2187
   macro avg       0.85      0.85      0.85      2187
weighted avg       0.85      0.85      0.85      2187

Logistic Regression Classification Report:
              precision    recall  f1-score   support

       False       0.81      0.86      0.83      1096
        True       0.85      0.79      0.82      1091

    accuracy                           0.83      2187
   macro avg       0.8

In [12]:
svm = SVR(kernel='linear')
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("SVM (Linear) Report:")
print(f"mean_squared_error: {mean_squared_error(y_test.values, svm_preds):.2f}")
print(f"root_mean_squared_error: {mean_squared_error(y_test.values, svm_preds) ** 0.5:.2f}")
print(f"mean_absolute_error: {mean_absolute_error(y_test.values, svm_preds):.2f}\n")

svm = SVR(kernel='rbf')
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("SVM (RBF) Report:")
print(f"mean_squared_error: {mean_squared_error(y_test.values, svm_preds):.2f}")
print(f"root_mean_squared_error: {mean_squared_error(y_test.values, svm_preds) ** 0.5:.2f}")
print(f"mean_absolute_error: {mean_absolute_error(y_test.values, svm_preds):.2f}\n")

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("LR Report:")
print(f"mean_squared_error: {mean_squared_error(y_test.values, lr_preds):.2f}")
print(f"root_mean_squared_error: {mean_squared_error(y_test.values, lr_preds) ** 0.5:.2f}")
print(f"mean_absolute_error: {mean_absolute_error(y_test.values, lr_preds):.2f}")

SVM (Linear) Report:
mean_squared_error: 295.73
root_mean_squared_error: 17.20
mean_absolute_error: 9.93

SVM (RBF) Report:
mean_squared_error: 258.81
root_mean_squared_error: 16.09
mean_absolute_error: 9.20

LR Report:
mean_squared_error: 217.86
root_mean_squared_error: 14.76
mean_absolute_error: 11.43
