In [None]:
# Global variables for paths 
SPOTIFY_DATA_PATH = '/work/nlp_final/data/songs_with_attributes_and_lyrics.csv'
HOT100_DATA_PATH = '/work/nlp_final/data/hot100.csv'
RADIO_DATA_PATH = '/work/nlp_final/data/radio.csv'
DIGITAL_DATA_PATH = '/work/nlp_final/data/digital_songs.csv'
STREAMING_DATA_PATH = '/work/nlp_final/data/streaming_songs.csv'
OUTPUT_DATA_PATH = '/work/nlp_final/data.csv'

In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
from tqdm import tqdm
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import ast
import re
from sklearn.model_selection import train_test_split
import re, math
from collections import Counter
import numpy as np
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from collections import Counter
from langdetect import detect, LangDetectException
import os

seed_value = 42

# Data cleaning

## Spotify data

This dataset, sourced from [Kaggle](https://www.kaggle.com/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics/data), contains information about **955,320** songs on Spotify. It combines **audio attributes**, **metadata**, and **song lyrics**, making it ideal for projects like music recommendation systems, lyric analysis, and mood classification.

| Column | Type | Description |
|:---|:---|:---|
| `id` | _object_ | Spotify's unique identifier for each song.  |
| `name` | _object_ | Title of the song. |
| `album_name` | _object_ | Name of the album the song belongs to (may be missing for some entries). |
| `artists` | _object_ | List of artists associated with the song. |
| `danceability` | _float64_ | Measure (0.0–1.0) of how suitable a track is for dancing, based on tempo, rhythm stability, beat strength, etc. Describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.|
| `energy` | _float64_ | Measure (0.0–1.0) of intensity and activity — higher for energetic songs.  Represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low. Features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy. Scale from 0.0 to 1.0.|
| `key` | _object_ | Estimated overall musical key (0 = C, 1 = C♯/D♭, ..., 11 = B). |
| `loudness` | _float64_ | Overall loudness of a track in decibels (dB); typically negative values. |
| `mode` | _object_ | Modality of the track: 1 = major, 0 = minor. |
| `speechiness` | _float64_ | Measure (0.0–1.0) of how much spoken words are present in a track. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, including rap music. Values below 0.33 most likely represent music and other non-speech-like tracks. |
| `acousticness` | _float64_ | Measure (0.0–1.0) of whether a track is acoustic. Acousticness refers to how likely a track contains primarily acoustic sounds (as opposed to electric or electronic sounds). It measures whether the sound was likely produced by acoustic instruments like acoustic guitars, pianos, orchestral instruments, or unplugged performances rather than electronically generated or heavily processed sounds.  |
| `instrumentalness` | _float64_ | Prediction (0.0–1.0) of whether a track contains no vocals. It's specifically focused on the presence or absence of human voices, regardless of whether the instruments are acoustic or electronic. A high instrumentalness value (closer to 1.0) suggests the track is primarily instrumental with no vocals |
| `liveness` | _float64_ | Measure (0.0–1.0) of the presence of an audience in the recording (higher means more "live"). |
| `valence` | _float64_ | Measure (0.0–1.0) of musical positiveness or happiness. |
| `tempo` | _float64_ | Estimated tempo (beats per minute). |
| `duration_ms` | _float64_ | Duration of the song in milliseconds. |
| `lyrics` | _object_ | Full lyrics of the song (if available). |

## Billboard data

- **Hot 100**: Ranks the most popular songs in the U.S. by combining streaming, radio airplay, and sales data.

- **Radio Songs**: Lists the most played songs on U.S. radio stations based solely on audience impressions.

- **Streaming Songs**: Ranks songs based purely on the number of audio and video streams across major platforms.

- **Digital Song Sales**: Charts the best-selling digital songs in the U.S. based entirely on paid downloads.


In [2]:
spotify_df = pd.read_csv(SPOTIFY_DATA_PATH)
hot100_df = pd.read_csv(HOT100_DATA_PATH)
radio_df = pd.read_csv(RADIO_DATA_PATH)
digital_df = pd.read_csv(DIGITAL_DATA_PATH)
streaming_df = pd.read_csv(STREAMING_DATA_PATH) 

Filter for 23-01-2013 to 04-12-2024 + Drop rows with no lyrics

In [3]:
#  cutoff dates
start_date = pd.to_datetime("2013-01-23")
end_date = pd.to_datetime("2024-12-04")

#'Date' columns to datetime
for df in [hot100_df, radio_df, digital_df, streaming_df]:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Filter  Billboard on the date range
hot100_df = hot100_df[(hot100_df['Date'] >= start_date) & (hot100_df['Date'] <= end_date)]
radio_df = radio_df[(radio_df['Date'] >= start_date) & (radio_df['Date'] <= end_date)]
digital_df = digital_df[(digital_df['Date'] >= start_date) & (digital_df['Date'] <= end_date)]
streaming_df = streaming_df[(streaming_df['Date'] >= start_date) & (streaming_df['Date'] <= end_date)]

# confirm filtering worked
{"hot100_df": len(hot100_df),
    "radio_df": len(radio_df),
    "digital_df": len(digital_df),
    "streaming_df": len(streaming_df)}

{'hot100_df': 61900,
 'radio_df': 32075,
 'digital_df': 30625,
 'streaming_df': 30900}

In [4]:
spotify_df.dropna(subset=['lyrics'], inplace=True)

Normalization function for artists

In [5]:
# clean song names
def clean_name(name):
    if isinstance(name, list):
        name = " ".join(name)
    if isinstance(name, str):
        name = re.sub(r"[^\w\s]", "", name.lower()).strip()
    else:
        name = ""
    return name

# normalize artist names
def normalize_artist_name(name):
    if isinstance(name, list):
        name = " ".join(name)
    if isinstance(name, str):
        name = name.lower()
        name = re.sub(r"\s+(feat\.|featuring|ft\.|and|&)\s+", " ", name)
        name = re.sub(r"[^\w\s]", "", name)  
        name = re.sub(r"\s+", " ", name)     
        return name.strip()
    return ""

# Safe parser for artist in Spotify
def safe_parse_artist(x):
    try:
        if isinstance(x, str) and x.startswith("[") and x.endswith("]"):
            parsed = ast.literal_eval(x)
            if isinstance(parsed, list):
                return normalize_artist_name(parsed)
        return normalize_artist_name(x)
    except:
        return normalize_artist_name(x)

# clean and normalize Spotify data
spotify_df['spotify_name_clean'] = spotify_df['name'].apply(clean_name)
spotify_df['spotify_artist_clean'] = spotify_df['artists'].apply(safe_parse_artist)

# clean and normalize Billboard data
hot100_df['hot100_name_clean'] = hot100_df['Song'].apply(clean_name)
hot100_df['hot100_artist_clean'] = hot100_df['Artist'].apply(normalize_artist_name)

radio_df['radio_name_clean'] = radio_df['Song'].apply(clean_name)
radio_df['radio_artist_clean'] = radio_df['Artist'].apply(normalize_artist_name)

digital_df['digital_name_clean'] = digital_df['Song'].apply(clean_name)
digital_df['digital_artist_clean'] = digital_df['Artist'].apply(normalize_artist_name)

streaming_df['streaming_name_clean'] = streaming_df['Song'].apply(clean_name)
streaming_df['streaming_artist_clean'] = streaming_df['Artist'].apply(normalize_artist_name)

In [6]:
spotify_df

Unnamed: 0,id,name,album_name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,lyrics,spotify_name_clean,spotify_artist_clean
0,0Prct5TDjAnEgIqbxcldY9,!,UNDEN!ABLE,['HELLYEAH'],0.415,0.6050,7,-11.157,1,0.0575,0.00116,0.838000,0.4710,0.1930,100.059,79500.0,"He said he came from Jamaica,\n he owned a cou...",,hellyeah
1,2ASl4wirkeYm3OWZxXKYuq,!!,,Yxngxr1,0.788,0.6480,7,-9.135,0,0.3150,0.90000,0.000000,0.1760,0.2870,79.998,114000.0,"Fucked a bitch, now she running with my kids\n...",,yxngxr1
2,69lcggVPmOr9cvPx9kLiiN,!!! - Interlude,Where I Belong EP,['Glowie'],0.000,0.0354,7,-20.151,0,0.0000,0.90800,0.000000,0.4790,0.0000,0.000,11413.0,"Oh, my God, I'm going crazy\n",interlude,glowie
3,4U7dlZjg1s9pjdppqZy0fm,!!De Repente!!,Un Palo Al Agua (20 Grandes Canciones),['Rosendo'],0.657,0.8820,5,-6.340,1,0.0385,0.00740,0.000013,0.0474,0.9390,123.588,198173.0,Continuamente se extraña la gente si no puede ...,de repente,rosendo
4,4v1IBp3Y3rpkWmWzIlkYju,!!De Repente!!,Fuera De Lugar,['Rosendo'],0.659,0.8930,5,-8.531,1,0.0411,0.09220,0.000019,0.0534,0.9510,123.600,199827.0,Continuamente se extraña la gente si no puede ...,de repente,rosendo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955315,4zMgP1HJazJJdEf6AiG8M6,,,['SuperKek'],0.456,0.4820,8.0,-11.199,1.0,0.0504,0.02970,0.000000,0.1110,0.0352,71.455,281962.0,And all I am is a man\n I want the world in my...,,superkek
955316,5N0MQFuudsDIQpapNI5MHM,,,['Prasewon'],0.543,0.2900,2.0,-14.526,0.0,0.1580,0.66700,0.015100,0.1470,0.3640,172.118,87980.0,"I think I, I think I finally\n Found a way to ...",,prasewon
955317,5R8xbq4SXB5Cc62Lu7cW4y,,,['SoulkaOuter'],0.696,0.4440,10.0,-12.894,0.0,0.0593,0.59300,0.000740,0.2130,0.3070,105.953,129370.0,Tak dayte patsanam poschitat' poteri\n Summy n...,,soulkaouter
955318,5cjecvX0CmC9gK0Laf5EMQ,,,,0.678,0.6590,11,-5.364,0,0.3190,0.05340,0.000000,0.5530,0.1910,146.153,202235.0,"Ave Maria, Ave Maria\n ♪\n Ich bin in der Beto...",,


In [7]:
hot100_df.head()

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,hot100_name_clean,hot100_artist_clean
283987,2013-01-23,Locked Out Of Heaven,Bruno Mars,1,1,1,15,https://charts-static.billboard.com/img/2012/1...,locked out of heaven,bruno mars
283988,2013-01-23,Thrift Shop,Macklemore & Ryan Lewis Featuring Wanz,2,5,2,15,https://charts-static.billboard.com/img/2012/0...,thrift shop,macklemore ryan lewis wanz
283989,2013-01-23,Ho Hey,The Lumineers,3,4,3,32,https://charts-static.billboard.com/img/2012/0...,ho hey,the lumineers
283990,2013-01-23,I Knew You Were Trouble.,Taylor Swift,4,3,2,13,https://charts-static.billboard.com/img/2012/1...,i knew you were trouble,taylor swift
283991,2013-01-23,Diamonds,Rihanna,5,2,1,16,https://charts-static.billboard.com/img/2012/1...,diamonds,rihanna


In [8]:
radio_df

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,radio_name_clean,radio_artist_clean
87025,2013-01-23,Locked Out Of Heaven,Bruno Mars,1,1,1,15,https://charts-static.billboard.com/img/2012/1...,locked out of heaven,bruno mars
87026,2013-01-23,Diamonds,Rihanna,2,2,1,16,https://charts-static.billboard.com/img/2012/1...,diamonds,rihanna
87027,2013-01-23,Ho Hey,The Lumineers,3,3,3,14,https://charts-static.billboard.com/img/2012/0...,ho hey,the lumineers
87028,2013-01-23,I Knew You Were Trouble.,Taylor Swift,4,7,4,7,https://charts-static.billboard.com/img/2012/1...,i knew you were trouble,taylor swift
87029,2013-01-23,One More Night,Maroon 5,5,4,1,25,https://charts-static.billboard.com/img/1840/1...,one more night,maroon 5
...,...,...,...,...,...,...,...,...,...,...
119095,2024-12-04,Guy For That,Post Malone Featuring Luke Combs,46,46,1,-,https://charts-static.billboard.com/img/2024/0...,guy for that,post malone luke combs
119096,2024-12-04,Messed Up As Me,Keith Urban,47,46,6,-,https://charts-static.billboard.com/img/2024/0...,messed up as me,keith urban
119097,2024-12-04,Damn Good Day To Leave,Riley Green,48,48,1,-,https://charts-static.billboard.com/img/2024/0...,damn good day to leave,riley green
119098,2024-12-04,Disease,Lady Gaga,49,48,41,4,https://charts-static.billboard.com/img/2024/1...,disease,lady gaga


In [9]:
digital_df

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,digital_name_clean,digital_artist_clean
31875,2013-01-23,Thrift Shop,Macklemore & Ryan Lewis Featuring Wanz,1,2,1,15,https://charts-static.billboard.com/img/2012/0...,thrift shop,macklemore ryan lewis wanz
31876,2013-01-23,I Knew You Were Trouble.,Taylor Swift,2,1,1,13,https://charts-static.billboard.com/img/2012/1...,i knew you were trouble,taylor swift
31877,2013-01-23,Scream & Shout,will.i.am & Britney Spears,3,4,1,8,https://charts-static.billboard.com/img/2005/1...,scream shout,william britney spears
31878,2013-01-23,Locked Out Of Heaven,Bruno Mars,4,3,1,15,https://charts-static.billboard.com/img/2012/1...,locked out of heaven,bruno mars
31879,2013-01-23,Ho Hey,The Lumineers,5,7,3,32,https://charts-static.billboard.com/img/2012/0...,ho hey,the lumineers
...,...,...,...,...,...,...,...,...,...,...
62495,2024-12-04,4x4xU,Lainey Wilson,21,21,1,-,https://charts-static.billboard.com/img/2024/0...,4x4xu,lainey wilson
62496,2024-12-04,That’s So True,Gracie Abrams,22,24,13,4,https://charts-static.billboard.com/img/2020/1...,thats so true,gracie abrams
62497,2024-12-04,Birds Of A Feather,Billie Eilish,23,21,2,27,https://charts-static.billboard.com/img/2017/0...,birds of a feather,billie eilish
62498,2024-12-04,Beautiful Things,Benson Boone,24,19,1,45,https://charts-static.billboard.com/img/2024/0...,beautiful things,benson boone


In [10]:
streaming_df

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,streaming_name_clean,streaming_artist_clean
0,2013-01-23,Thrift Shop,Macklemore & Ryan Lewis Featuring Wanz,1,1,1,-,https://charts-static.billboard.com/img/2012/0...,thrift shop,macklemore ryan lewis wanz
1,2013-01-23,Locked Out Of Heaven,Bruno Mars,2,2,1,-,https://charts-static.billboard.com/img/2012/1...,locked out of heaven,bruno mars
2,2013-01-23,Diamonds,Rihanna,3,3,1,-,https://charts-static.billboard.com/img/2012/1...,diamonds,rihanna
3,2013-01-23,Ho Hey,The Lumineers,4,4,1,-,https://charts-static.billboard.com/img/2012/0...,ho hey,the lumineers
4,2013-01-23,It's Time,Imagine Dragons,5,5,1,-,https://charts-static.billboard.com/img/1840/1...,its time,imagine dragons
...,...,...,...,...,...,...,...,...,...,...
30895,2024-12-04,Popular,Ariana Grande,46,46,1,-,https://charts-static.billboard.com/img/2011/0...,popular,ariana grande
30896,2024-12-04,Please Please Please,Sabrina Carpenter,47,28,1,25,https://charts-static.billboard.com/img/2024/0...,please please please,sabrina carpenter
30897,2024-12-04,It's Beginning To Look A Lot Like Christmas,Michael Buble,48,11,23,-,https://charts-static.billboard.com/img/2003/0...,its beginning to look a lot like christmas,michael buble
30898,2024-12-04,Feliz Navidad,Jose Feliciano,49,7,31,-,https://charts-static.billboard.com/img/1998/0...,feliz navidad,jose feliciano


## Merging

In [11]:
# clean and normalize Billboard data
def select_billboard_columns(df, source_name):
    df = df.copy()
    df['source'] = source_name
    df['name_clean'] = df['Song'].apply(clean_name)
    df['artist_clean'] = df['Artist'].apply(normalize_artist_name)

    # Normalize Rank (1 = best, 0 = worst)
    max_rank = 100
    df['normalized_rank'] = 1 - ((df['Rank'] - 1) / (max_rank - 1))
    return df[['Date', 'Song', 'Rank', 'normalized_rank', 'Last Week', 'Weeks in Charts',
               'source', 'name_clean', 'artist_clean']]

# Apply cleaning and normalization
hot100_clean = select_billboard_columns(hot100_df, 'hot100')
radio_clean = select_billboard_columns(radio_df, 'radio')
digital_clean = select_billboard_columns(digital_df, 'digital')
streaming_clean = select_billboard_columns(streaming_df, 'streaming')

billboard_combined = pd.concat([hot100_clean, radio_clean, digital_clean, streaming_clean], ignore_index=True)

In [12]:
billboard_combined

Unnamed: 0,Date,Song,Rank,normalized_rank,Last Week,Weeks in Charts,source,name_clean,artist_clean
0,2013-01-23,Locked Out Of Heaven,1,1.000000,1,15,hot100,locked out of heaven,bruno mars
1,2013-01-23,Thrift Shop,2,0.989899,5,15,hot100,thrift shop,macklemore ryan lewis wanz
2,2013-01-23,Ho Hey,3,0.979798,4,32,hot100,ho hey,the lumineers
3,2013-01-23,I Knew You Were Trouble.,4,0.969697,3,13,hot100,i knew you were trouble,taylor swift
4,2013-01-23,Diamonds,5,0.959596,2,16,hot100,diamonds,rihanna
...,...,...,...,...,...,...,...,...,...
155495,2024-12-04,Popular,46,0.545455,46,-,streaming,popular,ariana grande
155496,2024-12-04,Please Please Please,47,0.535354,28,25,streaming,please please please,sabrina carpenter
155497,2024-12-04,It's Beginning To Look A Lot Like Christmas,48,0.525253,11,-,streaming,its beginning to look a lot like christmas,michael buble
155498,2024-12-04,Feliz Navidad,49,0.515152,7,-,streaming,feliz navidad,jose feliciano


In [13]:
# one-hot encoded source flags and peak position
source_flags = (
    billboard_combined
    .groupby(['name_clean', 'artist_clean', 'source'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

#  binary flags
for source in ['hot100', 'radio', 'streaming', 'digital']:
    if source in source_flags.columns:
        source_flags[f'hit_{source}'] = (source_flags[source] > 0).astype(int)
        source_flags.drop(columns=[source], inplace=True)

#  peak chart  
peak_rank_df = (
    billboard_combined
    .groupby(['name_clean', 'artist_clean'])
    .agg(
        peak_rank=('Rank', 'min'),                    
        peak_normalized=('normalized_rank', 'max')    
    )
    .reset_index()
)

#Merge grouped features with source flags and peak position
grouped_features = (
    billboard_combined
    .groupby(['name_clean', 'artist_clean'])
    .agg(
        weeks_in_charts_total=('Date', 'count'),
    )
    .reset_index()
)

# Add long-running hit flag (10+ weeks)
grouped_features['long_running_hit'] = (grouped_features['weeks_in_charts_total'] >= 10).astype(int)

# Merge everything
grouped_features = grouped_features \
    .merge(source_flags, on=['name_clean', 'artist_clean'], how='left') \
    .merge(peak_rank_df, on=['name_clean', 'artist_clean'], how='left')

# Fill any missing source flags 
for col in ['hit_hot100', 'hit_radio', 'hit_streaming', 'hit_digital']:
    if col in grouped_features.columns:
        grouped_features[col] = grouped_features[col].fillna(0).astype(int)

grouped_features

Unnamed: 0,name_clean,artist_clean,weeks_in_charts_total,long_running_hit,hit_hot100,hit_radio,hit_streaming,hit_digital,peak_rank,peak_normalized
0,0 to 100 the catch up,drake,53,1,1,1,1,1,19,0.818182
1,0000 zero oclock,bts,1,0,0,0,0,1,16,0.848485
2,020299,that mexican ot,3,0,1,0,0,0,74,0.262626
3,1 2 many,luke combs brooks dunn,2,0,1,0,0,1,5,0.959596
4,1 night,lil yachty,34,1,1,0,1,0,26,0.747475
...,...,...,...,...,...,...,...,...,...,...
8698,zombified,falling in reverse,1,0,0,0,0,1,35,0.656566
8699,zoo york,lil tjay fivio foreign pop smoke,2,0,1,0,1,0,37,0.636364
8700,zoom,future,1,0,1,0,0,0,99,0.010101
8701,zoom,lil uzi vert,1,0,1,0,0,0,92,0.080808


In [14]:
# First, ensure clean merge keys on both sides
spotify_df['spotify_name_clean'] = spotify_df['spotify_name_clean'].fillna("").astype(str)
spotify_df['spotify_artist_clean'] = spotify_df['spotify_artist_clean'].fillna("").astype(str)
grouped_features['name_clean'] = grouped_features['name_clean'].fillna("").astype(str)
grouped_features['artist_clean'] = grouped_features['artist_clean'].fillna("").astype(str)

# Merge Spotify with engineered Billboard features
spotify_merged = spotify_df.merge(
    grouped_features,
    how='left',
    left_on=['spotify_name_clean', 'spotify_artist_clean'],
    right_on=['name_clean', 'artist_clean']
)

# Fill Billboard-related missing values with 0 (for songs not found in Billboard)
fill_zero_cols = [
    'weeks_in_charts_total', 'long_running_hit',
    'hit_hot100', 'hit_radio', 'hit_streaming', 'hit_digital'
]
for col in fill_zero_cols:
    if col in spotify_merged.columns:
        spotify_merged[col] = spotify_merged[col].fillna(0).astype(int)

#keep 0 if na 
spotify_merged['peak_rank'] = spotify_merged['peak_rank'].fillna(0).astype(int)
spotify_merged['peak_normalized'] = spotify_merged['peak_normalized'].fillna(0.0)

# Drop now-redundant merge keys from Billboard
spotify_merged.drop(columns=['name_clean', 'artist_clean'], inplace=True)

spotify_merged.drop(columns=['name', 'artists', 'album_name'], inplace=True)

spotify_merged['hit'] = (
    spotify_merged[['hit_hot100', 'hit_radio', 'hit_streaming', 'hit_digital']].sum(axis=1) > 0
).astype(int)

In [15]:
spotify_merged['hit'].value_counts(normalize=True)

hit
0    0.991226
1    0.008774
Name: proportion, dtype: float64

In [16]:
# Check for duplicates in name_clean and artist_clean (should be unique at this point)
duplicates = grouped_features[
    grouped_features.duplicated(subset=['name_clean', 'artist_clean'], keep=False)
].sort_values(by=['name_clean', 'artist_clean'])

# Show count of duplicates by song-artist pair
duplicate_counts = (
    duplicates
    .groupby(['name_clean', 'artist_clean'])
    .size()
    .reset_index(name='duplicate_count')
    .sort_values(by='duplicate_count', ascending=False)
)

# Display
print("Potential duplicates found:", len(duplicates))
display(duplicate_counts.head())

Potential duplicates found: 0


Unnamed: 0,name_clean,artist_clean,duplicate_count


### Fixing key, loudness and mode 

In [17]:
print("Loudness min:", spotify_merged['loudness'].min())
print("Loudness max:", spotify_merged['loudness'].max())
print(spotify_merged['loudness'].value_counts())
print()
print(spotify_merged['mode'].value_counts())
print()
print(spotify_merged['key'].value_counts())
print()

Loudness min: -60.0
Loudness max: 4.882
loudness
-5.020     170
-5.131     166
-6.279     164
-5.926     163
-5.956     162
          ... 
-19.461      1
-26.656      1
 1.060       1
 0.982       1
-22.444      1
Name: count, Length: 27158, dtype: int64

mode
1        523385
0        263270
1.0       69710
Major     39508
0.0       38678
Minor     20756
Name: count, dtype: int64

key
7       94771
0       92881
2       86808
9       86169
1       69351
4       64616
5       62753
11      60749
6       48937
10      48499
8       46945
3       24176
0.0     12497
7.0     12323
2.0     11172
9.0     10870
1.0     10576
11.0     8786
5.0      8758
4.0      8476
6.0      7410
8.0      7108
C        7081
G        6909
10.0     6824
A        6131
C#       6001
D        5929
B        5109
F        4810
E        4558
F#       4116
G#       4031
A#       3847
3.0      3588
D#       1742
Name: count, dtype: int64



In [18]:
spotify_merged['loudness'] = pd.to_numeric(spotify_merged['loudness'], errors='coerce')
spotify_merged['loudness'] = spotify_merged['loudness'].clip(upper=0)

spotify_merged['mode'] = spotify_merged['mode'].astype(str).replace({
    'Major': 1, 'Minor': 0,
    '1.0': 1, '0.0': 0,
    '1': 1, '0': 0
}).astype(int)

key_mapping = {
    'C': 0, 'C#': 1, 'D': 2, 'D#': 3,
    'E': 4, 'F': 5, 'F#': 6, 'G': 7,
    'G#': 8, 'A': 9, 'A#': 10, 'B': 11
}

spotify_merged['key'] = spotify_merged['key'].replace(key_mapping)
spotify_merged['key'] = pd.to_numeric(spotify_merged['key'], errors='coerce').astype('Int64')

In [19]:
print("Loudness min:", spotify_merged['loudness'].min())
print("Loudness max:", spotify_merged['loudness'].max())
print(spotify_merged['loudness'].value_counts())
print()
print(spotify_merged['mode'].value_counts())
print()
print(spotify_merged['key'].value_counts())
print()

Loudness min: -60.0
Loudness max: 0.0
loudness
 0.000     444
-5.020     170
-5.131     166
-6.279     164
-5.926     163
          ... 
-21.211      1
-32.755      1
-25.008      1
-0.008       1
-29.540      1
Name: count, Length: 26778, dtype: int64

mode
1    632603
0    322704
Name: count, dtype: int64

key
7     114003
0     112459
2     103909
9     103170
1      85928
4      77650
5      76321
11     74644
6      60463
10     59170
8      58084
3      29506
Name: count, dtype: Int64



### Dropping Irrelevant Columns

In [20]:
spotify_merged.drop(['spotify_name_clean','spotify_artist_clean'], axis=1, inplace=True)

<hr>

# NLP processing

Detect language and only keep songs in English

In [21]:
tqdm.pandas()

# Path to cached English song IDs
english_ids_file = "english_ids.txt"

# Print initial stats before filtering
total_before = len(spotify_merged)
hits_before = spotify_merged['hit'].sum()
non_hits_before = (spotify_merged['hit'] == 0).sum()

print("Before filtering (all songs):")
print(f"Total songs: {total_before}")
print(f"Hits: {hits_before} ({hits_before / total_before:.2%})")
print(f"Non-hits: {non_hits_before} ({non_hits_before / total_before:.2%})")
print()

# Load cached English IDs
if os.path.exists(english_ids_file):
    with open(english_ids_file, 'r') as f:
        english_ids = set(line.strip() for line in f)
else:
    english_ids = set()

if not english_ids:
    # 1) Detect language
    def detect_language(text):
        try:
            return detect(text)
        except LangDetectException:
            return "unknown"

    spotify_merged['lang'] = spotify_merged['lyrics'].progress_apply(detect_language)
    
    # 2) Filter only English songs and reassign spotify_merged
    spotify_merged = spotify_merged[spotify_merged['lang'] == 'en'].copy()
    
    # 3) Cache the IDs
    english_ids = set(spotify_merged['id'].astype(str))
    with open(english_ids_file, 'w') as f:
        for song_id in english_ids:
            f.write(f"{song_id}\n")

    # 4) Drop the lang column now that we’ve filtered
    spotify_merged.drop(columns=['lang'], inplace=True)

else:
    # If we already have a cache, just filter by it
    spotify_merged = spotify_merged[
        spotify_merged['id'].astype(str).isin(english_ids)
    ].copy()
# Now spotify_merged contains English-only songs

# Stats after filtering
total_after = len(spotify_merged)
hits_after = spotify_merged['hit'].sum()
non_hits_after = (spotify_merged['hit'] == 0).sum()

print("\nAfter filtering (English songs only):")
print(f"Total songs: {total_after}")
print(f"Hits: {hits_after} ({hits_after / total_after:.2%})")
print(f"Non-hits: {non_hits_after} ({non_hits_after / total_after:.2%})")

Before filtering (all songs):
Total songs: 955307
Hits: 8382 (0.88%)
Non-hits: 946925 (99.12%)


After filtering (English songs only):
Total songs: 713620
Hits: 8181 (1.15%)
Non-hits: 705439 (98.85%)


NLP cleaning 

In [22]:
spotify_merged['lyrics_cleaned'] = spotify_merged['lyrics'].str.lower()
spotify_merged['lyrics_cleaned'] = spotify_merged['lyrics_cleaned'].str.replace(r'\s+', ' ', regex=True).str.strip()

# later, when we use BoW and TF-IDF, we also need to remove punctiuation, stopwords, and maybe lemmatize + stem -- this will be done in the model training!

In [23]:
#side-by-side comparison for the first 5 rows
spotify_merged[['lyrics', 'lyrics_cleaned']].head()

Unnamed: 0,lyrics,lyrics_cleaned
0,"He said he came from Jamaica,\n he owned a cou...","he said he came from jamaica, he owned a coupl..."
1,"Fucked a bitch, now she running with my kids\n...","fucked a bitch, now she running with my kids a..."
5,You like to stand on the other side\n Point an...,you like to stand on the other side point and ...
6,"It's been a while, shit I missed the rehab, ps...","it's been a while, shit i missed the rehab, ps..."
7,I would like to give you all my time\n I would...,i would like to give you all my time i would l...


In [24]:
spotify_merged.drop(columns=['lyrics'], inplace=True)
spotify_merged.rename(columns={'lyrics_cleaned': 'lyrics'}, inplace=True)

In [25]:
spotify_merged['lyrics'].isna().sum()

0

## Creating a set of lyric-agnostic features

| New column             | How it’s calculated (language-agnostic)                                                                                 | Why it might help predict a **hit** |
|------------------------|-------------------------------------------------------------------------------------------------------------------------|-------------------------------------|
| **`ly_num_chars`**     | `len(lyrics)` – total characters, including spaces & punctuation                                                        | Captures overall lyric length / “density”; extremely short or very long songs often chart differently. |
| **`ly_vocab_size`**    | • Tokenise lyrics with `\b\w+\b`  <br>• Count **unique** tokens (`len(set(tokens))`)                                    | Measures lexical richness; sparse vocabulary can indicate simple, catchy writing while richer vocab suggests complexity. |
| **`ly_ttr`**           | Type–Token Ratio = `ly_vocab_size / total_words`                                                                        | Quantifies repetition vs. variety; high repetition (low TTR) is common in radio-friendly hooks. |
| **`ly_top5_ratio`**    | 1. Count word frequencies (`Counter(tokens)`)<br>2. Sum counts of the **5** most common words<br>3. Divide by `total_words` | Direct signal of chorus or hook repetition (how dominant the top words are). |
| **`ly_ngram_repetition`** | For overlapping 4-word n-grams:<br>• Build list `[' '.join(tokens[i:i+4]) …]`<br>• `max(counts) / total_ngrams`         | Finds the strongest repeating phrase even without line breaks; high values often mark memorable refrains. |
| **`ly_rhyme_like_ratio`** | Scan adjacent word pairs; count pairs whose **last 2 chars match**; ratio = matches / (total_pairs)                    | Quick proxy for end-rhymes or alliteration, features linked to lyrical catchiness across languages. |


In [26]:
#Tokeniser that works for any language 
_token_re = re.compile(r"\b\w+\b", re.UNICODE)

# Feature-builder for one lyric string
def lyric_custom_feats(text: str):
    """
    Return dict with:
      ly_num_chars, ly_vocab_size, ly_ttr,
      ly_top5_ratio, ly_ngram_repetition, ly_rhyme_like_ratio
    """
    # Treat empty / NaN lyrics the same
    if not text or text.strip() == '':
        return {
            'ly_num_chars': 0,
            'ly_vocab_size': 0,
            'ly_ttr': 0.0,
            'ly_top5_ratio': 0.0,
            'ly_ngram_repetition': 0.0,
            'ly_rhyme_like_ratio': 0.0,
        }

    #  Word tokens 
    tokens = _token_re.findall(text.lower())
    n_words = len(tokens)
    vocab = Counter(tokens)

    # Basic counts
    num_chars     = len(text)
    vocab_size    = len(vocab)
    ttr           = vocab_size / n_words                           

    # Top-5 repetition ratio
    top_counts    = [c for _, c in vocab.most_common(5)]
    top5_ratio    = sum(top_counts) / n_words

    #  n-gram repetition (4-grams by default) 
    n = 4
    if n_words >= n:
        ngrams = [' '.join(tokens[i:i+n]) for i in range(n_words - n + 1)]
        ngram_counts = Counter(ngrams).values()
        ngram_repetition = max(ngram_counts) / len(ngrams)
    else:
        ngram_repetition = 0.0

    #  rhyme-like ratio (last two chars of adjacent words) 
    rhyming_pairs = 0
    for w1, w2 in zip(tokens, tokens[1:]):
        if len(w1) > 1 and len(w2) > 1 and w1[-2:] == w2[-2:]:
            rhyming_pairs += 1
    rhyme_ratio = rhyming_pairs / (n_words - 1) if n_words > 1 else 0.0

    return {
        'ly_num_chars': num_chars,
        'ly_vocab_size': vocab_size,
        'ly_ttr': ttr,
        'ly_top5_ratio': top5_ratio,
        'ly_ngram_repetition': ngram_repetition,
        'ly_rhyme_like_ratio': rhyme_ratio,
    }

# ------------------------------------------------------------------
# Vectorise across the whole DataFrame of 286k songs
#  
df_nlp = (
    spotify_merged['lyrics']                     
      .fillna('')                     
      .apply(lyric_custom_feats)      
      .apply(pd.Series)              
)

# ------------------------------------------------------------------
# Merge back (keeps index alignment)
spotify_merged = pd.concat([spotify_merged, df_nlp], axis=1)

# Final Dataset

Making the data smaller to reduce computational complexity

In [27]:
spotify_merged

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,peak_rank,peak_normalized,hit,lyrics,ly_num_chars,ly_vocab_size,ly_ttr,ly_top5_ratio,ly_ngram_repetition,ly_rhyme_like_ratio
0,0Prct5TDjAnEgIqbxcldY9,0.415,0.605,7,-11.157,1,0.0575,0.001160,0.838000,0.471,...,0,0.0,0,"he said he came from jamaica, he owned a coupl...",2341.0,211.0,0.422000,0.174000,0.008048,0.018036
1,2ASl4wirkeYm3OWZxXKYuq,0.788,0.648,7,-9.135,0,0.3150,0.900000,0.000000,0.176,...,0,0.0,0,"fucked a bitch, now she running with my kids a...",1280.0,90.0,0.311419,0.266436,0.031469,0.045139
5,5tA3ImW310llKo8EMBj2Ga,0.171,0.957,2,-5.749,1,0.1490,0.000029,0.000032,0.330,...,0,0.0,0,you like to stand on the other side point and ...,1050.0,127.0,0.564444,0.191111,0.013514,0.004464
6,0fROT4kK5oTm8xO8PX6EJF,0.823,0.612,1,-7.767,1,0.2480,0.168000,0.000000,0.109,...,0,0.0,0,"it's been a while, shit i missed the rehab, ps...",2655.0,222.0,0.402904,0.203267,0.014599,0.090909
7,1xBFhv5faebv3mmwxx7DnS,0.729,0.552,7,-8.562,0,0.0650,0.183000,0.000000,0.131,...,0,0.0,0,i would like to give you all my time i would l...,1612.0,141.0,0.377005,0.219251,0.013477,0.005362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955297,50G4xoTaf87f9qRBViB1XF,0.715,0.544,4,-9.431,0,0.0296,0.028800,0.016300,0.126,...,0,0.0,0,when days are short and nights are long i feel...,1073.0,85.0,0.357143,0.411765,0.025532,0.000000
955298,13vP95VYqHp3dN6TjGZ12s,0.411,0.854,6,-5.075,0,0.0371,0.052900,0.000095,0.302,...,0,0.0,0,"you know it's wrong, you know it's right no si...",1019.0,86.0,0.367521,0.269231,0.017316,0.012876
955299,1KFjbnC7IRtkCh5MEJhWt9,0.523,0.889,4,-6.191,1,0.1890,0.007440,0.000000,0.491,...,0,0.0,0,spend a lotta money on some damn clothes spend...,2906.0,203.0,0.332242,0.235679,0.006579,0.024590
955302,4zMgP1HJazJJdEf6AiG8M6,0.456,0.482,8,-11.199,1,0.0504,0.029700,0.000000,0.111,...,0,0.0,0,and all i am is a man i want the world in my h...,2803.0,169.0,0.275693,0.225122,0.013115,0.024510


In [28]:
# Print original info
original_counts = spotify_merged['hit'].value_counts()
original_total = len(spotify_merged)
original_ratio = (original_counts / original_total * 100).round(2)

print("Original dataset:")
print(f"Total rows: {original_total}")
print(f"Class 0: {original_counts[0]} ({original_ratio[0]}%)")
print(f"Class 1: {original_counts[1]} ({original_ratio[1]}%)")

# Stratified 5% sample
spotify_merged = spotify_merged.groupby('hit', group_keys=False).apply(lambda x: x.sample(frac=0.05, random_state=42))

# Print sampled info
sampled_counts = spotify_merged['hit'].value_counts()
sampled_total = len(spotify_merged)
sampled_ratio = (sampled_counts / sampled_total * 100).round(2)

print("\nSampled dataset (5% stratified):")
print(f"Total rows: {sampled_total}")
print(f"Class 0: {sampled_counts[0]} ({sampled_ratio[0]}%)")
print(f"Class 1: {sampled_counts[1]} ({sampled_ratio[1]}%)")

Original dataset:
Total rows: 713620
Class 0: 705439 (98.85%)
Class 1: 8181 (1.15%)

Sampled dataset (5% stratified):
Total rows: 35681
Class 0: 35272 (98.85%)
Class 1: 409 (1.15%)


In [29]:
spotify_merged.to_csv(OUTPUT_DATA_PATH, index=False)
spotify_merged

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,peak_rank,peak_normalized,hit,lyrics,ly_num_chars,ly_vocab_size,ly_ttr,ly_top5_ratio,ly_ngram_repetition,ly_rhyme_like_ratio
191062,0VdcSHXCPRfQQZYQPXSlZ9,0.364,0.126,7,-18.071,1,0.0286,0.86200,0.000203,0.0969,...,0,0.000000,0,i found her diary underneath a tree and starte...,928.0,85.0,0.435897,0.215385,0.015625,0.000000
135980,60NxxZzE3CFzuCgQmhqNi0,0.287,0.883,9,-6.383,1,0.0462,0.00116,0.000949,0.1550,...,0,0.000000,0,i get violent when i'm fucked up i get silent ...,619.0,76.0,0.550725,0.239130,0.007407,0.036496
23563,0BSPhsCKfwENstErymcD80,0.624,0.803,9,-5.107,0,0.2180,0.04530,0.000000,0.3180,...,0,0.000000,0,"the ghetto othello, the moor oh my god, they s...",5921.0,291.0,0.250000,0.161512,0.010336,0.012038
528493,5gGMmuBXnT9ywO8gklzWkc,0.639,0.711,0,-7.149,1,0.0273,0.10500,0.165000,0.2440,...,0,0.000000,0,i've got sunshine on a cloudy day. when it's c...,720.0,62.0,0.380368,0.355828,0.050000,0.037037
92271,1Lsrh6dwQW98CbWfCbwf57,0.458,0.266,7,-12.223,0,0.0316,0.16100,0.000000,0.0997,...,0,0.000000,0,"she puts on her work clothes, fishnet panty ho...",1024.0,109.0,0.547739,0.231156,0.015306,0.005051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373741,1VbhR6D6zUoSTBzvnRonXO,0.678,0.491,5,-5.815,1,0.0358,0.56500,0.000028,0.0772,...,2,0.989899,1,"you and me, we made a vow for better or for wo...",1117.0,91.0,0.339552,0.369403,0.026415,0.007491
422209,0M98PvIvx7vZ8LDpzMw1hB,0.738,0.861,2,-4.141,1,0.2370,0.32000,0.001750,0.3250,...,22,0.787879,1,"yeah, yeah, yeah blackpink in your area yeah, ...",1410.0,163.0,0.501538,0.212308,0.015528,0.083333
177666,5ETe7SElBIgm2NAZY3SpX4,0.476,0.991,10,-3.458,1,0.0520,0.04460,0.005740,0.1280,...,49,0.515152,1,long truck bed hop in it ♪ fire engine red lik...,1501.0,121.0,0.384127,0.193651,0.016026,0.000000
187154,5NU40QTlXrDUJzDBdv79bg,0.677,0.633,7,-7.831,1,0.0771,0.03140,0.000000,0.1120,...,76,0.242424,1,when i'm in my thoughts sometimes it's hard to...,1893.0,111.0,0.246667,0.368889,0.026846,0.022272


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=068aa808-4695-4d75-a14b-785a94b3cf38' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>