### Preprocessing

In [1]:
import os
from pathlib import Path
import pandas as pd

In [2]:
# Load the raw artist and lyrics CSV exports from the `data` directory that
# sits next to the current working directory's parent.

data_path = Path.cwd().parent / 'data'
artist_file = 'artists-data.csv'
lyrics_file = 'lyrics-data.csv'

print('Loading artist and lyrics data into memory...')
artist_df = pd.read_csv(data_path / artist_file)
lyrics_df = pd.read_csv(data_path / lyrics_file)
print('Done!')

Loading artist and lyrics data into memory...
Done!


### Artists

In [3]:
# Structural overview of the artist table: dimensions, column dtypes, the
# first row, summary statistics, and the number of missing values per column.
print(f"Shape: {artist_df.shape}")
print(f"Rows: {len(artist_df)}, Columns: {len(artist_df.columns)}")
display(
    artist_df.dtypes,
    artist_df.head(1),
    artist_df.describe(),
)
print("null?")
print(artist_df.isna().sum())

Shape: (4168, 5)
Rows: 4168, Columns: 5


Artist         object
Genres         object
Songs         float64
Popularity    float64
Link           object
dtype: object

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/


Unnamed: 0,Songs,Popularity
count,4167.0,4166.0
mean,91.021358,1.831325
std,96.655819,8.036739
min,0.0,0.0
25%,26.0,0.0
50%,64.0,0.0
75%,123.0,1.0
max,1029.0,205.5


null?
Artist        1
Genres        5
Songs         1
Popularity    2
Link          1
dtype: int64


### Lyrics

In [4]:
# Structural overview of the lyrics table: dimensions, column dtypes, the
# first row, summary statistics, and the number of missing values per column.
print(f"Shape: {lyrics_df.shape}")
print(f"Rows: {len(lyrics_df)}, Columns: {len(lyrics_df.columns)}")
display(
    lyrics_df.dtypes,
    lyrics_df.head(1),
    lyrics_df.describe(),
)
print("null?")
print(lyrics_df.isna().sum())

Shape: (379931, 5)
Rows: 379931, Columns: 5


ALink       object
SName       object
SLink       object
Lyric       object
language    object
dtype: object

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt


Unnamed: 0,ALink,SName,SLink,Lyric,language
count,379930,379928,379930,379854,365296
unique,4238,267258,379892,371181,52
top,/anjos-anonimos/,Intro,/roy-orbison/i-get-so-sentimental.html,Instrumental,en
freq,1029,116,2,2087,191814


null?
ALink           1
SName           3
SLink           1
Lyric          77
language    14635
dtype: int64


In [5]:
# Report the number of distinct non-null values in every object-typed column,
# first for the artist table and then for the lyrics table.
for label, frame in (("Artist", artist_df), ("Lyrics", lyrics_df)):
    print(label)
    distinct_counts = frame.select_dtypes(include='object').nunique()
    for col, n_unique in distinct_counts.items():
        print(f"{col}: {n_unique} unique values")

Artist
Artist: 4167 unique values
Genres: 1750 unique values
Link: 4167 unique values
Lyrics
ALink: 4238 unique values
SName: 267258 unique values
SLink: 379892 unique values
Lyric: 371181 unique values
language: 52 unique values


In [6]:
# Keep only songs whose language tag is exactly 'en'. Rows with a missing
# language tag are dropped too, since a missing value never equals 'en'.
print("Original count:", len(lyrics_df))
lyrics_df_en = lyrics_df.loc[lyrics_df['language'].eq('en')].copy()
print("English count:", len(lyrics_df_en))

# Sanity check: the filter must have removed at least one row.
assert lyrics_df_en.shape[0] < lyrics_df.shape[0]

Original count: 379931
English count: 191814


In [7]:
# Attach artist name and genres to every English song.
# The join keys are the artist link columns with leading/trailing '/'
# characters stripped, so both sides compare on the bare artist slug
# (e.g. '/ivete-sangalo/' -> 'ivete-sangalo').
lyrics_df_en['ALink_clean'] = lyrics_df_en['ALink'].str.strip('/')
artist_df['Link_clean'] = artist_df['Link'].str.strip('/')

merged_df = lyrics_df_en.merge(
    artist_df[['Link_clean', 'Artist', 'Genres']],
    left_on='ALink_clean',
    right_on='Link_clean',
    # Inner join: songs whose artist link has no match in artist_df are dropped.
    how='inner',
    # Many songs belong to one artist. Raise a MergeError instead of silently
    # duplicating song rows if an artist link ever appears more than once.
    validate='many_to_one',
)
merged_df.head(1)

Unnamed: 0,ALink,SName,SLink,Lyric,language,ALink_clean,Link_clean,Artist,Genres
0,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en,ivete-sangalo,ivete-sangalo,Ivete Sangalo,Pop; Axé; Romântico


In [8]:
# Derive each song's primary genre: the first entry of the ';'-separated
# `Genres` string, with surrounding whitespace removed.
merged_df['PrimaryGenre'] = (
    merged_df['Genres']
    .fillna('')
    .str.split(';').str[0]
    .str.strip()
    # Turn blank genres into missing values. `mask` is used rather than
    # `.replace('', None)` because pandas releases before 1.4 ignored an
    # explicit None there and forward-filled the previous row's value,
    # which would silently assign the wrong genre.
    .mask(lambda genre: genre.eq(''))
)

# Song counts per primary genre (missing genres are excluded by value_counts).
genre_counts = merged_df['PrimaryGenre'].value_counts()
print("Top 10 genres by song count:")
print(genre_counts.head(10))

# Number of merged songs that have no lyric text at all.
num_null_lyrics = merged_df['Lyric'].isna().sum()
print(f"count missing lyric: {num_null_lyrics}")

Top 10 genres by song count:
PrimaryGenre
Rock                25177
Pop                 13759
Heavy Metal         13496
Indie               12998
Rap                  9589
Pop/Rock             9019
Hip Hop              8412
Country              7377
Rock Alternativo     5555
R&B                  5309
Name: count, dtype: int64
count missing lyric: 0


In [9]:
# Genres to keep in the exported dataset.
genres = ['Pop', 'Heavy Metal', 'Indie']

# Select songs in the chosen genres that have lyric text, keep only the
# title / genre / lyric columns, and give them shorter names.
genre_lyric_df = (
    merged_df.loc[
        merged_df['PrimaryGenre'].isin(genres) & merged_df['Lyric'].notna(),
        ['SName', 'PrimaryGenre', 'Lyric'],
    ]
    .rename(columns={
        'SName': 'song_name',
        'PrimaryGenre': 'genre',
        'Lyric': 'lyrics',
    })
    .copy()
)
genre_lyric_df.head(1)

Unnamed: 0,song_name,genre,lyrics
0,Careless Whisper,Pop,I feel so unsure\nAs I take your hand and lead...


In [10]:
# Write the filtered dataset into the data directory; the row index is omitted.
genre_lyric_df.to_csv(data_path / 'genre_lyric-data.csv', index=False)