# Compressing and Cleaning Datasets

## Song data from Genius

Compressed the genius song data into only english songs, a range from 1980 to 2019, and views greater than or equal to 10000 and cleaned the lyrical column

In [3]:
import pandas as pd

cols = ['title', 'tag', 'artist', 'year', 'views','lyrics', 'language_cld3']


songs_df = pd.read_csv('../data/raw/song_lyrics.csv', usecols=cols, chunksize=100000)

filtered = []

for data in songs_df:
    data = data[(data['language_cld3'] == 'en') & ((data['year'] >= 1980) & (data['year'] <= 2019)) & (data['views'] >= 10000)]

    data['lyrics'] = data['lyrics'].fillna('').astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
    
    filtered.append(data)

df = pd.concat(filtered, ignore_index=True)

#songs_compressed = df.groupby('year').apply(lambda x: x.sample(n=min(len(x), 275), random_state=42, replace=False)).reset_index(drop=True)

df.to_csv('../data/raw/songs_compressed.csv', index=False)




FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/song_lyrics.csv'

## Merge Genius and Spotify Data & Clean Data

In [2]:
import pandas as pd
import re


def cleaned_title(title):
    t = title.lower().strip()
    t = re.sub(r"\((ft|feat|featuring)[^)]+\)","",t) # remove (feat. Artist)
    t = re.sub(r"(ft|feat|featuring)\s+[^-()]+", "", t) #remove feat. (without parantheses)
    t = re.sub(r"-\s*(edit|remastered|remaster|version|live).*","",t)
    t = " ".join(t.split()) # removing extra spaces
    return t

def cleaned_artist(artist):
    a = artist.lower().strip()
    a = " ".join(a.split()) # removing extra spaces

    return a


lyrics_df = pd.read_csv('../data/raw/songs_compressed.csv')
spotify_df = pd.read_csv('../data/raw/spotify.csv')


spotify_df['artist_list'] = spotify_df['artists'].apply(lambda x: [a.strip().strip("'\"") for a in x[1:-1].split(',')] if isinstance(x, str) else [x]) # converts the string representation of list to actual list of artist names
spotify_df = spotify_df.explode('artist_list') # makes sure that each artists in the list has its own row, so we can match them individually

spotify_df = spotify_df.rename(columns={'name': 'title', 'artist_list': 'artist'})


spotify_df['artist'] = spotify_df['artist'].astype(str)
spotify_df['title'] = spotify_df['title'].astype(str)
lyrics_df['artist'] = lyrics_df['artist'].astype(str)
lyrics_df['title'] = lyrics_df['title'].astype(str)

spotify_df['cleaned_artist'] = spotify_df['artist'].apply(cleaned_artist)
spotify_df['cleaned_title'] = spotify_df['title'].apply(cleaned_title)

lyrics_df['cleaned_artist'] = lyrics_df['artist'].apply(cleaned_artist)
lyrics_df['cleaned_title'] = lyrics_df['title'].apply(cleaned_title)    


merged_df = pd.merge(lyrics_df,spotify_df,on=['cleaned_title', 'cleaned_artist'],how='inner')


merged_df = merged_df.drop(columns=['cleaned_artist', 'cleaned_title', 'Unnamed: 0', 'id', 'artists_upd_v1',  'artists_upd_v2', 'artists_upd', 'artists_song', 'title_y', 'artist_y', 'popularity', 'duration_ms', 'explicit', 'artists', 'id_artists', 'release_date'])

merged_df = merged_df.rename(columns={'title_x': 'title','artist_x': 'artist','language_cld3': 'language','consolidates_genre_lists': 'spotify_genre_list','tag': 'genius_genre'})

merged_df = merged_df.drop_duplicates(subset=['title', 'artist'], keep='first')

merged_df.to_csv('../data/compressed/lyrics_spotify_features.csv', index=False)

print(merged_df.shape)
merged_df.head()


(18065, 20)


Unnamed: 0,title,genius_genre,artist,year,views,lyrics,language,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_genre_list
0,Can I Live,rap,JAY-Z,1996,468624,"[Produced by Irv Gotti] [Intro] Yeah, hah, yea...",en,0.628,0.692,9,-12.365,1,0.437,0.0823,0.0,0.161,0.575,76.44,4,"['pop_rap', 'rap', 'east_coast_hip_hop', 'hip_..."
1,Money On My Mind,rap,Lil Wayne,2005,128927,"[Intro] Yeah Money on my mind, money on my min...",en,0.535,0.772,1,-6.503,0,0.37,0.0127,0.0,0.11,0.661,152.173,4,"['trap', 'rap', 'pop_rap', 'hip_hop', 'new_orl..."
2,Mr. Carter,rap,Lil Wayne,2008,542488,[Produced by Infamous and Drew Correa] [Intro:...,en,0.485,0.71,7,-6.288,1,0.364,0.0444,0.0,0.35,0.473,170.942,4,"['trap', 'east_coast_hip_hop', 'rap', 'pop_rap..."
3,C.R.E.A.M.,rap,Wu-Tang Clan,1994,1984638,[Produced by RZA] [Intro: Raekwon & Method Man...,en,0.479,0.549,11,-10.551,0,0.373,0.57,0.0239,0.127,0.576,180.985,4,"['east_coast_hip_hop', 'gangster_rap', 'hardco..."
4,Barry Bonds,rap,Kanye West,2007,280626,[Verse 1: Kanye West] It's what you all been w...,en,0.48,0.624,1,-6.131,1,0.382,0.0451,0.0,0.337,0.704,165.057,4,"['chicago_rap', 'trap', 'rap', 'pop_rap', 'hip..."


## Billboard data