In [1]:
import json
import re
import os

### Find saved lyrics files

In [2]:
artist_regex = re.compile(r'Lyrics_(\w*)\.json')

In [3]:
lyrics_path = 'lyrics'

In [3]:
def merge_json_files(filenames: list, newfilename: str):
    """
    Parameters:
    -----------
    filenames: list of str
        list of sub file names with artist lyrics
    newfilename: str
        name of new combined json file
    """
    result = list()
    for fname in filenames:
        with open(fname, 'r') as subfile:
            result.append(json.load(subfile))
            
    with open(f'{newfilename}.json', 'w') as output_file:
        json.dump(result, output_file)

In [10]:
filenames = []

for lyrics_file in os.listdir(lyrics_path):
    if lyrics_file == '.ipynb_checkpoints':
        continue
    filenames.append(f'lyrics/{lyrics_file}')

In [11]:
filenames

['lyrics/Lyrics_2Pac.json',
 'lyrics/Lyrics_Eminem.json',
 'lyrics/Lyrics_IceCube.json',
 'lyrics/Lyrics_JAYZ.json',
 'lyrics/Lyrics_KanyeWest.json',
 'lyrics/Lyrics_KendrickLamar.json',
 'lyrics/Lyrics_MFDOOM.json',
 'lyrics/Lyrics_Nas.json']

In [12]:
newfilename = 'rap_10_combined'

In [13]:
merge_json_files(filenames = filenames,
                 newfilename = newfilename)

### Open combined file, create multi-artist dataframe

In [14]:
json_file = open(f'{newfilename}.json')
artists_obj = json.load(json_file)
json_file.close()

In [16]:
artists_obj[0]['name']

'2Pac'

In [47]:
import pandas as pd

In [56]:
artists_df = pd.DataFrame(columns=['Artist', 'Song Title', 'Song Lyrics', 'Artist Image'])

In [57]:
artists_df

Unnamed: 0,Artist,Song Title,Song Lyrics,Artist Image


In [58]:
for artist in artists_obj:
    artist_name = artist['name']
    for song in artist['songs']:
        new_row = {
            'Artist': artist_name,
            'Song Title': song['title'],
            'Song Lyrics': song['lyrics'],
            'Artist Image': artist['image_url']
        }
        artists_df = pd.concat([artists_df, pd.DataFrame([new_row])], ignore_index=True)

In [59]:
artists_df.head()

Unnamed: 0,Artist,Song Title,Song Lyrics,Artist Image
0,2Pac,Hit ’Em Up,460 ContributorsTranslationsFrançaisAzərbaycan...,https://images.genius.com/46cb64bfbd6e4b6e247a...
1,2Pac,Changes,323 ContributorsTranslationsالعربيةChanges Lyr...,https://images.genius.com/46cb64bfbd6e4b6e247a...
2,2Pac,Keep Ya Head Up,257 ContributorsKeep Ya Head Up Lyrics\nLittle...,https://images.genius.com/46cb64bfbd6e4b6e247a...
3,2Pac,Hail Mary,294 ContributorsTranslationsEnglishAzərbaycanc...,https://images.genius.com/46cb64bfbd6e4b6e247a...
4,2Pac,Dear Mama,217 ContributorsTranslationsPortuguêsDeutschEn...,https://images.genius.com/46cb64bfbd6e4b6e247a...


#### Cleanup genius metadata

In [52]:
genius_regex_text = r'(\d*.*)ContributorsTranslations(.*)Lyrics\n((.|\n|\r)*)\n*(\d*Embed)'
genius_regex = re.compile(genius_regex_text)

In [34]:
# artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.lower()

In [60]:
artists_df['Song Lyrics'].replace({genius_regex_text : r'\3'}, inplace=True, regex=True)

In [61]:
artists_df.head()

Unnamed: 0,Artist,Song Title,Song Lyrics,Artist Image
0,2Pac,Hit ’Em Up,(Sucka-ass)\nI ain't got no mothafuckin' frien...,https://images.genius.com/46cb64bfbd6e4b6e247a...
1,2Pac,Changes,"Ooh, yeah (Ooh)\n(Come on, come on)\n\nI see n...",https://images.genius.com/46cb64bfbd6e4b6e247a...
2,2Pac,Keep Ya Head Up,257 ContributorsKeep Ya Head Up Lyrics\nLittle...,https://images.genius.com/46cb64bfbd6e4b6e247a...
3,2Pac,Hail Mary,"\nMakaveli in this, Killuminati\nAll through y...",https://images.genius.com/46cb64bfbd6e4b6e247a...
4,2Pac,Dear Mama,"You are appreciated\n\nWhen I was young, me an...",https://images.genius.com/46cb64bfbd6e4b6e247a...


In [62]:
test_lyrics = artists_df['Song Lyrics'][0]
print(test_lyrics)

(Sucka-ass)
I ain't got no mothafuckin' friends
That's why I fucked yo' bitch, you fat mothafucka!
(Take money) Westside, Bad Boy killas
(Take money) (You know) You know who the realest is
(Take money) Niggas, we bring it too
That's a'ight, haha
(Take money) Haha
First off, fuck yo' bitch and the clique you claim
Westside when we ride, come equipped with game
You claim to be a player, but I fucked your wife
We bust on Bad Boys, niggas fucked for life
Plus, Puffy tryna see me, weak hearts I rip
Biggie Smalls and Junior M.A.F.I.A., some mark-ass bitches
We keep on comin' while we runnin' for your jewels
Steady gunnin', keep on bustin' at them fools, you know the rules
Lil' Caesar, go ask your homie how I'll leave ya
Cut your young-ass up, leave you in pieces, now be deceased
Lil' Kim, don't fuck around with real G's
Quick to snatch yo' ugly ass off the streets, so fuck peace!
I'll let them niggas know it's on for life
Don't let the Westside ride tonight (Hahaha)
Bad Boy murdered on wax a

#### Cleanup numbers, newlines and others

In [64]:
artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.lower()

In [67]:
artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.strip()

In [68]:
artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.replace('\n', ' ')

In [74]:
artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.replace('(', '')
artists_df['Song Lyrics'] = artists_df['Song Lyrics'].str.replace(')', '')

In [75]:
artists_df['Song Lyrics'][0]

"sucka-ass i ain't got no mothafuckin' friends that's why i fucked yo' bitch, you fat mothafucka! take money westside, bad boy killas take money you know you know who the realest is take money niggas, we bring it too that's a'ight, haha take money haha first off, fuck yo' bitch and the clique you claim westside when we ride, come equipped with game you claim to be a player, but i fucked your wife we bust on bad boys, niggas fucked for life plus, puffy tryna see me, weak hearts i rip biggie smalls and junior m.a.f.i.a., some mark-ass bitches we keep on comin' while we runnin' for your jewels steady gunnin', keep on bustin' at them fools, you know the rules lil' caesar, go ask your homie how i'll leave ya cut your young-ass up, leave you in pieces, now be deceased lil' kim, don't fuck around with real g's quick to snatch yo' ugly ass off the streets, so fuck peace! i'll let them niggas know it's on for life don't let the westside ride tonight hahaha bad boy murdered on wax and killed fuc

### Save df to file

In [76]:
artists_df.to_csv('artists_10_cleaned.csv')

In [77]:
artists_df_loaded = pd.read_csv('artists_10_cleaned.csv', index_col=0)

In [78]:
artists_df_loaded.tail()

Unnamed: 0,Artist,Song Title,Song Lyrics,Artist Image
155,Nas,Daughters,92 contributorsdaughters lyrics check it out ...,https://images.genius.com/23061dd2dc7e863127db...
156,Nas,Cherry Wine,"87 contributorscherry wine lyrics where is he,...",https://images.genius.com/23061dd2dc7e863127db...
157,Nas,Hate Me Now,117 contributorshate me now lyrics escobar se...,https://images.genius.com/23061dd2dc7e863127db...
158,Nas,EPMD 2,respectfully bucket on low like erick and par...,https://images.genius.com/23061dd2dc7e863127db...
159,Nas,The Don,95 contributorsthe don lyrics he's the heartbe...,https://images.genius.com/23061dd2dc7e863127db...


## Merge files genre by genre

In [4]:
artist_regex = re.compile(r'Lyrics_(\w*)\.json')

lyrics_path = 'lyrics'

In [None]:
# newfilename = 'rap_10_combined'

get genre folder names

In [5]:
folder_name_regex = re.compile(r'.*/(.*)')

In [6]:
os.getcwd()

'C:\\Users\\User\\Desktop\\programowanie_web_etc\\python_projects\\data_projects\\lyrics_analysis'

In [9]:
folder_paths = []
folder_names = []

for lyrics_folder in os.listdir(lyrics_path):
    if lyrics_folder == '.ipynb_checkpoints':
        continue
    folder_paths.append(f'{lyrics_path}/{lyrics_folder}')
    folder_names.append(lyrics_folder)

In [11]:
folder_names

['pop', 'rap', 'rock', 'soul']

In [15]:
for folder_index, folder in enumerate(folder_paths):
    filenames = []
    
    for lyrics_file in os.listdir(folder):
        if lyrics_file == '.ipynb_checkpoints':
            continue
        filenames.append(f'{folder}/{lyrics_file}')

    file_len = len(filenames)
    newfilename = f'{folder}/{folder_names[folder_index]}_combined_{file_len}'
    
    merge_json_files(filenames = filenames,
                    newfilename = newfilename )

In [12]:
# test_folder_name = 'lyrics/rap'
# folder_name_regex = re.compile(r'.*/(.*)')
# folder_name_regex.findall(test_folder_name)[0]

### Function to create dataframes from genre folders

In [23]:
import pandas as pd

combined_path = 'lyrics/combined'

# artists_df = pd.DataFrame(columns=['Artist', 'Song Title', 'Song Lyrics', 'Artist Image'])

In [27]:
os.getcwd()

'C:\\Users\\User\\Desktop\\programowanie_web_etc\\python_projects\\data_projects\\lyrics_analysis'

In [38]:
def create_dataframes_from_combined(combined_path):
    for genre in os.listdir(combined_path):
        genre_path = f'{combined_path}/{genre}'
        json_file = open(genre_path)
        genre_obj = json.load(json_file)
        json_file.close()

        genre_df = pd.DataFrame(columns=['Artist', 'Song Title', 'Full Title',
                                         'Release Date', 'Year', 'Month', 'Day',
                                         'Pageviews', 'url', 'featured_count',
                                         'producer_count', 'writer_count',
                                         'Song Lyrics', 'Artist Image'])
        
        for artist in genre_obj:
            artist_name = artist['name']
            for song in artist['songs']:
                release_date = song.get('release_date_components', None)
                year = None
                month = None
                day = None
                if release_date:
                    year = release_date.get('year', None)
                    month = release_date.get('month', None)
                    day = release_date.get('day', None)
                
                stats = song['stats']
                
                if song.get('producer_artists'):
                    producer_count = len(song.get('producer_artists'))
                else:
                    producer_count = 0

                if song.get('writer_artists'):
                    writer_count = len(song.get('writer_artists'))
                else:
                    writer_count = 0

                if song.get('featured_artists'):
                    featured_count = len(song.get('featured_artists'))
                else:
                    featured_count = 0
                
                new_row = {
                    'Artist': artist_name,
                    'Song Title': song['title'],
                    'Full Title': song['full_title'],
                    'Release Date': song['release_date'],
                    'Year': year,
                    'Month': month,
                    'Day': day,
                    'Pageviews': stats.get('pageviews', 0),
                    'url': song['url'],
                    'featured_count': featured_count,
                    'producer_count': producer_count,
                    'writer_count': writer_count,
                    'Song Lyrics': song['lyrics'],
                    'Artist Image': artist['image_url']
                }
                
                genre_df = pd.concat([genre_df, pd.DataFrame([new_row])],
                                    ignore_index=True)
        genre_len = len(genre_obj)
        genre_df.to_csv(f'{genre}_{genre_len}.csv')

In [39]:
create_dataframes_from_combined(combined_path)

### Function to cleanup the saved dataframes

In [40]:
genius_regex_text = r'(\d*.*)ContributorsTranslations(.*)Lyrics\n((.|\n|\r)*)\n*(\d*Embed)'
genius_regex = re.compile(genius_regex_text)

genius_regex_lowercase_text = r'(\d*.*)contributorstranslations(.*)lyrics\n((.|\n|\r)*)\n*(\d*embed)'
genius_regex_lowercase = re.compile(genius_regex_lowercase_text)

In [None]:
# artists_df['Song Lyrics'].replace({genius_regex_text : r'\3'}, inplace=True, regex=True)

In [43]:
def cleanup_genius_lyrics(df, genius_regex, remove_newlines=True,
                         remove_question=True, remove_interp=True,
                         remove_dollar=True):
    df['Song Lyrics'] = df['Song Lyrics'].str.lower()

    df['Song Lyrics'].replace({genius_regex : r'\3'}, inplace=True, regex=True)
    
    df['Song Lyrics'] = df['Song Lyrics'].str.strip()

    if remove_newlines:
        df['Song Lyrics'] = df['Song Lyrics'].str.replace('\n', ' ')
    
    df['Song Lyrics'] = df['Song Lyrics'].str.replace('(', '')
    df['Song Lyrics'] = df['Song Lyrics'].str.replace(')', '')
    df['Song Lyrics'] = df['Song Lyrics'].str.replace('-', '')

    if remove_question:
        df['Song Lyrics'] = df['Song Lyrics'].str.replace('?', '')
    
    if remove_dollar:
        df['Song Lyrics'] = df['Song Lyrics'].str.replace('$', '')

    if remove_interp:
        df['Song Lyrics'] = df['Song Lyrics'].str.replace(',', '')
        df['Song Lyrics'] = df['Song Lyrics'].str.replace('.', '')
        df['Song Lyrics'] = df['Song Lyrics'].str.replace('!', '')
        df['Song Lyrics'] = df['Song Lyrics'].str.replace(':', '')
        df['Song Lyrics'] = df['Song Lyrics'].str.replace(';', '')
    
    return df

In [42]:
dataframes_path = 'dataframes'

In [45]:
for frame in os.listdir(dataframes_path):
    if frame == '.ipynb_checkpoints':
        continue
    frame_path = f'{dataframes_path}/{frame}'
    dataframe = pd.read_csv(frame_path, index_col=0)
    
    df = cleanup_genius_lyrics(dataframe, genius_regex_lowercase_text)
    df.to_csv(f'{dataframes_path}/cleaned_{frame}')