# 15688 Project - Lyrics Generator & Classifier

“Music can change the world because it can change people,” said Bono, the legendary U2 rocker. A beautiful song usually has memorable lyrics that can sometimes change people. However, writing good lyrics is not an easy task.

The aim of our project is to create a lyric generation model based on existing lyrics of different music genres - pop, rock, hip hop, etc - using machine learning algorithms that are common in natural language processing.

## Step 1. Data Collection

In order to train the lyric model, the first step is to collect lyrics by genre. We collected the names of male and female artists from [music.163.com](https://music.163.com/#/discover/artist/cat?id=2001) by copying the information from the webpage and saving it as CSV files.

After getting the artists' names, we use the [musixmatch](http://api.musixmatch.com/ws/1.1/) API to collect the genres and song names of each artist. The results are exported as CSV files so that we can count the most frequent genres among all the songs we collected.

In [4]:
'''
15688 final project - lyric generator

data collection

retrieve the artists, genres and tracks and export to csv file

API used: musixmatch Developer
documentation: https://developer.musixmatch.com/documentation

'''
import os
import json
import requests
import pandas as pd

# load api key
# Strip surrounding whitespace: a key file typically ends with a newline,
# which would otherwise be sent to the API as part of the key.
with open("../musicmatch_api.key",'r') as f:
    api = f.read().strip()

# Base URL shared by every musixmatch endpoint used below.
# NOTE(review): plain http sends the API key unencrypted - consider https.
root = "http://api.musixmatch.com/ws/1.1/"

def get_artist(api, pageNum, page_size=100, country = "us"):

    '''
    getting top artists and their genres
    Args:
        api: API key
        pageNum: number of result pages to request (pages 1..pageNum)
        page_size: the page size for paginated results. Range is 1 to 100
        country: country of the artist ranking
    Return:
        df: a pandas dataframe containing artists, genres and genre id
            (one row per artist/genre pair; empty but with the same three
            columns when nothing was found)
        all_genres: a set of all genres related to the artists found
    '''
    columns = ["artist", "genre", "genre_id"]
    result = []
    all_genres = set()
    for page in range(1, pageNum + 1):
        param = {
            "apikey":api,
            # pass the caller's country; the literal string "country" used
            # to be sent here, so the argument was silently ignored
            "country": country,
            "page": page,
            "page_size": page_size,
            "format": "json"
        }

        singers = requests.get(root + "chart.artists.get?", params = param)
        response = json.loads(singers.content)
        # a missing or empty artist list is treated as "no artists on this page"
        artist_list = response.get("message").get("body").get("artist_list") or []

        for entry in artist_list:
            artist = entry.get("artist")
            name = artist.get('artist_name')
            genres = artist.get("primary_genres").get("music_genre_list")
            # one output row per (artist, genre) pair
            for g in genres:
                music_genre = g.get("music_genre")
                genre = music_genre.get("music_genre_name")
                genre_id = music_genre.get("music_genre_id")
                all_genres.add(genre)
                result.append({"artist":name, "genre":genre, "genre_id":genre_id})

    # naming the columns up front keeps the frame well-formed even when
    # result is empty (selecting them afterwards raised KeyError)
    df = pd.DataFrame(result, columns=columns)
    return df, all_genres


def get_artist_genre(api, all_artist_list):

    '''
    look up each given artist and record the genres of the best match

    Args:
        api: API key
        all_artist_list: list of all the artists (names to search for)
    Return:
        df: a pandas dataframe containing artists, genres and genre id
        all_genres: a set of all genres related to the artists found

    '''
    rows = []
    all_genres = set()

    for query in all_artist_list:
        # search for the artist by name, asking for the first 10 matches
        param = {
            "apikey":api,
            "page":1,
            "page_size":10,
            "q_artist":query
        }
        search_result = requests.get(root + "artist.search?", params = param)
        response = json.loads(search_result.content)
        matches = response.get("message").get("body").get("artist_list")

        # only the first search hit is used; artists without hits are skipped
        if matches:
            best = matches[0].get("artist")
            name = best.get('artist_name')
            for entry in best.get("primary_genres").get("music_genre_list"):
                info = entry.get("music_genre")
                genre = info.get("music_genre_name")
                all_genres.add(genre)
                rows.append({
                    "artist":name,
                    "genre":genre,
                    "genre_id":info.get("music_genre_id")
                })

    df = pd.DataFrame(rows)

    if df.empty:
        print("result is an empty dataframe")
    else:
        df = df[["artist", "genre", "genre_id"]]
    return df, all_genres

def get_songs(api, artist_df, page_size = 100):


    '''
    getting track names by artists and genre id

    Args:
        api: API key
        artist_df: dataframe with columns of artist, genre and genre id
        page_size: the page size for paginated results. Range is 1 to 100
    Return:
        df: a pandas dataframe containing artists, genres, genre id and the
        track names with lyrics under that genre by the artist. Only the
        first result page is requested, so there are at most page_size
        tracks per artist/genre row. The frame is empty (but keeps its four
        columns) when no track was found.

    '''
    columns = ["artist", "genre", "genre_id", "track_name"]
    result = []

    for _, row in artist_df.iterrows():
        param = {
                "apikey":api,
                "q_artist": row['artist'],
                "f_music_genre_id": row['genre_id'], # filter by genre id
                "f_has_lyrics":"True", # only get tracks with lyrics
                "page": 1,
                "page_size": page_size
            }

        singer = requests.get(root + "track.search?", params = param)
        response = json.loads(singer.content)
        # a missing track list is treated as "no tracks" instead of crashing
        song_list = response.get("message").get("body").get("track_list") or []
        for song in song_list:
            result.append(
                {
                "artist":row["artist"],
                "genre":row["genre"],
                "genre_id":row["genre_id"],
                "track_name":song.get("track").get("track_name")
                })

    # naming the columns up front keeps the frame well-formed even when
    # result is empty (selecting them afterwards raised KeyError)
    return pd.DataFrame(result, columns=columns)


# The driver below used to be indented underneath get_songs' return statement,
# which made it unreachable; it now runs when the file is executed as a script.
if __name__ == "__main__":
    #Step 1. get artists and their genres
    # load the first 50 rows of the artist csv and flatten them column by
    # column into one list of names (the original note mentions roughly
    # 1,300 artists - presumably 50 rows times the number of columns)
    artist_df = pd.read_csv("./csv_files/all_female_artists.csv", header = None)[:50]
    artists_list = []
    for col in artist_df.columns.values:
        # skip blank cells so NaN is never sent to the API as an artist name
        artists_list += list(artist_df[col].dropna())

    artist_genre_df, all_genres = get_artist_genre(api, artists_list)
    artist_genre_df.to_csv("./csv_files/all_female_artist_genre.csv",index = False)

    #Step 2. get songs by artists and genres
    artist_df = pd.read_csv("./csv_files/all_female_artist_genre.csv")[:1000]
    print(artist_df.shape)
    song_df = get_songs(api, artist_df)
    song_df.to_csv("./csv_files/all_female_artist_genre_track.csv", index = False)

With all the song names, we use [lyricwikia](https://github.com/enricobacis/lyricwikia) package in Python to collect the lyrics. The package can be installed with pip.

```bash
pip install lyricwikia
```


In [None]:
import lyricwikia as ly
import argparse

#request lyric song by song
#row by row in the dataframe
def getLyrics(songs):
    '''
    request the lyric of every song, row by row, and store it in a
    'lyric' column of the dataframe

    Args:
        songs: dataframe with at least the columns 'artist' and 'track_name'
    Return:
        songs: the same dataframe, modified in place; rows whose lyric
        could not be fetched get no value in the 'lyric' column
    '''
    print("Total songs number:" + str(songs.shape[0]))
    for i, (index, row) in enumerate(songs.iterrows()):
        # progress report every 100 songs
        if i%100 == 0:
            print("Processing song [" + str(i) + "]")

        try:
            lyric = ly.get_lyrics(row['artist'], row['track_name'],
                                  linesep='\n', timeout=None)
        except Exception:
            # best effort: skip songs whose lyric cannot be fetched.
            # Exception (not a bare except) so Ctrl-C still stops the loop.
            continue
        songs.loc[index,'lyric'] = lyric
    return songs


def run(oriFile, newFile):
    '''
    read the track csv, fetch the lyrics and write the result csv

    Args:
        oriFile: path of the csv file with the tracks (read as ISO-8859-1)
        newFile: path of the csv file to write; rows containing any
        missing value are dropped before writing
    '''
    tracks = pd.read_csv(oriFile, encoding = "ISO-8859-1")
    with_lyrics = getLyrics(tracks).dropna()
    with_lyrics.to_csv(newFile)

def main():
    '''
    command line entry point: parse the input/output paths and run the
    lyric collection

    Options:
        -f / --filename: path of the csv file of the lyric track file
        -o / --output: output path of the lyric file
    Both options are required; argparse exits with a usage message when
    one of them is missing.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str,
        required=True, help='input the path of the csv file of the lyric track file.')
    parser.add_argument('-o', '--output', type=str,
        required=True, help='the output path of the lyric file.')

    args = parser.parse_args()

    run(args.filename, args.output)


# Only parse the command line when executed as a script, so that importing
# this code does not trigger argparse and exit.
if __name__ == "__main__":
    main()



From the csv file of songs and their genres, we found the top 3 genres are:

* Pop
* Hip Hop/Rap
* Rock

We will train the model based on these three genres. Therefore, we will extract and generate the dataset of lyrics of each genre.

In [None]:
import numpy as np

def split_lyrics(csv_path):
    '''
    split a lyric csv file into one dataframe per genre

    Args:
        csv_path: path of the lyric csv file; its first column is dropped
        and it must contain a 'genre' column
    Return:
        a list of three dataframes holding the rows of the genres
        'Pop', 'Rock' and 'Hip Hop/Rap', in that order
    '''
    wanted_genres = ('Pop', 'Rock', 'Hip Hop/Rap')

    # discard the first column of the file before filtering
    lyrics = pd.read_csv(csv_path).iloc[:, 1:]

    return [lyrics[lyrics['genre'] == name] for name in wanted_genres]



if __name__ == "__main__":
    # one dataframe per genre for each of the two source files; both lists
    # come from split_lyrics, so matching positions hold the same genre
    df_female = split_lyrics('../csv_files/all_female_artist_lyrics.csv')
    df_male = split_lyrics('../csv_files/all_male_artist_lyrics.csv')

    for d, f in zip(df_female,df_male):
        df = pd.concat([d,f])
        # nothing to export when neither file has songs of this genre
        if df.empty:
            continue
        # read the genre from the combined frame by column name: indexing
        # the female subset by position raised IndexError when it was empty
        genre = df['genre'].iloc[0]
        # make the genre name safe to use inside a file name
        genre = genre.replace(" ", "_").replace("/", "_")
        df.to_csv('../csv_files/lyrics_' + genre +".csv", index = False)


We are now ready to train the model with 3 datasets consisting of lyrics of different genres. 