In [95]:
import pandas as pd
import numpy as np
import json
import re

In [96]:
# Dataset obtained from: https://www.cs.cornell.edu/~arb/data/genius-expertise/

# Annotation Dataset

In [None]:
# Read the annotation dump: every line of the file is parsed as its own JSON
# document, and the parsed records become the rows of annotations_df.
with open('./data/genius-expertise/annotation_info.json', encoding='utf-8') as annotation_file:
    annotation_records = [json.loads(raw_line) for raw_line in annotation_file]
annotations_df = pd.DataFrame(annotation_records)

In [None]:
# Initial look: display the first three rows of the freshly loaded annotations.
annotations_df.head(3)

In [None]:
# Only annotations that carry an edit history (a non-null edits_lst) are of
# interest; a single edit per annotation is extracted in a later cell.
# Row counts are printed before and after the filter.
rows_before = len(annotations_df)
print('Number of Annotated Songs: {}'.format(rows_before))

annotations_df = annotations_df.dropna(subset=['edits_lst'], axis=0)

rows_after = len(annotations_df)
print('Number of Annotated Songs after Filters: {}'.format(rows_after))

In [None]:
# Looking at an example: the raw 'content' of the first edit (index 0) stored
# for the row at position 10, before any cleaning is applied.
annotations_df.iloc[10]['edits_lst'][0]['content']

In [None]:
# Matches a single markup tag. The negated character class (unlike '.') also
# matches newlines, so a tag whose attributes wrap onto the next line is still
# removed; it also refuses to cross another '<', so a stray '<' in the text
# cannot swallow everything up to the next '>'.
_HTML_TAG_RE = re.compile(r'<[^<>]*>')


def clean_edits_lst(x):
    """Return the plain text of the first edit in an annotation's edit history.

    Parameters
    ----------
    x : sequence of dict
        Edit history of one annotation. Only ``x[0]['content']`` is read.
        NOTE(review): whether index 0 is the most recent edit is not visible
        from this notebook -- confirm against the dataset description.

    Returns
    -------
    str
        The first edit's content with markup tags stripped and newline
        characters deleted (they are removed, not replaced by a space).
        An empty edit history yields ``''`` instead of raising IndexError.

    Raises
    ------
    KeyError
        If the first edit has no ``'content'`` entry.
    """
    # Nothing to extract from an empty history.
    if len(x) == 0:
        return ''
    # get first instance of edit
    text = x[0]['content']
    # extract text body from brackets (https://stackoverflow.com/a/12982689/21492082)
    text = _HTML_TAG_RE.sub('', text)
    # remove end of line
    return text.replace('\n', '')

In [None]:
# Looking at a cleaned example: the same row (position 10) as the raw example
# above, passed through clean_edits_lst.
print(clean_edits_lst(annotations_df.iloc[10]['edits_lst']))

In [None]:
# Run the cleaner over the whole column and store the result back in place.
# NOTE: re-running this cell on already-cleaned text fails, because
# clean_edits_lst expects the original list of edit dicts, not a string.
cleaned_edits = annotations_df['edits_lst'].apply(clean_edits_lst)
annotations_df['edits_lst'] = cleaned_edits

In [None]:
# Write the cleaned annotations to CSV without the row index.
# NOTE(review): to_csv does not create missing directories, so the target
# folder has to exist before this cell runs.
annotation_csv_path = './data/genius-expertiste_clean/annotation.csv'
annotations_df.to_csv(annotation_csv_path, index=False)

# Lyrics Dataset

In [None]:
# Read the lyrics dump: every line of the file is parsed as its own JSON
# document, and the parsed records become the rows of lyrics_df.
with open('./data/genius-expertise/lyrics.jl', encoding='utf-8') as lyrics_file:
    lyric_records = [json.loads(raw_line) for raw_line in lyrics_file]
lyrics_df = pd.DataFrame(lyric_records)

In [None]:
# Looking at an example: the raw lyrics text of the row at position 10.
lyrics_df.iloc[10]['lyrics']

In [None]:
# Looks like not much is required other than removing the end-of-line
# characters; preview the same row with every newline replaced by a space.
lyrics_df.iloc[10]['lyrics'].replace('\n', ' ')

In [None]:
# Flatten every lyric onto a single line (each newline becomes one space),
# store the result back in the column, then display the frame.
single_line_lyrics = lyrics_df['lyrics'].apply(lambda text: text.replace('\n', ' '))
lyrics_df['lyrics'] = single_line_lyrics
lyrics_df

In [None]:
# Cleaning the song column: build songs_dict, a lookup from a song slug to a
# space-separated title, using the song list stored with each artist record.

songs_dict = {}
with open('./data/genius-expertise/artist_info.json', encoding='utf-8') as f:
    # Iterate the file handle directly (one JSON record per line) rather than
    # loading every line into a list first.
    for raw_line in f:
        artist_record = json.loads(raw_line)
        # The artist slug is the same for every song of this record, so it is
        # looked up once per record instead of once per song.
        artist = artist_record['url_name']
        for song in artist_record['songs']:
            # Skip slugs containing any of these markers (presumably
            # collaborations, MTV performances and alternate versions --
            # TODO confirm against the data).
            if '-and-' in song or 'mtv' in song or 'version' in song:
                continue
            # Only slugs that contain the artist slug are handled, and the
            # first record that supplies a given slug wins.
            if artist in song and song not in songs_dict:
                # Blank out the artist slug, split on '-', and drop the first
                # and last pieces (presumably the leftover artist placeholder
                # and a trailing suffix -- verify against the data); the
                # remaining pieces are joined with spaces.
                songs_dict[song] = ' '.join(song.replace(artist, ' ').split('-')[1:-1])

In [None]:
# Attach the cleaned title for every song slug present in songs_dict; slugs
# without an entry become NaN. dropna() then removes every row that has a
# missing value in any column, which includes those unmatched songs.
lyrics_df['song_clean'] = lyrics_df['song'].map(songs_dict)
lyrics_df = lyrics_df.dropna()

In [None]:
# Preview the first rows of the lyrics frame after adding song_clean and
# dropping rows with missing values.
lyrics_df.head()

In [None]:
# Write the cleaned lyrics to CSV without the row index.
lyrics_csv_path = './data/genius-expertiste_clean/lyrics.csv'
lyrics_df.to_csv(lyrics_csv_path, index=False)

# Inner Join of Datasets

In [None]:
# Reload both cleaned tables from the CSV files written earlier in the
# notebook, so the join starts from what is saved on disk.
annotation_csv_path = './data/genius-expertiste_clean/annotation.csv'
lyrics_csv_path = './data/genius-expertiste_clean/lyrics.csv'

annotations_df = pd.read_csv(annotation_csv_path)
lyrics_df = pd.read_csv(lyrics_csv_path)

In [None]:
# Inner join on the song slug: only songs present in both tables are kept.
songs = pd.merge(lyrics_df, annotations_df, on='song', how='inner')

In [None]:
# Multiple people can submit annotations per song; keep only the highest-voted
# row for each song, in case the annotation column is needed later.
#
# idxmax() is taken on `songs` itself so that it returns index *labels*, which
# is what .loc expects. Deliberately no reset_index() here: that would turn
# the result into row positions, which only coincide with the labels while
# `songs` still carries the fresh 0..n-1 index from the merge -- re-running
# the cell would then select the wrong rows or raise KeyError.
top_voted_labels = songs.groupby('song')['votes'].idxmax()
songs = songs.loc[top_voted_labels]

In [None]:
# Preview the merged frame after keeping one (highest-voted) row per song.
songs.head()

In [None]:
# Keep only the two columns needed for the summarization task, drop rows with
# a missing value in either of them, and give the columns their final names.
# ('lyrics_x' is the lyrics column contributed by the left side of the merge.)
songs = (
    songs[['lyrics_x', 'song_clean']]
    .dropna()
    .rename(columns={'lyrics_x': 'lyrics', 'song_clean': 'songs'})
)
songs.head()

In [None]:
# Write the final lyrics/title table to CSV without the row index.
songs_csv_path = './data/genius-expertiste_clean/songs.csv'
songs.to_csv(songs_csv_path, index=False)

# Final Dataset for Summarization

In [None]:
# Preview the final two-column dataset ('lyrics', 'songs').
songs.head()