In [1]:
import pandas as pd
import missingno
import ast
import sys
import pickle
from zipfile import ZipFile

In [2]:
# Load the listening history and the user profiles straight from the zip archive.
zip_name = "lastfm-dataset-1K.zip"
with ZipFile(zip_name, 'r') as z:
    folder = "lastfm-dataset-1K/"

    # Listening history: tab-separated, no header row, six fields per line.
    # `on_bad_lines='warn'` replaces `error_bad_lines=False` (removed in
    # pandas 2.0): lines with the wrong field count are skipped and reported.
    song_file = folder + "userid-timestamp-artid-artname-traid-traname.tsv"
    with z.open(song_file) as song_handle:
        songs = pd.read_csv(song_handle, sep='\t',
                            on_bad_lines='warn',
                            header=None, parse_dates=[1],
                            dtype={0: 'category', 2: 'category', 3: 'category',
                                   4: 'category', 5: 'category'})
    songs.columns = ['user_id', 'timestamp', 'artist_id',
                     'artist_name', 'track_id', 'track_name']

    # User profiles: tab-separated with a header row; the '#id' column is
    # renamed to 'user_id' so it matches the key used in `songs`.
    user_file = folder + "userid-profile.tsv"
    column_types = {'#id': 'category', 'gender': 'category',
                    'age': 'float32', 'country': 'category'}
    with z.open(user_file) as user_handle:
        users = pd.read_csv(user_handle, sep='\t', parse_dates=[4],
                            dtype=column_types, on_bad_lines='warn') \
                  .rename(columns={'#id': 'user_id'})
    

b'Skipping line 2120260: expected 6 fields, saw 8\n'
b'Skipping line 2446318: expected 6 fields, saw 8\n'
b'Skipping line 11141081: expected 6 fields, saw 8\n'
b'Skipping line 11152099: expected 6 fields, saw 12\nSkipping line 11152402: expected 6 fields, saw 8\n'
b'Skipping line 11882087: expected 6 fields, saw 8\n'
b'Skipping line 12902539: expected 6 fields, saw 8\nSkipping line 12935044: expected 6 fields, saw 8\n'
b'Skipping line 17589539: expected 6 fields, saw 8\n'


In [25]:
# Load the per-song tag table from its zip archive.
zip_name = "data/song_tags_combined.zip"
with ZipFile(zip_name, 'r') as z:
    folder = "song_tags_combined/"
    column_types = {'artist_name': 'category', 'track_name': 'category',
                    'tags': 'category'}
    file = folder + "song_tags.csv"
    # `on_bad_lines='warn'` replaces `error_bad_lines=False` (removed in
    # pandas 2.0): malformed lines are skipped and reported.
    with z.open(file) as tags_handle:
        song_tags = pd.read_csv(tags_handle, dtype=column_types, on_bad_lines='warn')

In [26]:
# Preview the first rows of the freshly loaded tag table.
song_tags.head()

Unnamed: 0,artist_name,track_name,tags
0,Boy Division,Love Will Tear Us Apart,"[{'count': 100, 'name': 'rock', 'url': 'https:..."
1,Death Cab For Cutie,Soul Meets Body,"[{'count': 100, 'name': 'indie', 'url': 'https..."
2,The Knife,Heartbeats,"[{'count': 100, 'name': 'electronic', 'url': '..."
3,Muse,Starlight,"[{'count': 100, 'name': 'alternative rock', 'u..."
4,The Killers,When You Were Young,"[{'count': 100, 'name': 'rock', 'url': 'https:..."


# Top N Tags, All tags in one column

In [27]:
# NOTE: `all` shadows the built-in of the same name. The name is kept because
# the cell that writes the tag set to disk reads it to choose the file name.
all = False
n = sys.maxsize if all else 1


def join_top_tags(raw_tags, artist_name, limit):
    """Return the first `limit` tag names of one song as a ', '-joined string.

    `raw_tags` is the value of the 'tags' column. When its text form starts
    with '[' it is parsed as a Python literal list of dicts that each carry a
    'name' key; otherwise the text form is returned unchanged. Tag names are
    lower-cased, and a tag equal to the lower-cased artist name is dropped.
    """
    text = str(raw_tags)
    if not text.startswith('['):
        return text
    artist = str(artist_name).lower()
    # Parse first and lower-case afterwards: lower-casing the raw text would
    # corrupt escape sequences such as '\U0001f600' and literals like None.
    names = (str(entry['name']).lower()
             for entry in ast.literal_eval(text)[:limit])
    return ", ".join(name for name in names if name != artist)


# Build the whole column in one pass and assign it once; writing cell by cell
# with `.loc` inside `iterrows()` is far too slow on a frame of this size.
song_tags['tag_names'] = [
    join_top_tags(raw_tags, artist_name, n)
    for raw_tags, artist_name in zip(song_tags['tags'], song_tags['artist_name'])
]

song_tags

KeyboardInterrupt: 

# Top N Tags, Column per tag

In [10]:
# NOTE: `all` shadows the built-in of the same name. The name is kept because
# the cell that writes the tag set to disk reads it to choose the file name.
all = False
n = sys.maxsize if all else 3

# Collect every value to write as {column: {row label: value}} so that each
# column is written once, instead of one slow `.loc` write per cell.
pending = {}
for label, raw_tags in song_tags['tags'].items():
    text = str(raw_tags)
    if text.startswith('['):
        # Parse first and lower-case afterwards: lower-casing the raw text
        # would corrupt escape sequences and literals such as None.
        for position, entry in enumerate(ast.literal_eval(text)[:n]):
            name = entry['name']
            if isinstance(name, str):
                name = name.lower()
            pending.setdefault('tag_' + str(position), {})[label] = name
    else:
        # Rows whose tags are not a list keep their text form in 'tag_names'.
        pending.setdefault('tag_names', {})[label] = text

# Only the collected rows are touched; other rows keep their current value
# (or NaN when the column is new).
# Assumes the row labels of `song_tags` are unique -- TODO confirm.
for column, cells in pending.items():
    values = pd.Series(cells, dtype=object)
    if column in song_tags.columns:
        song_tags.loc[values.index, column] = values.to_numpy()
    else:
        song_tags[column] = values

song_tags

KeyboardInterrupt: 

# Build the set of tag names

In [13]:
# NOTE: `all` shadows the built-in of the same name. The name is kept because
# the cell that writes the tag set to disk reads it to choose the file name.
all = False
n = sys.maxsize if all else 10

# Gather the names of the first `n` tags of every song, original casing kept.
# Iterating over the distinct values of the column is enough for a set and
# avoids re-parsing the same tag string for every duplicate row.
tag_set = set()
for raw_tags in song_tags['tags'].unique():
    text = str(raw_tags)
    if text.startswith('['):
        tag_set.update(entry['name'] for entry in ast.literal_eval(text)[:n])

tag_set

{'spooky songs',
 'canciones tesoro',
 'dance-punk',
 'The Damage Manual I wish I had',
 'love vent',
 'Kicks ass',
 'pee-why',
 'incredibly cool and different',
 'Great Psyche',
 'game boy',
 'Hitchhikers Guide to the Galaxy',
 'road tripping',
 'In a Nutshell',
 'lay ya ass down',
 'jesse james',
 'Favorit songs',
 'For the club',
 'motion',
 'j4zz r0ck',
 'Benny Benassi',
 'Los Toreros',
 'ice mc',
 'piedpp tagged',
 'Dancefloor Mayhem',
 'genres: rock and roll 50s - 60s',
 'Awesome solo',
 'frenchrap',
 'Rachael Starr Till there was you',
 'Deutsch Gut',
 'cendres ascendantes',
 'pale',
 'cool trip',
 'awnnnnnnnn those days are so gone',
 'somekinda all-ages party type deal',
 'highly syncopated drum beats',
 'feel it',
 'proto hip-hop',
 'florida songs',
 'Better off Dead',
 'acoustic dreamy',
 'sidereal space',
 'best of 2002',
 'rapider than horsepower',
 'baroque rock',
 'Electric Eel Shock',
 'indie instrumental chill out',
 'Daniel Merriweather',
 'teqqen',
 'Moszeed unter St

In [14]:
# Write the textual form of the tag set to a file named after the `all` flag.
fname = "all_tags.txt" if all else "top_tags.txt"

# Tag names are free-form text, so the encoding is fixed to UTF-8 rather than
# left to the platform default.
with open(fname, "w", encoding="utf-8") as output:
    output.write(str(tag_set))

# Merge song tags with listening history

In [None]:
# Preview the first rows of the tag table.
song_tags.head()

In [None]:
# Drop the raw tag list and the per-tag columns in one call.
# `errors='ignore'` keeps the cell re-runnable and usable when the per-tag
# columns were never created.
song_tags = song_tags.drop(columns=['tags', 'tag_0', 'tag_1', 'tag_2'],
                           errors='ignore')

In [None]:
# Drop the raw tag list. `errors='ignore'` avoids a KeyError when the column
# has already been removed by an earlier cell or a previous run of this one.
song_tags = song_tags.drop(columns=['tags'], errors='ignore')

In [None]:
# Rename the 'tag_names' column to 'top_tag', then preview the result.
song_tags = song_tags.rename(columns={"tag_names": "top_tag"})
song_tags.head()

In [None]:
# Inner-join the listening history with the tag table on artist and track
# name; plays without a matching row in `song_tags` are left out.
new_df = songs.merge(song_tags, how='inner', on=['artist_name', 'track_name'])

In [None]:
# Preview the first rows of the merged frame.
new_df.head()

In [None]:
# Persist the merged frame as a pickle file.
merged_pickle_path = "data/pickle/song_tags_top1"
new_df.to_pickle(merged_pickle_path)