In [None]:
import pandas
import requests
import spotipy
import string
import re

from spotipy.oauth2 import SpotifyClientCredentials
from genderize import Genderize

"""
# Spacy approach didn't work well here
# From 'Song for Bob Dylan', extracted the name 'Song' :(
import spacy
import spacy.cli
spacy.cli.download('en_core_web_lg')
en = spacy.load('en_core_web_lg')

"""


In [None]:
# vars.env holds four settings, one per line, in this order:
# client id, client secret, playlist URI, playlist owner.
with open('vars.env', 'r') as f:
    my_id, my_secret, playlistURI, owner = (
        f.readline().rstrip('\n') for _ in range(4)
    )

# Authenticate with the client id / secret pair read above.
client_credentials_manager = SpotifyClientCredentials(
    client_id=my_id,
    client_secret=my_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# The playlist ID is taken to be the last 22 characters of the URI.
playlistID = playlistURI[-22:]

# Fetch the first page of tracks, then keep following each page's 'next'
# link until there is none, collecting every item into one list.
results = sp.user_playlist(owner, playlist_id=playlistID, fields='tracks, next')['tracks']
songs = results['items']
while results['next']:
    results = sp.next(results)
    songs += results['items']

# Map every contributor's user ID to their display name (one lookup per user).
idSet = {track['added_by']['id'] for track in songs}
us = {user_id: sp.user(user_id)['display_name'] for user_id in idSet}
us['rmb101'] = 'rmb101'  # hard-coded entry: this ID maps to itself

print(len(songs))

In [None]:
# Make the dataframe by pulling out the song name, artist and person who added it
# (only the first listed artist of each track is kept).
table = [
    (
        track['track']['name'],
        track['track']['artists'][0]['name'],
        us[track['added_by']['id']],
    )
    for track in songs
]
df = pandas.DataFrame(table, columns=['Song', 'Artist', 'Added by'])

# Build a cleaned, lower-cased copy of the title:
#   - get rid of anything after ' - ', as this is all '2012 remastered' kinda text
#   - also remove all apostrophe-s and digits
# The result is assigned back to the column instead of calling
# `df['Text'].replace(..., inplace=True)`: that form mutates a temporary
# Series and leaves `df` unchanged when pandas Copy-on-Write is active.
# The raw string avoids the invalid '\d' escape of a plain string literal.
df['Text'] = df['Song'].str.lower().replace(r"\d| - .*|'s", '', regex=True)

# set up regex to kill off remaining punctuation
punclist = string.punctuation
punc = re.compile('[%s]' % re.escape(punclist))

# bring in list of common words (ie not names) to remove
# also a manual list of words I don't want to pick up as names
words = pandas.read_csv('top5000.csv')
wordlist = words['Top words'].tolist()
# 'sue' is taken out of the common-word list, presumably so it can still be
# picked up as a name; raises ValueError if the CSV does not contain it.
wordlist.remove('sue')
blocklist = ['rigby', 'marmalade', 'runaround', 'dont', 'hasty', 'mambo', 'holland', 'speedy', 'lind', 'stars', 'o', 'ode', 'punk', 'rocker']
wordlist.extend(blocklist)

# namesOnly (defined next) removes the punctuation and common words and
# returns a list, because each remaining word is gender-assigned separately.
# nameSet collects every distinct candidate word so genderize is never
# called on the same name twice.
nameSet = set()

def namesOnly(row):
    """Return the words of ``row['Text']`` that are not in ``wordlist``.

    Characters matched by the module-level ``punc`` pattern are deleted
    first, then the text is split on whitespace.  Every word that survives
    the ``wordlist`` filter is also added to the module-level ``nameSet``.

    Returns the surviving words as a list, in their original order.
    """
    stripped = punc.sub('', row['Text'])
    candidates = [token for token in stripped.split() if token not in wordlist]
    # The shared set is mutated in place, never rebound, so no `global` is needed.
    nameSet.update(candidates)
    return candidates

# Build the per-title list of candidate name words, one row at a time.
# Side effect: namesOnly also adds every candidate word to nameSet.
df['ShortTitle'] = df.apply(namesOnly, axis = 1)

In [None]:
# Ask genderize about every candidate word collected in nameSet.
nameScores = Genderize().get(nameSet)

# Keep a name only when a gender came back and its probability is at least 0.7.
genders = {
    score['name']: score['gender']
    for score in nameScores
    if score['gender'] and score['probability'] >= 0.7
}


In [None]:
# Hand-assigned genders for titles the automatic lookup can't guess.
# Keys are cleaned titles (the form stored in df['Text']).
manual = {
    'chun-li': 'female',
    'lady marmalade': 'female',
    "l'autre valse d'amélie": 'female',
    'mr. bojangles': 'male',
    'i loves you porgy': 'male',
    'rihanna': 'female',
    'dear rose': 'female',
    'van gogh ear': 'male',
    "rockin' robin": 'female',
    'gurdjieff daughter': 'female',
    'seigfried': 'male',
    'saint dymphna': 'female',
}

def identify(name):
    """Guess the gender referred to by one song title.

    Parameters
    ----------
    name : mapping or DataFrame row
        Must provide 'ShortTitle' (an iterable of candidate words) and
        'Text' (the cleaned title, used as the key into ``manual``).

    Returns
    -------
    str or set
        * the single gender, when all recognised words agree;
        * ``manual[name['Text']]``, or '' if there is no such entry, when
          none of the words is in ``genders``;
        * the set of conflicting genders otherwise.
    """
    guesses = {genders[word] for word in name['ShortTitle'] if word in genders}
    if not guesses:
        # Nothing recognised: fall back to the hand-written table.
        # dict.get replaces a bare `except:` so that unrelated errors
        # (e.g. a missing 'Text' column) are no longer silently hidden.
        return manual.get(name['Text'], '')
    if len(guesses) == 1:
        return guesses.pop()
    return guesses

# Run identify() on every row. Per identify(), each result is a gender
# string, '' when nothing matched, or a set when the words disagree.
df['Guesses'] = df.apply(identify, axis = 1)


In [None]:
# Rows still needing attention: the guess, as text, is neither exactly
# 'male' nor exactly 'female'.
df.loc[
    ~df['Guesses'].astype(str).isin(['male', 'female']),
    ['Text', 'ShortTitle', 'Guesses'],
]



In [None]:
# now for artists (argh)


# Not really working

def entity(text):
    """Return the first entity labelled 'PERSON' found in ``text['Song']``.

    The title is passed to the module-level callable ``en`` and the
    ``.ents`` of its result are scanned in order.  Returns the first entity
    whose ``label_`` is 'PERSON', or None when there is none.

    NOTE(review): ``en`` is not assigned by any live code in the visible
    notebook; its only assignment sits inside the disabled string block
    near the imports, so this presumably raises NameError unless that
    setup is re-enabled.
    """
    people = (ent for ent in en(text['Song']).ents if ent.label_ == 'PERSON')
    return next(people, None)

# Run entity() over every row and store what it returns in a new column.
df['SongNames'] = df.apply(entity, axis = 1)
# Preview the first 50 rows.
df.head(50)