In [None]:
import pandas
import requests
import spotipy
import string
import re

from spotipy.oauth2 import SpotifyClientCredentials
from genderize import Genderize
from numpy import nan

# Authenticate from offline file
# vars.env is read as four lines, in this order:
#   client id, client secret, playlist URI, playlist owner
with open('vars.env', 'r') as f:
    # strip() instead of rstrip('\n') so a Windows '\r' or trailing space
    # does not end up inside the credentials
    my_id = f.readline().strip()
    my_secret = f.readline().strip()
    playlistURI = f.readline().strip()
    owner = f.readline().strip()

# Get playlist from Spotify
client_credentials_manager = SpotifyClientCredentials(client_id=my_id, client_secret=my_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Pull the 22-character ID that follows 'playlist:' or 'playlist/', so share
# links with a trailing '?si=...' also work; otherwise fall back to the
# original "last 22 characters" rule
id_match = re.search(r'playlist[:/]([0-9A-Za-z]{22})', playlistURI)
playlistID = id_match.group(1) if id_match else playlistURI[-22:]

results = sp.user_playlist(owner, playlist_id=playlistID, fields='tracks, next')['tracks']
songs = results['items']

# Page through songs in playlist
while results['next']:
    results = sp.next(results)
    songs.extend(results['items'])

# Work out who added each song and get their display names
# (fall back to the raw user id when the profile has no display name)
idSet = {track['added_by']['id'] for track in songs}
us = {user_id: sp.user(user_id)['display_name'] or user_id for user_id in idSet}
# Manual override kept from the original notebook
us['rmb101'] = 'rmb101'

In [None]:
# Make the dataframe by pulling out the song name, artist and person who added it
# Playlist items with no 'track' payload are skipped rather than crashing the comprehension
table = [(track['track']['name'], track['track']['artists'][0]['name'], us[track['added_by']['id']])
         for track in songs if track.get('track')]
df = pandas.DataFrame(table, columns=['Song', 'Artist', 'Added by'])

# Get rid of anything after a hyphen, as this is all '2012 remastered' kinda text
# also remove all apostrophe s and digits
# (raw string so '\d' is a regex class, not an invalid string escape; assigning the
# result avoids in-place mutation of a column, which is unreliable in pandas)
df['Text'] = df['Song'].str.lower().str.replace(r"\d| - .*|'s", '', regex=True)

# set up regex to kill off remaining punctuation
punclist = string.punctuation
punc = re.compile('[%s]' % re.escape(punclist))

# bring in list of common words (ie not names) to remove
# also a manual list of words I don't want to pick up as names
words = pandas.read_csv('top5000.csv')
# 'sue' is a common word but also a name, so keep it out of the removal list;
# filtering (rather than list.remove) does not raise if it is absent
wordlist = [word for word in words['Top words'].tolist() if word != 'sue']
blocklist = ['rigby', 'marmalade', 'runaround', 'dont', 'hasty', 'mambo', 'holland', 'speedy', 'lind', 'stars', 'o', 'ode', 'punk', 'rocker']
wordlist.extend(blocklist)

# function to remove the punctuation and common words
# returns as list because I'm going to gender assign each word
# names set is so I don't need to call genderize on the same name twice
nameSet = set()

def namesOnly(row):
    """Return the words of row['Text'] that are not in the common-word list.

    Punctuation is removed with the module-level `punc` pattern, the text is
    split on whitespace, and every word found in `wordlist` is dropped.
    The surviving words are also added to the module-level `nameSet`.
    """
    stripped = punc.sub('', row['Text'])
    newTitle = []
    for token in stripped.split():
        if token in wordlist:
            continue
        newTitle.append(token)
    # nameSet is only mutated, never rebound, so no `global` statement is needed
    nameSet.update(newTitle)
    return newTitle

df['ShortTitle'] = df.apply(namesOnly, axis=1)

# Now add names from artists to the name set and tidy them the same way
# Just grab first word from each artist and check if it's a first name
# The punctuation strip is applied to the column itself so that the value sent
# to Genderize and the value looked up later in df['FirstName'] are identical
df['FirstName'] = (
    df['Artist']
    .str.lower()
    .str.split(n=1)
    .str[0]
    .str.replace(r"\d| - .*|'s", '', regex=True)
    .str.replace(punc, '', regex=True)
)

# Skip missing/empty first names and anything in the common-word list
for name in df['FirstName'].dropna().unique():
    if name and name not in wordlist:
        nameSet.add(name)
    


In [None]:
# Hand the whole set of names to the Genderize client, then keep only the
# entries that have a gender and a probability of at least 0.7
nameScores = Genderize().get(nameSet)
genders = {
    entry['name']: entry['gender']
    for entry in nameScores
    if entry['gender'] and entry['probability'] >= 0.7
}


In [None]:
# Hand-maintained genders, keyed by the tidied song text, for titles that the
# word-by-word lookup cannot resolve
manual = {
    'chun-li': 'female',
    'lady marmalade': 'female',
    "l'autre valse d'amélie": 'female',
    'mr. bojangles': 'male',
    'i loves you porgy': 'male',
    'rihanna': 'female',
    'dear rose': 'female',
    'van gogh ear': 'male',
    "rockin' robin": 'female',
    'gurdjieff daughter': 'female',
    'seigfried': 'male',
    'saint dymphna': 'female',
}

# Function to check words in song titles against Genderize list
def identifySong(name, gender_lookup=None, manual_lookup=None):
    """Guess the gender referred to by a song title.

    Parameters
    ----------
    name : mapping-like row
        Must support name['ShortTitle'] (an iterable of words) and
        name['Text'] (the tidied song title).
    gender_lookup : dict, optional
        Word -> gender mapping. Defaults to the module-level `genders`.
    manual_lookup : dict, optional
        Tidied title -> gender overrides. Defaults to the module-level `manual`.

    Returns
    -------
    str or set
        The single gender if every recognised word agrees; the set of genders
        if the recognised words disagree; otherwise the manual override for
        name['Text'], or '' when there is none.
    """
    if gender_lookup is None:
        gender_lookup = genders
    if manual_lookup is None:
        manual_lookup = manual

    guesses = {gender_lookup[word] for word in name['ShortTitle'] if word in gender_lookup}

    if len(guesses) == 1:
        # The row is not modified here: under DataFrame.apply the row is a
        # copy, so writing to it had no effect on the frame
        return guesses.pop()
    if guesses:
        return guesses
    # dict.get replaces the bare `except:` that hid every kind of error
    return manual_lookup.get(name['Text'], '')

# Simpler function to get genders for artists from the Genderize list
def identifyArtist(name, gender_lookup=None):
    """Return the gender recorded for the row's artist first name.

    Parameters
    ----------
    name : mapping-like row
        Must support name['FirstName'].
    gender_lookup : dict, optional
        Word -> gender mapping. Defaults to the module-level `genders`.

    Returns
    -------
    str
        The gender stored for name['FirstName'], or '' if it is not a key.
    """
    if gender_lookup is None:
        gender_lookup = genders
    return gender_lookup.get(name['FirstName'], '')

# Apply the functions
df['SongGender'] = df.apply(identifySong, axis=1)
df['ArtistGender'] = df.apply(identifyArtist, axis=1)

# Label artists identified so far as solo
# A single .loc assignment replaces the chained `df.Type[mask] = ...`, which
# triggers SettingWithCopyWarning and may write to a temporary copy
df['Type'] = ''
df.loc[df['ArtistGender'] != '', 'Type'] = 'solo'

# Use a local csv of genders for bands
# NOTE(review): the code below relies on Unknowns.csv providing 'Artist',
# 'Type' and 'ArtistGender' columns -- confirm against the file
unknowns = pandas.read_csv('Unknowns.csv')

# Join the csv up with the main table
# Columns present in both frames come back suffixed _x (from df) and _y (from
# the csv); blanks become NaN so combine_first can fill them from the csv
merged = df.merge(unknowns, 'left', on='Artist')
merged = merged.replace('', nan)
merged['Type'] = merged['Type_x'].combine_first(merged['Type_y'])
merged['ArtistGender'] = merged['ArtistGender_x'].combine_first(merged['ArtistGender_y'])



In [None]:
# Keep only the columns wanted in the report and write them to disk
# (index=False leaves the row index out of the csv)
results = merged[
    ['Song', 'Artist', 'Added by', 'SongGender', 'ArtistGender', 'Type']
]
results.to_csv('SongGenders.csv', index=False)