## standard imports

In [2]:
import ast
import json
import pprint
import re
import sqlite3 as sql
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
#from termcolor import colored

## Loading the pitchfork reviews

In [15]:
# Load the Pitchfork reviews and collect each distinct artist name once.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you trust.
df = pd.read_pickle("complete pitchfork.pkl")
bands = pd.Series(df["artist"].unique())

In [16]:
### for later: lookup tables between the original artist names and the
### Wikipedia-style titles (title-cased, stripped, " " -> "_", "&" -> "%26")
names_to_wiki = {}
wiki_to_names = {}
for band in bands:
    wiki_title = band.title().strip().replace(" ", "_").replace("&", "%26")
    names_to_wiki[band] = wiki_title
    # if two artists share a formatted title, the later one wins
    wiki_to_names[wiki_title] = band

## We're trying three different approaches
1. Using genders from Wikidata
2. Using Categories from Wikipedia with the wiki API
3. Counting Pronouns

## 1. Approach: Using Wikidata


In [28]:
def find_gender_data(url, http):
    """
    Scrape the gender from a Wikidata entity page.

    Parameters
    ----------
    url : str
        URL of the Wikidata page of the artist.
    http : object with a ``get(url)`` method (e.g. a ``requests.Session``)
        Used to download the page.

    Returns
    -------
    str
        The text of the first value shown in the ``P21`` statement block,
        or ``"unknown"`` when the page has no such block or the block has
        no value element.
    """
    response = http.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    statement = soup.find("div", id="P21")
    if statement is None:
        return "unknown"

    # The statement block can exist without a plain value element; treat that
    # as "unknown" instead of failing on None.get_text().
    value = statement.find(
        "div", class_="wikibase-snakview-value wikibase-snakview-variation-valuesnak"
    )
    if value is None:
        return "unknown"
    return value.get_text()

def find_page_id_data(base_url, artist, http, pattern=r'(?<=#58;)(.*)(?=&#124)'):
    """
    Find the Wikidata URL linked from an artist's Wikipedia page.

    To avoid disambiguation problems the artist name is tried with the
    suffixes ``_(band)``, ``_(musician)``, ``_(singer)``, ``_(rapper)`` and
    finally without a suffix; the first page that yields a link wins.

    Parameters
    ----------
    base_url : str
        Prefix of the page URL; the artist name and suffix are appended to it.
    artist : str
        Artist name, already formatted for use in a URL.
    http : object with a ``get(url)`` method (e.g. a ``requests.Session``)
        Used to download the pages.
    pattern : str
        Regular expression applied to the ``id`` attribute of the first
        matching ``<th>`` element; the matched text is the scheme-less link.

    Returns
    -------
    str or None
        ``"https:"`` followed by the matched text, or ``None`` when no
        variation produced a match.
    """
    variations = ['_(band)', '_(musician)', '_(singer)', '_(rapper)', '']
    for variation in variations:
        url = base_url + artist + variation

        response = http.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        header_cells = soup.find_all('th', {'id': re.compile(r'&#58')})
        if not header_cells:
            continue

        match = re.search(pattern, header_cells[0].get("id"))
        if match is None:
            # The header exists but does not contain the expected link;
            # try the next variation instead of crashing on None.group().
            continue
        return "https:" + match.group(0)
    return None


def get_wikidata(artist_list, verbose=True):
    """
    The Wikidata approach to finding the gender of each artist.

    For every artist the Wikidata page linked from Wikipedia is looked up
    (``find_page_id_data``) and scraped with bs4 (``find_gender_data``).

    Parameters
    ----------
    artist_list : pandas.Series of str
        Artist names; they are title-cased, stripped and URL-formatted
        (" " -> "_", "&" -> "%26") before being used.
    verbose : bool
        Print a progress line while working.

    Returns
    -------
    dict
        Maps each *formatted* artist name to the gender found, or to
        ``"unknown"`` when no Wikidata link or no gender was found.
    """
    time_before = time.time()

    #### setup
    artist_list = artist_list.str.title().str.strip().str.replace(" ", "_").str.replace("&", "%26") #formatting for wikisearch
    base_url = "https://en.wikipedia.org/wiki/"
    band_dict = {}

    #### retry strategy
    # Retry up to 3 times so the algorithm doesn't give up immediately
    # every time it can't load a page.
    retry_options = dict(total=3, status_forcelist=[429, 500, 502, 503, 504])
    retry_methods = ["HEAD", "GET", "OPTIONS"]
    try:
        retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
    except TypeError:
        # older urllib3 releases only know the keyword under its previous name
        retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # the context manager closes the pooled connections when we are done
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        for counter, artist in enumerate(artist_list):

            #### tells us how we're doing with the algorithm
            if verbose:
                print(f"working on band number {counter} of {len(artist_list)}", end="\r")

            #### find the wikidata page link, then the gender on that page
            link = find_page_id_data(base_url, artist, http)
            gender = "unknown"
            if link is not None:
                gender = find_gender_data(link, http)
            band_dict[artist] = gender

    #### timer for final print
    elapsed_time = time.time() - time_before
    print(f"this approach took {elapsed_time} seconds for a total of {len(artist_list)} bands")

    return band_dict

In [29]:
# Scrape Wikidata for every artist. This is slow: the recorded run printed below
# took about 26,800 seconds for 10,622 bands.
band_dict_data = get_wikidata(bands, verbose=True)

this approach took 26801.310893774033 seconds for a total of 10622 bands


In [30]:
# One row per (formatted) artist name with the gender found on Wikidata.
# Building the frame from a named Series avoids constructing the Series twice
# and the in-place rename of an integer column label; it also keeps the
# "gender_data" column when the dictionary is empty.
df_data = pd.Series(band_dict_data, name="gender_data", dtype="object").to_frame()
df_data.index.name = "artist"

In [37]:
# The "artist" index is written as the first CSV column, so it can be read back by name.
df_data.to_csv("gender data.csv") # save the DataFrame

## 2. Approach: Use Categories via Wiki API

### Some notes on the API
The `&redirects` parameter makes the API follow redirects and return the final target page. We then use the page ID of that final target for the follow-up queries (as in the last request of the next cell). This is important for getting the right results...

In [45]:
pp = pprint.PrettyPrinter()

# Each request below rebinds `r`, so only the response of the LAST request is
# pretty-printed at the end of the cell. Move the pp.pprint call (or comment out
# later requests) to inspect one of the earlier examples.

# a standard API request
r = requests.get("https://en.wikipedia.org/w/api.php?action=query&format=json&prop=categories&redirects&titles=Ben_Harper")

## A band queried without the redirects parameter.
r = requests.get("https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Acid_Mothers_Temple_%26_The_Cosmic_Inferno")

## The same band with redirects. Note how it gives us a new pageid: the one of the final redirect...
r = requests.get("https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Acid_Mothers_Temple_%26_The_Cosmic_Inferno&redirects")

## A title that is already the final landing page, queried with redirects. The query yields essentially the same results
r = requests.get("https://en.wikipedia.org/w/api.php?action=query&format=json&titles=Acid_Mothers_Temple&redirects")

## A band with redirect and a direct query for categories. The golden ticket...
r = requests.get("https://en.wikipedia.org/w/api.php?action=query&format=json&prop=categories&redirects&titles=Acid_Mothers_Temple_%26_The_Cosmic_Inferno")
pp.pprint(r.json())


{'continue': {'clcontinue': '1316023|P.S.F._Records_artists', 'continue': '||'},
 'query': {'normalized': [{'from': 'Acid_Mothers_Temple_&_The_Cosmic_Inferno',
                           'to': 'Acid Mothers Temple & The Cosmic Inferno'}],
           'pages': {'1316023': {'categories': [{'ns': 14,
                                                 'title': 'Category:Alien8 '
                                                          'Recordings artists'},
                                                {'ns': 14,
                                                 'title': 'Category:Articles '
                                                          'with hCards'},
                                                {'ns': 14,
                                                 'title': 'Category:Experimental '
                                                          'rock groups'},
                                                {'ns': 14,
                                                 'title':

In [32]:
# Pretty-printer for inspecting nested results (e.g. the category dictionary).
pp = pprint.PrettyPrinter()

def _category_titles(parsed, page_id):
    """Return the category names (without the "Category:" prefix) that one API response lists for page_id."""
    pages = (parsed.get("query") or {}).get("pages") or {}
    # a response may carry no "categories" entry for the page at all
    categories = (pages.get(page_id) or {}).get("categories") or []
    return [category.get("title").replace("Category:", "") for category in categories]


def scrape_categories(band_list, redirect=True, verbose=True):
    """
    Collect all Wikipedia categories of the given bands through the wiki API.

    The band names are preprocessed first: title-cased, stripped, spaces
    replaced with "_" and "&" replaced with "%26".

    Parameters
    ----------
    band_list : pandas.Series of str
        The band / artist names.
    redirect : bool
        Adds ``&redirects`` to the query so that the categories of the final
        redirect target are returned. Turning it off potentially speeds the
        algorithm up but yields inferior results.
    verbose : bool
        Print a progress line while working.

    Returns
    -------
    (dict, list)
        ``dict`` maps each formatted band name to its list of category names;
        the LAST element of every list is the title suffix (e.g. ``"_(band)"``
        or ``""``) under which the page was found. ``list`` holds the
        formatted names for which no page was found.
    """
    ### string preprocessing
    band_list = band_list.str.title().str.strip().str.replace(" ", "_").str.replace("&", "%26")

    #### setup
    not_found = []
    band_dict = {}
    time_before = time.time()
    base_url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&prop=categories'

    if redirect:
        base_url += "&redirects"

    #### retry strategy
    retry_options = dict(total=3, status_forcelist=[429, 500, 502, 503, 504])
    retry_methods = ["HEAD", "GET", "OPTIONS"]
    try:
        retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
    except TypeError:
        # older urllib3 releases only know the keyword under its previous name
        retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # the context manager closes the pooled connections when we are done
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        for counter, band in enumerate(band_list):

            ##### speakerbox
            if verbose:
                print(f"working on band number {counter} of {len(band_list)}", end="\r")

            #### the helper returns the page_id and the parsed first response for the band
            page_id, url, parsed, suffix = find_page_id_cat(base_url, band, http)

            #### page_id -1 indicates a non-existent page
            if page_id == "-1":
                not_found.append(band)
                continue

            category_list = _category_titles(parsed, page_id)
            continuation = parsed.get("continue")

            #### The api only returns a limited batch per request; follow the "continue" block for the rest.
            while continuation is not None:
                # Passing the continuation values as params lets requests URL-encode
                # them ("|", "&", ... inside a category name would otherwise break the query).
                parsed = json.loads(http.get(url, params=continuation).content)
                category_list.extend(_category_titles(parsed, page_id))
                continuation = parsed.get("continue")

            category_list.append(suffix)
            band_dict[band] = category_list

    #### timer for final print
    elapsed_time = time.time() - time_before
    print(f"this approach took {elapsed_time} seconds for a total of {len(band_list)} bands")
    return band_dict, not_found


def find_page_id_cat(base_url, band, http):
    """
    Helper that finds the page id of a given Wikipedia article via the API.

    The title is tried with the suffixes ``_(band)``, ``_(musician)``,
    ``_(singer)``, ``_(rapper)`` and finally without a suffix, to avoid
    ambiguous names or unwanted redirects such as 'Air' or 'Foals'.

    Parameters
    ----------
    base_url : str
        API query URL to which ``&titles=<band><suffix>`` is appended.
    band : str
        Band name, already formatted for use in a URL.
    http : object with a ``get(url)`` method (e.g. a ``requests.Session``)
        Used to send the queries.

    Returns
    -------
    tuple
        ``(page_id, url, parsed, suffix)`` for the first variation whose
        page id is not ``"-1"``; otherwise ``("-1", last_url, None, "")``.
        A response without a ``query``/``pages`` section counts as not found.
    """
    variations = ['_(band)', '_(musician)', '_(singer)',  '_(rapper)', '']
    for variation in variations:
        url = base_url + f'&titles={band}{variation}'
        response = http.get(url)
        parsed = json.loads(response.content)
        # Error responses carry no "query"/"pages" section; treat them like a
        # missing page instead of failing on None.get(...) or an empty list.
        pages = (parsed.get("query") or {}).get("pages") or {}
        page_id = next(iter(pages), "-1")
        if page_id != "-1":
            return page_id, url, parsed, variation

    return page_id, url, None, variation

In [None]:
# Run the category scrape for all artists (with redirects, the default) and
# summarise how many were found / not found.
band_dict_cat, not_found = scrape_categories(bands)
print("Here are the results of the algorithm with redirect:\n\n")
print(f"it hasn't found the following {len(not_found)} bands: \n", not_found, "\n")
print(f"it has found categories for {len(band_dict_cat)} bands. Here they are...\n")
# pp.pprint(band_dict_cat)
# This is a very long printout for all the bands...

### Formatting the categories dataframe:

In [43]:
# Each list in band_dict_cat holds the category names followed by the title
# suffix as its last element; split the two apart into separate columns.
# Building both columns from one Series avoids constructing the Series twice,
# the in-place rename, and the .loc[:, col] assignments of list-valued columns.
categories_with_suffix = pd.Series(band_dict_cat, name="categories", dtype="object")
df_cat = pd.DataFrame({
    "categories": categories_with_suffix.map(lambda cats: cats[:-1]),
    "suffix": categories_with_suffix.map(lambda cats: cats[-1]),
})
df_cat.index.name = "artist"
# NOTE: to_csv stores every category list as its string representation.
df_cat.to_csv("gender cat.csv")

In [None]:
## just backing up
#backup_cat=df_cat.copy()
#backup_data = df_data.copy()

## Concatenating the two dataframes to find out which method is better

In [3]:
# Reload both intermediate results, using the "artist" column as the index.
df_cat = pd.read_csv('gender cat.csv', index_col="artist")
df_data = pd.read_csv('gender data.csv', index_col="artist")

In [33]:
# Outer join on the artist index so artists present in only one of the two results are kept.
# NOTE(review): this rebinds `df`, which held the Pitchfork reviews when `bands` was built.
df = pd.concat([df_data, df_cat], axis=1, join='outer')
#df = pd.merge(df_cat, df_data)

In [34]:
## Turn the stringified category lists from the CSV back into real lists.
def _parse_categories(raw):
    """Return the list of category names encoded in one CSV cell.

    Missing values become ``["not found"]``. The cell is parsed as a Python
    literal so that category names containing ", ", brackets or apostrophes
    stay intact (stripping characters and splitting on ", " corrupts them).
    """
    if isinstance(raw, list):
        return raw  # already parsed, e.g. when the cell is re-run
    if not isinstance(raw, str):
        return ["not found"]  # NaN left by the outer join
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # not a list literal (e.g. the plain text "not found"): previous text-based parsing
    return raw.replace("[", "").replace("]", "").replace("'", "").split(", ")


df["suffix"] = df["suffix"].fillna("")
df["categories"] = df["categories"].map(_parse_categories)
df.head()

Unnamed: 0,gender_data,categories,suffix
Burna_Boy,male,"[1991 births, 21st-century Nigerian musicians,...",
Katya_Yonder,unknown,[not found],
Flo_Milli,unknown,"[2000 births, 21st-century American rappers, A...",
Rival_Consoles,male,"[1985 births, All BLP articles lacking sources...",
Popcaan,male,"[1988 births, Articles with hCards, Articles w...",


### Now finding the genders from the categories

In [36]:
def find_gender_cat(df, band_dict):
    """
    Derive a gender label from the Wikipedia category names, in place.

    Adds (or resets) the column ``"gender_cat"`` of ``df`` to ``"unknown"``
    and then, for every label in ``band_dict`` (an iterable of index labels
    of ``df``), scans that row's ``"categories"`` list for marker substrings.
    The last category containing any marker decides the label; within one
    category "other" overrides "female", which overrides "male".
    Returns nothing.
    """
    male_markers = (" male", "Male", "Boy", "boy")
    female_markers = ("Female", "female", "girl", "Girl")
    other_markers = ("Transgender", "transsexual")

    df["gender_cat"] = "unknown"
    for band in band_dict:
        label = None
        for cat in df.loc[band, "categories"]:
            # later assignments deliberately override earlier ones
            if any(marker in cat for marker in male_markers):
                label = "male"
            if any(marker in cat for marker in female_markers):
                label = "female"
            if any(marker in cat for marker in other_markers):
                label = "other"
        if label is not None:
            df.loc[band, "gender_cat"] = label
                
# Label every row; the second argument is the list of index labels (artists) to process.
find_gender_cat(df, list(df.index))

In [37]:
# Move the artist index into a column, fix the column order, and translate the
# Wikipedia-style titles back to the original artist names.
df = (
    df.reset_index()
      .rename(columns={"index": "artist"})
      [["artist", "categories", "suffix", "gender_data", "gender_cat"]]
)
df["artist"] = [wiki_to_names[wiki_title] for wiki_title in df["artist"]]
df.head()

Unnamed: 0,artist,categories,suffix,gender_data,gender_cat
0,Burna Boy,"[1991 births, 21st-century Nigerian musicians,...",,male,male
1,Katya Yonder,[not found],,unknown,unknown
2,Flo Milli,"[2000 births, 21st-century American rappers, A...",,unknown,female
3,Rival Consoles,"[1985 births, All BLP articles lacking sources...",,male,unknown
4,Popcaan,"[1988 births, Articles with hCards, Articles w...",,male,unknown


In [278]:
#df.to_csv("gender full.csv", index=False)
#df = pd.read_csv("gender full.csv")

In [40]:
# Final label: take the Wikidata gender and fall back to the category-based
# label wherever Wikidata gave "unknown". Vectorised, so it does not depend on
# the index being 0..n-1 and avoids a Python-level loop over every row.
df["gender"] = df["gender_data"].where(df["gender_data"] != "unknown", df["gender_cat"])

In [46]:
# Where do the category label and the Wikidata label not line up
# (provided we have an entry for both)?
labels_differ = df["gender_data"].ne(df["gender_cat"])
check = df.loc[labels_differ]
both_known = check["gender_data"].ne("unknown") & check["gender_cat"].ne("unknown")
check[both_known]

Unnamed: 0,artist,categories,suffix,gender_data,gender_cat,gender
163,Arca,"[1989 births, Articles with Italian-language s...",_(musician),non-binary,other,non-binary
189,Phoebe Bridgers,"[1994 births, 21st-century American singers, 2...",,female,male,female
258,Carly Rae Jepsen,"[1985 births, 21st-century Canadian singers, 2...",,female,male,female
306,Elysia Crampton,"[1986 births, American electronic musicians, A...",,female,other,female
376,Sir Richard Bishop,"[All BLP articles lacking sources, American ex...",,male,female,male
829,Lucy Dacus,"[1995 births, 21st-century American singers, A...",,female,male,female
1030,Ezra Furman,"[1986 births, 21st-century American guitarists...",,transgender female,other,transgender female
1207,Kim Petras,"[1992 births, 21st-century German singers, 21s...",,transgender female,other,transgender female
1286,Miley Cyrus,"[1992 births, 21st-century American actresses,...",,genderfluid,female,genderfluid
1596,Du Blonde,"[1990 births, 21st-century British singers, Ar...",,non-binary,other,non-binary


In [264]:
#df.to_csv('gender full.csv', index=False) ### To save
#df = pd.read_csv('some list.csv') ### to load
#df.rename(columns={"Unnamed: 0": "artist"}, inplace=True)
#df.set_index("artist")
#df["categories"] = df.categories.apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(", "))

## We got some wrong results. 
For example, some artists were resolved to disambiguation pages instead of their own article.

## Returns all bands / artists with disambiguation pages
This means we were sent to a disambiguation page instead of the artist's article. However, since (almost) none of these entries had a gender assigned to them, it doesn't seem to be a huge problem right now. I might look into the root of the problem at some later time. Investigating further might help us get a few more categories / genders.

In [42]:
# Rows whose category list contains at least one of the selected categories.
selection = ["All disambiguation pages"]
wanted = set(selection)
mask = df["categories"].map(lambda cats: any(cat in wanted for cat in cats))
disamb = df.loc[mask]
disamb

Unnamed: 0,artist,categories,suffix,gender_data,gender_cat,gender
17,Jim O’Rourke,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
44,otta,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
105,Haux,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
152,Special Interest,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
195,Pinch,"[All article disambiguation pages, All disambi...",_(musician),unknown,unknown,unknown
...,...,...,...,...,...,...
10456,LU,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
10468,Heavenly,"[All article disambiguation pages, All disambi...",_(band),unknown,unknown,unknown
10517,American Heritage,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown
10548,LiLiPUT,"[All article disambiguation pages, All disambi...",,unknown,unknown,unknown


## 3. Approach: Using the pronouns

Stopping here as this approach leads to many wrong results. Leaving the code here for reference / people who are interested...

In [None]:
#band_list = band_list.str.title().str.strip().str.replace(" ", "_").str.replace("&", "%26")
def pronoun_gender(df, band_list, verbose=True, limit=2):
    """
    Guess the gender of each artist by counting pronouns on Wikipedia.

    For every artist whose ``"suffix"`` is not ``"_(band)"`` the Wikipedia
    page is downloaded and the occurrences of he/his and she/her in the first
    ``limit`` paragraphs of the article body are counted; the more frequent
    group decides the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``"suffix"`` column and must be indexable with the formatted
        artist names via ``df.loc[name]``.
        NOTE(review): earlier cells replace the index with 0..n-1 and put the
        original names in an ``"artist"`` column -- confirm the frame passed
        in is indexed by the Wikipedia-style titles.
    band_list : pandas.Series of str
        Artist names; formatted like in the other approaches before use.
    verbose : bool
        Print a progress line while working.
    limit : int
        Number of paragraphs to look at.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column ``"gender_pronoun"``
        (``"male"``, ``"female"`` or ``"unknown"``); ``df`` is not modified.
    """
    time_before = time.time()
    new_df = df.copy()
    new_df["gender_pronoun"] = "unknown"

    male_expr = re.compile(r'\bhe\b|\bhis\b', re.IGNORECASE)
    female_expr = re.compile(r'\bshe\b|\bher\b', re.IGNORECASE)

    base_url = "https://en.wikipedia.org/wiki/"
    wiki_titles = band_list.str.title().str.strip().str.replace(" ", "_").str.replace("&", "%26")

    for counter, band in enumerate(wiki_titles):
        suffix = df.loc[band, "suffix"]
        if suffix == "_(band)":
            continue  # pronouns say nothing about a group

        if verbose:
            print(f"working on band number {counter} of {len(band_list)}", end="\r")

        response = requests.get(base_url + f"{band}" + suffix)
        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find(id="mw-content-text")
        if content is None:
            continue  # no article body on the page: keep "unknown"

        text = str(content.find_all("p", limit=limit))
        male_hits = len(male_expr.findall(text))
        female_hits = len(female_expr.findall(text))

        if male_hits > female_hits:
            pronoun = "male"
        elif male_hits < female_hits:
            pronoun = "female"
        else:
            pronoun = "unknown"

        # A single .loc[row, column] assignment writes into new_df itself;
        # new_df.loc[row][column] = ... would only modify a temporary copy.
        new_df.loc[band, "gender_pronoun"] = pronoun

    elapsed_time = time.time() - time_before
    print(f"this approach took {elapsed_time} seconds for a total of {len(band_list)} bands")

    return new_df

In [None]:
# NOTE(review): rebinds `df` to the copy returned by pronoun_gender, which adds "gender_pronoun".
df = pronoun_gender(df, bands, limit=2, verbose=True)
#df_after.gender.value_counts()

In [None]:
## Rows where Wikidata gave no gender ("unknown") but the pronoun count did. Quite a big list... (over 10%)
df[(df["gender_data"] == "unknown" ) & ( df["gender_pronoun"] != "unknown" )]

In [None]:
# Writes the combined results; the DataFrame index is written as the first column.
df.to_csv('all methods all bands.csv') ### Saving the dataset