In [1]:
#Notebook for experiments in gathering data

In [121]:
import re
import csv
import requests
from bs4 import BeautifulSoup 
from utils import parse_tools

In [125]:
def parse_directors(url):
    """Scrape a Wikipedia list page and collect the links that look like person names.

    Every ``<a>`` tag whose ``title`` attribute matches the name pattern below
    is treated as a candidate. Titles with more than one all-lowercase part,
    or with a single all-lowercase part that is not a known particle
    ('von', 'de', 'van'), are discarded.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    dict
        Maps each accepted link title to ``"https://en.wikipedia.org" + href``,
        or to ``None`` when the link's ``href`` is missing or does not have the
        expected ``/wiki/<Name>`` shape. If a title occurs several times, the
        last occurrence wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Characters allowed inside one name part: ASCII letters, hyphen and the
    # accented Latin range \u00c0-\u017e, which covers names such as
    #   Agnès Varda, Agnieszka Wojtowicz-Vosloo, Jasmila Žbanić, Haifaa al-Mansour
    # source: https://port135.com/2017/01/31/how-to-allow-european-characters-in-text-fields-by-using-regular-expression/
    # "A-Za-z" is spelled out on purpose: the range "A-z" would also admit the
    # ASCII punctuation that sits between "Z" and "a" ([ \ ] ^ _ `).
    name_regexp = r"[-A-Za-z\u00c0-\u017e]+"

    # Title = {name surname}, two to four name parts.
    # Parts do not always start with an upper-case letter (Haifaa al-Mansour);
    # that is handled by the lower-case check inside the loop.
    # NOTE(review): the separator (\s*) is optional, so a single word of two or
    # more letters also satisfies this pattern - confirm whether that is intended.
    title_regexp = fr"^(\s*{name_regexp}){{2,4}}$"

    # Wiki page paths: '%' and digits are allowed because non-ASCII letters
    # arrive percent-encoded, e.g. /wiki/Jasmila_%C5%BDbani%C4%87
    url_name_regexp = r"[-%\dA-Za-z\u00c0-\u017e]+"
    url_regexp = fr"^/wiki/(_*{url_name_regexp}){{2,4}}$"

    re_title = re.compile(title_regexp)
    re_url = re.compile(url_regexp)

    wiki_url = "https://en.wikipedia.org"
    # All-lowercase name parts that are tolerated (at most one per title).
    lowercase_particles = ('von', 'de', 'van')

    fake_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # A timeout keeps the cell from hanging forever on a stalled connection;
    # raise_for_status() stops an HTTP error page from being parsed as data.
    response = requests.get(url, headers=fake_headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    directors = dict()
    for link in soup.find_all('a', attrs={'title': re_title}):
        matched_title = re_title.match(link.get('title') or "")
        if matched_title is None:
            continue
        title = matched_title[0]

        # Check that each part contains at least one capital letter, allowing
        # a single known lower-case particle.
        lowercase_parts = [part for part in title.split() if part.islower()]
        if len(lowercase_parts) > 1:
            continue
        if lowercase_parts and lowercase_parts[0] not in lowercase_particles:
            continue

        # Try to turn the relative href into an absolute wiki URL.
        href = link.get('href')
        matched_href = re_url.match(href) if isinstance(href, str) else None
        if matched_href is None:
            # Report the href that failed, then keep the name without a URL.
            print("Could not parse url:", title, href)
            director_url = None
        else:
            director_url = wiki_url + matched_href[0]

        directors[title] = director_url

    return directors

Say hello to these heroic women:
Kyōko Aizome
André-Line Beauparlant
Icíar Bollaín
Věra Chytilová
Michèle Cournoyer
Doris Dörrie
Ildikó Enyedi
Alice Guy-Blaché
Marion Hänsel
Julie Hébert
Lucile Hadžihalilović
Mia Hansen-Løve
Jackée Harry
Agnès Jaoui
Yashira Jordán
Dorota Kędzierzawska
Micheline Lanctôt
Alexandra Leclère
Nnegest Likké
Kátia Lund
Noémie Lvovsky
Lisbeth Lynghøft
Márta Mészáros
Laura Mañá
Agnès Merlet
Anne-Marie Miéville
Pilar Miró
Katrin Ottarsdóttir
Léa Pool
Brigitte Roüan
Shimako Satō
Céline Sciamma
Thérèse Sita-Bella
Danièle Thompson
Gariné Torossian
Agnès Varda
Jürgen Vsych
Lina Wertmüller
Jasmila Žbanić

In [126]:
# Scrape the Wikipedia list of female directors into a {name: wiki_url} dict,
# print how many entries were collected, then display the dict itself.
female_directors_url = "https://en.wikipedia.org/wiki/List_of_female_film_and_television_directors"
female_directors = parse_directors(url=female_directors_url)
print(len(female_directors))
female_directors

988


{'Jennifer Abbott': 'https://en.wikipedia.org/wiki/Jennifer_Abbott',
 'Marguerite Abouet': 'https://en.wikipedia.org/wiki/Marguerite_Abouet',
 'Abiola Abrams': 'https://en.wikipedia.org/wiki/Abiola_Abrams',
 'Jill Ackles': 'https://en.wikipedia.org/wiki/Jill_Ackles',
 'Joey Lauren Adams': 'https://en.wikipedia.org/wiki/Joey_Lauren_Adams',
 'Perry Miller Adato': 'https://en.wikipedia.org/wiki/Perry_Miller_Adato',
 'Maren Ade': 'https://en.wikipedia.org/wiki/Maren_Ade',
 'Yasmin Ahmad': 'https://en.wikipedia.org/wiki/Yasmin_Ahmad',
 'Peggy Ahwesh': 'https://en.wikipedia.org/wiki/Peggy_Ahwesh',
 'Kyōko Aizome': 'https://en.wikipedia.org/wiki/Ky%C5%8Dko_Aizome',
 'Mania Akbari': 'https://en.wikipedia.org/wiki/Mania_Akbari',
 'Chantal Akerman': 'https://en.wikipedia.org/wiki/Chantal_Akerman',
 'Zoya Akhtar': 'https://en.wikipedia.org/wiki/Zoya_Akhtar',
 'Nargis Akhter': 'https://en.wikipedia.org/wiki/Nargis_Akhter',
 'Haifaa al-Mansour': 'https://en.wikipedia.org/wiki/Haifaa_al-Mansour',
 '

In [127]:
# Wikipedia has no dedicated "male directors" list :) so the combined list of
# film and television directors is scraped instead.
all_directors_url = "https://en.wikipedia.org/wiki/List_of_film_and_television_directors"
all_directors = parse_directors(url=all_directors_url)
print(len(all_directors))

1774


In [128]:
# Split the combined list by membership in the female list.
# Membership is tested on the key itself: a female director whose wiki URL
# could not be parsed is stored with a None value, so comparing
# female_directors.get(key) against None would misfile her as male.
male_directors = {
    name: wiki_url
    for name, wiki_url in all_directors.items()
    if name not in female_directors
}
# Directors that appear on both the combined list and the female list.
special_female_directors = {
    name: wiki_url
    for name, wiki_url in all_directors.items()
    if name in female_directors
}
len(male_directors)

1664

In [131]:
def save_directors(directors, filename):
    """Write a ``{name: wiki_url}`` mapping to ``<filename>.csv``.

    The file starts with a ``name,wiki_url`` header row, followed by one row
    per dictionary entry in iteration order. A ``None`` URL is written as an
    empty field.

    Parameters
    ----------
    directors : dict
        Mapping of director name to wiki URL (or ``None``).
    filename : str
        Output path without the ``.csv`` extension; an existing file is
        overwritten.
    """
    # newline="" is what the csv module expects for files it writes (otherwise
    # extra blank lines can appear on Windows). UTF-8 is set explicitly because
    # the names contain non-ASCII letters. The with-block closes the file.
    with open(f"{filename}.csv", "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["name", "wiki_url"])
        writer.writerows(directors.items())

In [132]:
# Persist each of the three collections to its own CSV file
# (save_directors appends the ".csv" extension to the given name).
save_directors(directors=female_directors, filename="female_directors_wiki")
save_directors(directors=male_directors, filename="male_directors_wiki")
save_directors(directors=all_directors, filename="all_directors_wiki")