# Three investigators - part 2

A project for scraping and analysing data from a fan site on the audio book called 'The three investigators'

Part 2: Actors gender prediction

- Clean up actors names to extract first name
- Using [Genderize API](https://github.com/SteelPangolin/genderize) to predict gender of actors using first names


In [6]:
# import modules
import pandas as pd
from datetime import date
import requests

# for file directories
import os

In [2]:
#change directory to root folder
os.chdir("..")

#load script to work with Genderize API (download [here](https://github.com/SteelPangolin/genderize))
#this will require to have requests imported
from src.genderize import Genderize

# Load data

In [3]:
actors = pd.read_csv(".\\data\\scraped\\actors.csv")


## Clean actors names

In [8]:
#get real name not the pseudonym (if they have one)
actors["actor_clean"] = actors["actor"].str.split("[").str[-1].str.replace("]", "")

#create column for aristocratic or academic title
actors["title"] = 0
actors.loc[actors["actor_clean"].str.contains(" van "), "title"] = 1
actors.loc[actors["actor_clean"].str.contains(" von "), "title"] = 1
actors.loc[actors["actor_clean"].str.contains("Dr. "), "title"] = 1
actors.loc[actors["actor_clean"].str.contains("Prof. "), "title"] = 1

#remove aristocratic title in cleaned name
actors["actor_clean"] = (actors["actor_clean"].str.replace(" von", "")
                            .str.replace(" van", "")
                            .str.replace(" der", "")
                            .str.replace(" de", "")
                            .str.replace("Dr. ", "")
                           .str.replace("Prof. ", ""))

#get first and last name
actors["firstname"] = actors["actor_clean"].str.split(" ").str[0]
actors["surename"] = actors["actor_clean"].str.split(" ").str[-1]

#get number of names
actors["no_names"] = actors["actor_clean"].str.split(" ").apply(lambda x: len(x))

#save output
actors.to_csv(".\\data\\processed\\actors_cleaned_" + "%s" %(str(date.today())) + ".csv", index=False) 

actors.head()

Unnamed: 0,role,actor,id,actor_clean,title,firstname,surename,no_names
0,"Hitchcock, Erzähler",Peter Pasetti,1,Peter Pasetti,0,Peter,Pasetti,2
1,"Justus Jonas, Erster Detektiv",Oliver Rohrbeck,1,Oliver Rohrbeck,0,Oliver,Rohrbeck,2
2,"Peter Shaw, Zweiter Detektiv",Jens Wawrczeck,1,Jens Wawrczeck,0,Jens,Wawrczeck,2
3,"Bob Andrews, Recherchen und Archiv",Andreas Fröhlich,1,Andreas Fröhlich,0,Andreas,Fröhlich,2
4,"Mr. Fentriss, Schriftsteller",Richard Lauffen,1,Richard Lauffen,0,Richard,Lauffen,2


# Predict actor gender and save output file

In [9]:
#get gender for list of firstnames from Genderize.io

#get unique list of first actor names
names_unique = list (set (actors["firstname"]))

#access genderize.io api using this repository: tps://github.com/SteelPangolin/genderize
gender_raw = Genderize().get(names_unique)

#make dictionary into data frame
gender = []
for i in range(len(gender_raw)):
    c = pd.DataFrame(gender_raw[i].items())[1]
    gender.append(c)

#rename columns
gender = pd.concat(gender, axis=1).transpose().rename(columns={0:"name", 1:"gender", 2:"gender_probability", 3:"gender_count"})  

#save output as backup to access later
gender.to_csv(".\\data\\processed\\actor_gender_prediction_" + "%s" %(str(date.today())) + ".csv", index=False) 

#display
gender.head()

Unnamed: 0,name,gender,gender_probability,gender_count
1,Guido,male,0.99,15568
1,René,male,1.0,4
1,Heiner,male,0.98,760
1,Hella,female,0.93,704
1,Tanja,female,0.98,35007
