In [1]:
import time
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import gender_guesser.detector as gender
from genderize import Genderize


# Load IMDB data

In [2]:
fpath = "./title.crew.tsv/data.tsv"
df_crew = pd.read_csv(fpath, sep='\t')

fpath = "./title.ratings.tsv/data.tsv"
df_ratings = pd.read_csv(fpath, sep='\t')

fpath = "./name.basics.tsv/data.tsv"
df_name = pd.read_csv(fpath, sep='\t')

fpath = "./title.basics.tsv/data.tsv"
df_title = pd.read_csv(fpath, sep='\t')

print('len artist names:', len(df_name)   )
print('len ratings file:', len(df_ratings))
print('len crew file   :', len(df_crew)   )
print('len movie title :', len(df_title)  )


  interactivity=interactivity, compiler=compiler, result=result)


len artist names: 9604401
len ratings file: 976773
len crew file   : 6193756
len movie title : 6193756


In [3]:
df_merged = pd.merge(df_title , df_crew[['tconst', 'directors']] , left_on='tconst', right_on='tconst')
df_merged = pd.merge(df_merged, df_ratings                       , left_on='tconst', right_on='tconst')
df_merged = pd.merge(df_merged, df_name[['nconst','primaryName']], left_on='directors', right_on='nconst')
df_merged = df_merged.rename(columns={'primaryName': "Director"})#.drop(['nconst'], axis=1)
df_merged = df_merged[['tconst'        , 'primaryTitle', 'originalTitle', 'startYear', 'endYear',
                       'runtimeMinutes', 'genres'      , 'averageRating', 'numVotes' , 'Director']]
print('# of Titles with Ratings and Director Name:',len(df_merged))
df_merged.head()

# of Titles with Ratings and Director Name: 723690


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Director
0,tt0000001,Carmencita,Carmencita,1894,\N,1,"Documentary,Short",5.6,1538,William K.L. Dickson
1,tt0000005,Blacksmith Scene,Blacksmith Scene,1893,\N,1,"Comedy,Short",6.1,1909,William K.L. Dickson
2,tt0000006,Chinese Opium Den,Chinese Opium Den,1894,\N,1,Short,5.2,102,William K.L. Dickson
3,tt0000008,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894,\N,1,"Documentary,Short",5.4,1643,William K.L. Dickson
4,tt0000036,Awakening of Rip,Awakening of Rip,1896,\N,0,"Drama,Short",4.5,449,William K.L. Dickson


# Get Directors gender

In [4]:
d = gender.Detector()
def get_gender_from_name(x):
    return d.get_gender(x)

df_merged['DirectorFirstName'] = df_merged['Director'].str.split().str[0]
df_merged['DirectorGender']    = df_merged['DirectorFirstName'].apply(get_gender_from_name)
print('# Different names:', df_merged['Director'].nunique())
print('# Unknown gender :', df_merged[df_merged['DirectorGender'] == 'unknown']['Director'].nunique())
print('Names cathegories:', df_merged['DirectorGender'].unique())
df_merged.head()

# Different names: 152330
# Unknown gender : 20843
Names cathegories: ['male' 'unknown' 'female' 'mostly_male' 'mostly_female' 'andy']


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Director,DirectorFirstName,DirectorGender
0,tt0000001,Carmencita,Carmencita,1894,\N,1,"Documentary,Short",5.6,1538,William K.L. Dickson,William,male
1,tt0000005,Blacksmith Scene,Blacksmith Scene,1893,\N,1,"Comedy,Short",6.1,1909,William K.L. Dickson,William,male
2,tt0000006,Chinese Opium Den,Chinese Opium Den,1894,\N,1,Short,5.2,102,William K.L. Dickson,William,male
3,tt0000008,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894,\N,1,"Documentary,Short",5.4,1643,William K.L. Dickson,William,male
4,tt0000036,Awakening of Rip,Awakening of Rip,1896,\N,0,"Drama,Short",4.5,449,William K.L. Dickson,William,male


In [33]:
df_merged['DirectorGender'].value_counts()/df_merged['DirectorGender'].value_counts().sum()

male             0.726853
unknown          0.125043
female           0.080116
mostly_male      0.045439
mostly_female    0.012983
andy             0.009566
Name: DirectorGender, dtype: float64

# Upload descriptions and merge

In [5]:
years = [2000, 2001, 2002, 2003, 2004, 2005]
df_descriptions = pd.DataFrame()
for year in years:
    df_aux = pd.read_csv('IMDB_movie_description_'+str(year)+'.csv').drop(['Unnamed: 0'], axis=1)
    df_descriptions = df_descriptions.append(df_aux)
df_merged = pd.merge(df_merged, df_descriptions[['tconst','description']], left_on='tconst', right_on='tconst')
print('# Movies with description, ratings and directors name', len(df_merged))
df_merged.head()

# Movies with description, ratings and directors name 5854


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Director,DirectorFirstName,DirectorGender,description
0,tt0283422,I'm Going Home,Je rentre à la maison,2001,\N,90,"Comedy,Drama",7.0,1603,Manoel de Oliveira,Manoel,male,Je rentre à la maison is a movie starring Mich...
1,tt0364093,A Talking Picture,Um Filme Falado,2003,\N,96,"Comedy,Drama,History",6.6,1839,Manoel de Oliveira,Manoel,male,Um Filme Falado is a movie starring Leonor Sil...
2,tt0035423,Kate & Leopold,Kate & Leopold,2001,\N,118,"Comedy,Fantasy,Romance",6.4,74950,James Mangold,James,male,"Kate & Leopold is a movie starring Meg Ryan, H..."
3,tt0309698,Identity,Identity,2003,\N,90,"Mystery,Thriller",7.3,212248,James Mangold,James,male,"Identity is a movie starring John Cusack, Ray ..."
4,tt0358273,Walk the Line,Walk the Line,2005,\N,136,"Biography,Drama,Music",7.8,215793,James Mangold,James,male,Walk the Line is a movie starring Joaquin Phoe...


# Tfidf Vectorizer

In [11]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_merged, test_size=0.2)
text_train  = train['description'].values
text_test   =  test['description'].values

# create the transform
# tokenize and build vocab
stop_words   = stopwords.words('english')
vectorizer   = TfidfVectorizer(stop_words=stop_words, min_df=10, max_df=250, lowercase=True)
vector_train = vectorizer.fit_transform(text_train)
vector_test  = vectorizer.transform(text_test)

# summarize
#print(vectorizer.vocabulary_)
print(vector_train.shape)
print(vector_test.shape)
#print(vector.toarray())

(4683, 1814)
(1171, 1814)


In [14]:
from sklearn.naive_bayes import MultinomialNB
clf       = MultinomialNB().fit(vector_train, train['DirectorGender'])
predicted = clf.predict(vector_test)


In [22]:
real_genders = test['DirectorGender'].values
for i in enumerate(predicted):
    print(i[1],real_genders[i[0]])


male unknown
male male
male male
male male
male male
male male
male andy
male unknown
male male
male male
male male
male male
male male
male male
male male
male male
male unknown
male female
male unknown
male male
male male
male male
male unknown
male male
male unknown
male female
male female
male male
male male
male male
male male
unknown unknown
male male
male male
male unknown
male male
male unknown
male male
male unknown
male male
male mostly_male
male male
male male
male male
male female
male female
male male
male male
male male
male male
male andy
male male
male male
unknown unknown
male unknown
male unknown
male female
male male
male male
male male
male female
male male
male male
male male
male unknown
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male male
male unknown
male male
male male
male male
male mostly_male
male female
male male
male male
male male
male male
male female
male ma

In [34]:
np.mean(predicted == real_genders)

0.7198975234842016