## I. First Names Dataset

In [1]:
import pandas as pd
# import numpy as np

In [41]:
name_df = pd.read_excel('SSA_Names_DB.xlsx')
name_df.head()

Unnamed: 0,Name,Gender,Frequency,Include?
0,Emma,F,20355,Yes
1,Olivia,F,19553,Yes
2,Noah,M,19511,Yes
3,Liam,M,18281,Yes
4,Sophia,F,17327,Yes


### Clean name dataset
* Drop `Frequency` and `Include?` columns
* Change genders to numerical values
    * `M` $\rightarrow$ `0`
    * `F` $\rightarrow$ `1`
    * Gender neutral names included in both categories $\rightarrow$ `2`
* Drop duplicated gender neutral names

In [93]:
name_df.Gender.value_counts()

F    18993
M    13959
Name: Gender, dtype: int64

In [94]:
name_df['Include?'].value_counts()

No     27252
Yes     5700
Name: Include?, dtype: int64

In [42]:
# change gender values to binary
name_df.Gender.replace({'M': 0, 'F': 1}, inplace=True)

In [17]:
name_df.head()

Unnamed: 0,Name,Gender,Frequency,Include?
0,Emma,1,20355,Yes
1,Olivia,1,19553,Yes
2,Noah,0,19511,Yes
3,Liam,0,18281,Yes
4,Sophia,1,17327,Yes


In [5]:
# according to the data dictionary, the Include? column has to do with number of social security applications and
# is not needed for this project

In [43]:
name_df.drop('Include?', axis=1, inplace=True)

In [44]:
name_df.head()

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,0,19511
3,Liam,0,18281
4,Sophia,1,17327


In [80]:
name_df.values[0]

array(['Emma', 1, 20355], dtype=object)

In [99]:
female_names = []
male_names = []

for x in name_df.values:
    if x[1] == 1:
        female_names.append(x[0])
    if x[1] == 0:
        male_names.append(x[0])

In [100]:
any(i in female_names for i in male_names)

True

In [101]:
neutral = set(female_names).intersection(set(male_names))

In [102]:
len(neutral)

2492

In [86]:
list(neutral)[:10]

[True,
 'Zyair',
 'Amour',
 'Kentley',
 'Jermiah',
 'Yichen',
 'Truett',
 'Channing',
 'Brody',
 'Christian']

In [8]:
name_df.loc[name_df.Name == 'Olivia']

Unnamed: 0,Name,Gender,Frequency
1,Olivia,1,19553
22217,Olivia,0,8


In [19]:
name_df.loc[name_df.Name == 'Emma']

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
18390,Emma,0,10


In [109]:
len(name_df.loc[name_df.Frequency < 20])

21698

In [112]:
len(name_df.loc[name_df.Frequency < 15])

19037

In [45]:
# try names that appear more than 30 times, because using 15 as the threshold still included some males with the name
# of 'Emily'
common_names = name_df.loc[name_df.Frequency > 30]

In [46]:
common_names.shape

(8128, 3)

In [47]:
female_names = []
male_names = []

for x in common_names.values:
    if x[1] == 1:
        female_names.append(x[0])
    if x[1] == 0:
        male_names.append(x[0])

neutral = set(female_names).intersection(set(male_names))

In [48]:
len(neutral)

469

In [49]:
list(neutral)[:50]

[True,
 'Jai',
 'Noor',
 'Monroe',
 'Chase',
 'Porter',
 'Kyler',
 'Landry',
 'Cypress',
 'Kyree',
 'Jayce',
 'Jordan',
 'Kingsley',
 'Bryar',
 'Cassidy',
 'Cooper',
 'Easton',
 'Brooklyn',
 'Austin',
 'Fallon',
 'Parker',
 'Tory',
 'Sunny',
 'Teagan',
 'Harper',
 'Lennon',
 'Tatum',
 'Daylin',
 'Maddox',
 'Lane',
 'Arrow',
 'Hudson',
 'Jamison',
 'Wisdom',
 'Henley',
 'Jessy',
 'Leighton',
 'Charley',
 'Micaiah',
 'Adrian',
 'Yuri',
 'Ira',
 'London',
 'Rylee',
 'Hendrix',
 'Jesse',
 'Devon',
 'Tenzin',
 'Clarke',
 'Tristyn']

In [59]:
# for x in list(neutral):
#     common_names.loc[common_names.Name == x, 'Gender'] = 2

In [51]:
common_names.head()

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,2,19511
3,Liam,0,18281
4,Sophia,1,17327


In [52]:
common_names.Gender.value_counts()

1    4089
0    3101
2     938
Name: Gender, dtype: int64

In [54]:
common_names[:25]

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,2,19511
3,Liam,0,18281
4,Sophia,1,17327
5,Mason,2,16535
6,Ava,1,16286
7,Jacob,0,15816
8,William,0,15809
9,Isabella,1,15504


In [58]:
# common_names.drop('Frequency', axis=1, inplace=True)
# common_names.drop_duplicates(inplace=True)

In [56]:
common_names.head()

Unnamed: 0,Name,Gender
0,Emma,1
1,Olivia,1
2,Noah,2
3,Liam,0
4,Sophia,1


In [57]:
common_names.Gender.value_counts()

1    4089
0    3101
2     469
Name: Gender, dtype: int64

#### Function to reassign gender neutral names to value 2 and to drop duplicates

In [45]:
# function to reassign gender value 2 to gender neutral names and drop duplicates from single dataframe
# for this dataset, would need to drop Include? and Frequency columns before using this function

# def reassign_gender_neutral_drop_duplicates(df):
    
#     girl_names = []
#     boy_names = []

#     for x in df.values:
#         if x[1] == 1:
#             girl_names.append(x[0])
#         if x[1] == 0:
#             boy_names.append(x[0])
    
#     neutral = set(girl_names).intersection(set(boy_names))
    
#     for x in list(neutral):
#         df.loc[df.name == x, 'gender'] = 2
        
#     df.drop_duplicates(inplace=True)

### Split dataframe into three series
* Female names
* Male names
* Gender-neutral names

In [60]:
male_names = common_names.loc[common_names.Gender == 0]
female_names = common_names.loc[common_names.Gender == 1]
neutral_names = common_names.loc[common_names.Gender == 2]

In [62]:
male_names.head()

Unnamed: 0,Name,Gender
3,Liam,0
7,Jacob,0
8,William,0
10,Ethan,0
13,Alexander,0


In [63]:
female_names.head()

Unnamed: 0,Name,Gender
0,Emma,1
1,Olivia,1
4,Sophia,1
6,Ava,1
9,Isabella,1


In [64]:
neutral_names.head()

Unnamed: 0,Name,Gender
2,Noah,2
5,Mason,2
12,James,2
14,Michael,2
18,Aiden,2


#### Drop gender from each new DataFrame

In [66]:
# for df in male_names, female_names, neutral_names:
#     df.drop('Gender', axis=1, inplace=True)

### Create new .csv files from each Series

In [68]:
male_names.to_csv('male_names.csv', index=False)

In [69]:
# check to see if that worked
males = pd.read_csv('male_names.csv')
males.head()

Unnamed: 0,Name
0,Liam
1,Jacob
2,William
3,Ethan
4,Alexander


In [70]:
female_names.to_csv('female_names.csv', index=False)
neutral_names.to_csv('neutral_names.csv', index=False)

## II. Movie Datasets
* Wishlist for one dataset containing
    * Title
    * Year
    * Genre
    * Different ratings (Rotten Tomatoes, IMDB, critic rating, user ratings)
    * Director(s) name
    * Director(s) gender
    * Writer(s) name
    * Writer(s) gender
    * Producer(s) name
    * Producer(s) gender
    * Lead actor(s) name
    * Lead actor(s) gender
    * Budget
    * Box office revenue
    * Bechdel points (0,1,2, or 3)
    * Bechdel binary pass or fail
    * Magical wish: production company and information on company

### A. Hydra Movies Dataset
* Note: rating may be user rating from Hydra Movies website, which I can't seem to find. . . 
* IMDB ID included

In [73]:
hydra_df = pd.read_csv('original_data/Hydra-Movie-Scrape.csv')
hydra_df.head()

Unnamed: 0,Title,Year,Summary,Short Summary,Genres,IMDB ID,Runtime,YouTube Trailer,Rating,Movie Poster,Director,Writers,Cast
0,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...","Patton Oswalt, despite a personal tragedy, pro...",Uncategorized,tt7026230,66,4hZi5QaMBFc,7.4,https://hydramovies.com/wp-content/uploads/201...,Bobcat Goldthwait,Patton Oswalt,Patton Oswalt
1,New York Doll,2005,A recovering alcoholic and recently converted ...,A recovering alcoholic and recently converted ...,Documentary|Music,tt0436629,75,jwD04NsnLLg,7.9,https://hydramovies.com/wp-content/uploads/201...,Greg Whiteley,Arthur Kane,Sylvain Sylvain
2,Mickey's Magical Christmas: Snowed in at the H...,2001,After everyone is snowed in at the House of Mo...,Mickey and all his friends hold their own Chri...,Adventure|Animation|Comedy|Family|Fantasy,tt0300195,65,uCKwHHftrU4,6.8,https://hydramovies.com/wp-content/uploads/201...,Tony Craig,Thomas Hart,Carlos Alazraqui|Wayne Allwine
3,Mickey's House of Villains,2001,The villains from the popular animated Disney ...,The villains from the popular animated Disney ...,Animation|Comedy|Family|Fantasy|Horror,tt0329374,0,JA03ciYt-Ek,6.6,https://hydramovies.com/wp-content/uploads/201...,Jamie Mitchell,Thomas Hart,Tony Anselmo|Wayne Allwine
4,And Then I Go,2017,"In the cruel world of junior high, Edwin suffe...","In the cruel world of junior high, Edwin suffe...",Drama,tt2018111,99,8CdIiD6-iF0,7.6,https://hydramovies.com/wp-content/uploads/201...,Vincent Grashaw,Brett Haley,Arman Darbo|Sawyer Barth


In [77]:
hydra_df.shape

(3940, 13)

In [84]:
hydra_movies = []
for movie in hydra_df.Title:
    hydra_movies.append(movie.lower().strip())

In [90]:
# hydra_movies[:50]

### B. IMDB Movies Dataset
* Note: not sure what num_critic_for_reviews represents
* IMDB ID is included, in an annoying way

In [75]:
imdb_df = pd.read_csv('original_data/movie_metadata.csv')
imdb_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [78]:
imdb_df.shape

(5043, 28)

In [79]:
imdb_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [80]:
imdb_df.movie_imdb_link.head()

0    http://www.imdb.com/title/tt0499549/?ref_=fn_t...
1    http://www.imdb.com/title/tt0449088/?ref_=fn_t...
2    http://www.imdb.com/title/tt2379713/?ref_=fn_t...
3    http://www.imdb.com/title/tt1345836/?ref_=fn_t...
4    http://www.imdb.com/title/tt5289954/?ref_=fn_t...
Name: movie_imdb_link, dtype: object

IMDB id contained in IMDB link

In [86]:
imdb_movies = []
for movie in imdb_df.movie_title:
    imdb_movies.append(movie.lower().strip())

In [89]:
# imdb_movies[:50]

### C. TMDB Movies Dataset
* Note: vote average and vote count specific to TMDB website
* IMDB ID not included, but could possibly be found in API if joining on titles doesn't work well

In [76]:
tmdb_df = pd.read_csv('original_data/Movies.csv')
tmdb_df.head()

Unnamed: 0,revenue,vote_average,vote_count,title,original_language,release_date,production_companies,production_countries,genres,director,producer,cast,runtimes,writer
0,4300000,6.6,714,Four Rooms,en,1995-12-09,"Miramax Films,A Band Apart",United States of America,"Crime,Comedy","Allison Anders,Alexandre Rockwell,Robert Rodri...","Lawrence Bender,Paul Hellerman,Scott Lambert,","Sammi Davis,Amanda De Cadenet,Valeria Golino,",98.0,"Allison Anders,Alexandre Rockwell,Robert Rodri..."
1,12,6.4,96,Judgment Night,en,1993-10-15,"Universal Pictures,Largo Entertainment,JVC Ent...","Japan,United States of America","Action,Thriller,Crime",Stephen Hopkins,"Gene Levy,Christopher Mollo,Lloyd Segan,","Emilio Estevez,Cuba Gooding Jr.,Denis Leary,",110.0,"Lewis Colick,Jere Cunningham,Lewis Colick"
2,775398007,8.1,8550,Star Wars,en,1977-05-25,"Lucasfilm,20th Century Fox",United States of America,"Adventure,Action,Science Fiction",George Lucas,"Gary Kurtz,George Lucas,Rick McCallum","Mark Hamill,Harrison Ford,Carrie Fisher,",121.0,George Lucas
3,940335536,7.7,8085,Finding Nemo,en,2003-05-30,Pixar,United States of America,"Animation,Family","Andrew Stanton,Lee Unkrich","Jinko Gotoh,John Lasseter,Graham Walters","Albert Brooks,Ellen DeGeneres,Alexander Gould,",100.0,"Andrew Stanton,Andrew Stanton,Bob Peterson,"
4,677945399,8.3,10346,Forrest Gump,en,1994-07-06,Paramount,United States of America,"Comedy,Drama,Romance",Robert Zemeckis,"Wendy Finerman,Charles Newirth,Steve Starkey,","Tom Hanks,Rebecca Williams,Sally Field,",142.0,"Winston Groom,Eric Roth"


In [81]:
tmdb_df.shape

(10000, 14)

In [88]:
tmdb_movies = []
for movie in tmdb_df.title:
    tmdb_movies.append(movie.lower().strip())

### Check for movies shared between all 3 datasets

In [94]:
# movies_in_three = set(hydra_movies).intersection(set(imdb_movies)).intersection(set(tmdb_movies))
movies_in_three = set(hydra_movies) & set(imdb_movies) & set(tmdb_movies)

In [95]:
movies_in_three = list(movies_in_three)
print(f'There are {len(movies_in_three)} movies in all three datasets')

There are 772 movies in all three datasets


In [93]:
# oh no lol

In [97]:
hydra_imdb_shared = set(hydra_movies) & set(imdb_movies)
hydra_tmdb_shared = set(hydra_movies) & set(tmdb_movies)
imdb_tmdb_shared = set(imdb_movies) & set(tmdb_movies)

print(f'There are {len(list(hydra_imdb_shared))} movies in both the hydra and imdb datasets.')
print(f'There are {len(list(hydra_tmdb_shared))} movies in both the hydra and tmdb datasets.')
print(f'There are {len(list(imdb_tmdb_shared))} movies in both the imdb and tmdb datasets.')

There are 1607 movies in both the hydra and imdb datasets.
There are 972 movies in both the hydra and tmdb datasets.
There are 2660 movies in both the imdb and tmdb datasets.


### Confirm shared movies in hydra and imdb datasets with IMDB id

In [98]:
imdb_df.movie_imdb_link[0]

'http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1'

In [99]:
link_bits = imdb_df.movie_imdb_link[0].split('/')
link_bits

['http:', '', 'www.imdb.com', 'title', 'tt0499549', '?ref_=fn_tt_tt_1']

In [113]:
imdb_ids = []
movie_links = imdb_df.movie_imdb_link.values
for link in movie_links:
    bits = link.split('/')
    imdb_ids.append(bits[4])

In [115]:
len(imdb_ids)

5043

In [116]:
imdb_df.shape

(5043, 28)

In [117]:
imdb_ids[:10]

['tt0499549',
 'tt0449088',
 'tt2379713',
 'tt1345836',
 'tt5289954',
 'tt0401729',
 'tt0413300',
 'tt0398286',
 'tt2395427',
 'tt0417741']

In [118]:
imdb_df['imdb_id'] = imdb_ids

In [119]:
imdb_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,imdb_id
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,tt0499549
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,tt0449088
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,tt2379713
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,tt1345836
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,tt5289954


In [120]:
hydra_ids = []
for imdb_id in hydra_df['IMDB ID']:
    hydra_ids.append(imdb_id)

In [121]:
# the letters i and d are starting to look less and less like real letters
imdb_df_ids = []
for i_id in imdb_df.imdb_id:
    imdb_df_ids.append(i_id)

In [124]:
hydra_imdb_ids_shared = set(hydra_ids) & set(imdb_df_ids)
print(f'There are {len(list(hydra_imdb_ids_shared))} total shared ids in the hydra and imdb datasets, and \
I hope this is the same number I got before, which was. . . 1607.')

There are 1580 total shared ids in the hydra and imdb datasets, and I hope this is the same number I got before, which was. . . 1607.


In [125]:
# noooooooo!

In [126]:
hydra_df.isnull().sum()

Title               0
Year                0
Summary             5
Short Summary       1
Genres              0
IMDB ID             0
Runtime             0
YouTube Trailer    47
Rating              0
Movie Poster        0
Director            0
Writers            18
Cast               24
dtype: int64

## III. Clean Hydra Movies Dataset - To Do
The hydra movies seem more obscure than the movies in the other datasets, making them less likely to have Bechdel Test scores. So I'll deal with these later. 

In [127]:
hydra_df.isnull().sum()

Title               0
Year                0
Summary             5
Short Summary       1
Genres              0
IMDB ID             0
Runtime             0
YouTube Trailer    47
Rating              0
Movie Poster        0
Director            0
Writers            18
Cast               24
dtype: int64

### A. Drop extraneous columns
Columns that could prettify dashboard, but are not needed for analysis
* YouTube Trailer
* Movie Poster
Columns that could be useful later

## IV. Clean IMDB Movies Dataset

In [128]:
imdb_df.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
imdb_id                        0
dtype: int64

### A. Drop extraneous columns
Columns that are unique to this dataset, that I may use later for extra analyses. The facebook likes columns could probably be used as their own dataset later for analysis.
* color
* number of critics for reviews
* duration
* director facebook likes 
* actor 3 facebook likes
* actor 1 facebook likes
* number voted users
* total facebook likes for cast (?)
* face number in poster
* plot keywords
* movie imdb link
* number of users for reviews
* language
* country
* content rating
* actor 2 facebook likes
* aspect ratio
* movie facebook likes

In [130]:
imdb_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'imdb_id'],
      dtype='object')

In [133]:
cols_to_drop = ['color', 'num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', \
               'actor_1_facebook_likes', 'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster', \
               'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', \
               'actor_2_facebook_likes', 'aspect_ratio', 'movie_facebook_likes']

In [134]:
imdb_df.drop(columns=cols_to_drop, axis=1, inplace=True)

In [135]:
imdb_df.head()

Unnamed: 0,director_name,actor_2_name,gross,genres,actor_1_name,movie_title,actor_3_name,budget,title_year,imdb_score,imdb_id
0,James Cameron,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,Wes Studi,237000000.0,2009.0,7.9,tt0499549
1,Gore Verbinski,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,300000000.0,2007.0,7.1,tt0449088
2,Sam Mendes,Rory Kinnear,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,Stephanie Sigman,245000000.0,2015.0,6.8,tt2379713
3,Christopher Nolan,Christian Bale,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,250000000.0,2012.0,8.5,tt1345836
4,Doug Walker,Rob Walker,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,,,,7.1,tt5289954


In [None]:
# wait. . . Daniel Craig - who playes JAMES BOND IN A JAMES BOND FILM - is not listed as actor 1, 2, OR 3???
# uggghhhhhhhhh
# uggghhhhghghghghgh
# not sure if I can trust this dataset

In [136]:
imdb_df.isnull().sum()

director_name    104
actor_2_name      13
gross            884
genres             0
actor_1_name       7
movie_title        0
actor_3_name      23
budget           492
title_year       108
imdb_score         0
imdb_id            0
dtype: int64

In [137]:
# could probably use IMDbPY to fill in missing values

## V. Clean the TMDB Movies Dataset

In [2]:
tmdb_df = pd.read_csv('other_data_that_is_not_to_be_trusted/movies.csv')

In [3]:
tmdb_df.head()

Unnamed: 0,revenue,vote_average,vote_count,title,original_language,release_date,production_companies,production_countries,genres,director,producer,cast,runtimes,writer
0,4300000,6.6,714,Four Rooms,en,1995-12-09,"Miramax Films,A Band Apart",United States of America,"Crime,Comedy","Allison Anders,Alexandre Rockwell,Robert Rodri...","Lawrence Bender,Paul Hellerman,Scott Lambert,","Sammi Davis,Amanda De Cadenet,Valeria Golino,",98.0,"Allison Anders,Alexandre Rockwell,Robert Rodri..."
1,12,6.4,96,Judgment Night,en,1993-10-15,"Universal Pictures,Largo Entertainment,JVC Ent...","Japan,United States of America","Action,Thriller,Crime",Stephen Hopkins,"Gene Levy,Christopher Mollo,Lloyd Segan,","Emilio Estevez,Cuba Gooding Jr.,Denis Leary,",110.0,"Lewis Colick,Jere Cunningham,Lewis Colick"
2,775398007,8.1,8550,Star Wars,en,1977-05-25,"Lucasfilm,20th Century Fox",United States of America,"Adventure,Action,Science Fiction",George Lucas,"Gary Kurtz,George Lucas,Rick McCallum","Mark Hamill,Harrison Ford,Carrie Fisher,",121.0,George Lucas
3,940335536,7.7,8085,Finding Nemo,en,2003-05-30,Pixar,United States of America,"Animation,Family","Andrew Stanton,Lee Unkrich","Jinko Gotoh,John Lasseter,Graham Walters","Albert Brooks,Ellen DeGeneres,Alexander Gould,",100.0,"Andrew Stanton,Andrew Stanton,Bob Peterson,"
4,677945399,8.3,10346,Forrest Gump,en,1994-07-06,Paramount,United States of America,"Comedy,Drama,Romance",Robert Zemeckis,"Wendy Finerman,Charles Newirth,Steve Starkey,","Tom Hanks,Rebecca Williams,Sally Field,",142.0,"Winston Groom,Eric Roth"


In [140]:
tmdb_df.isnull().sum()

revenue                    0
vote_average               0
vote_count                 0
title                      0
original_language          0
release_date               8
production_companies    1485
production_countries       0
genres                   243
director                  69
producer                 115
cast                      32
runtimes                 120
writer                   290
dtype: int64

In [141]:
tmdb_df.shape

(10000, 14)

In [4]:
tmdb_sample = tmdb_df.sample(n=10)

In [5]:
tmdb_sample

Unnamed: 0,revenue,vote_average,vote_count,title,original_language,release_date,production_companies,production_countries,genres,director,producer,cast,runtimes,writer
6159,0,6.2,6,It Happened in Brooklyn,en,1947-04-07,Metro-Goldwyn-Mayer,United States of America,"Comedy,Music,Romance",Richard Whorf,Jack Cummings,"Frank Sinatra,Kathryn Grayson,Peter Lawford,",104.0,"Isobel Lennart,Jack McGowan"
9078,0,6.4,45,The Pope of Greenwich Village,en,1984-06-22,United Artists,United States of America,"Drama,Action,Comedy,Crime",Stuart Rosenberg,"Gene Kirkwood,Hawk Koch,Benjamin Rosenberg","Eric Roberts,Mickey Rourke,Daryl Hannah,",121.0,"Vincent Patrick,Vincent Patrick"
3403,30626182,5.3,108,Major League II,en,1994-03-30,"Morgan Creek Productions,Warner Bros. Pictures",United States of America,Comedy,David S. Ward,"Gary Barber,Edward D. Markley,Julia Miller,","Charlie Sheen,Tom Berenger,Corbin Bernsen,",105.0,"David S. Ward,R.J. Stewart,Tom S. Parker,"
2878,15360553,6.6,289,The Witches,en,1990-05-25,"Lorimar Film Entertainment,Jim Henson Productions","United Kingdom,United States of America","Adventure,Fantasy,Horror",Nicolas Roeg,"Jim Henson,Mark Shivas,Dusty Symonds","Anjelica Huston,Mai Zetterling,Jasen Fisher,",91.0,"Roald Dahl,Allan Scott"
3038,11285588,7.2,228,Manhattan Murder Mystery,en,1993-08-18,TriStar Pictures,United States of America,"Comedy,Mystery",Woody Allen,"Robert Greenhut,Joseph Hartwick,Charles H. Joffe,","Woody Allen,Diane Keaton,Jerry Adler,",104.0,"Woody Allen,Marshall Brickman"
7461,0,6.7,48,The Naked Kiss,en,1964-10-29,"The Criterion Collection,Leon Fromkess-Sam Fir...",United States of America,"Crime,Drama",Samuel Fuller,"Samuel Fuller,Sam Firks,Leon Fromkess","Karen Conrad,Marie Devereux,Betty Bronson,",90.0,Samuel Fuller
4692,3,6.0,74,Feast of Love,en,2007-09-28,"Revelations Entertainment,Greenestreet Films,L...",United States of America,"Comedy,Drama,Romance",Robert Benton,"Marisa Forzano,Ted Gidlow,Gary Lucchesi,","Morgan Freeman,Greg Kinnear,Radha Mitchell,",97.0,"Allison Burnett,Charles Baxter"
3934,17900000,6.8,343,Bedknobs and Broomsticks,en,1971-10-07,Walt Disney Productions,United States of America,"Adventure,Fantasy,Animation,Comedy,Family,Music",Robert Stevenson,Bill Walsh,"Angela Lansbury,David Tomlinson,Roddy McDowall,",117.0,"Ralph Wright,Ted Berman,Bill Walsh,"
2240,16478900,5.1,101,Terminal Velocity,en,1994-09-23,"Hollywood Pictures,Interscope Communications,P...",United States of America,"Action,Thriller",Deran Sarafian,"Ron Booth,Joan Bradshaw,Robert W. Cort,","Charlie Sheen,Nastassja Kinski,James Gandolfini,",102.0,David Twohy
8983,0,5.8,5,Barbra Streisand: One Voice,en,1986-12-27,HBO,United States of America,"Music,TV Movie",Jack Arnold,Albert Zugsmith,"Grant Williams,Randy Stuart,April Kent,",81.0,"Richard Matheson,Richard Matheson,Richard Alan..."


In [6]:
# it happened in brooklyn - verified
# the pope of greenwich village - info verified, though missing box office which is on wikipedia
# ok all this stuff looks accurate - I think we can trust this dataset

In [7]:
bechdel_df = pd.read_csv('my_data/bechdel_test_movies.csv')
bechdel_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,1,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,1,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,1,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,0,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,0,tt2386490,http://us.imdb.com/title/tt2386490/


In [8]:
bechdel_movies = []
for m in bechdel_df.title:
    bechdel_movies.append(m.lower().strip())

tmdb_movies = []
for t in tmdb_df.title:
    tmdb_movies.append(t.lower().strip())

In [9]:
shared_bechdel_tmdb_movies = set(bechdel_movies) & set(tmdb_movies)

In [10]:
len(shared_bechdel_tmdb_movies)

2816