In [1]:
import pandas as pd

#### Check value counts for each `gender` column 

In [2]:
# Load the movies dataset from the local CSV and preview its first rows.
movies_2011_and_beyond = pd.read_csv('my_data/movies_2011_and_beyond.csv')
movies_2011_and_beyond.head()

Unnamed: 0,movieTitle,directorName,directorGender,writerName,writerGender,screenwriterName,screenwriterGender,executiveproducerName,executiveproducerGender,producerName,producerGender,budgetAmount,boxofficeAmount,imdbId
0,Fast Five,Neal H. Moritz,male,,,Chris Morgan,male,Justin Lin,male,Vin Diesel,male,,626100000.0,tt1596343
1,Albert Nobbs,Rodrigo García,male,,,Glenn Close,female,,,Bonnie Curtis,female,,,tt1602098
2,Albert Nobbs,Rodrigo García,male,,,George Moore,male,,,Bonnie Curtis,female,,,tt1602098
3,300: Rise of an Empire,Noam Murro,male,,,Kurt Johnstad,male,,,Thomas Tull,male,110000000.0,337580051.0,tt1253863
4,Fast Five,Neal H. Moritz,male,,,Chris Morgan,male,Justin Lin,male,Neal H. Moritz,male,,626100000.0,tt1596343


### Custom Functions
* TODO: add docstrings to, and tidy up, all finalized functions that are used

In [3]:
# Check for all unique values in the gender columns. My column names are used within the function since they're all the same.

def check_unique_gender_values(df, gender_columns=None):
    """Print the unique values found in each gender column of ``df``.

    For every column, the column name is printed on one line, followed by
    the result of ``df[col].unique()`` (missing values are included in
    that result).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the gender columns to inspect.
    gender_columns : iterable of str, optional
        Names of the columns to inspect. When omitted, the five gender
        columns used in this notebook are checked: ``directorGender``,
        ``writerGender``, ``screenwriterGender``,
        ``executiveproducerGender`` and ``producerGender``.

    Returns
    -------
    None
        The results are printed rather than returned.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``df``.
    """
    if gender_columns is None:
        # Default to the column names shared by the datasets in this notebook.
        gender_columns = [
            'directorGender',
            'writerGender',
            'screenwriterGender',
            'executiveproducerGender',
            'producerGender',
        ]
    for col in gender_columns:
        print(col)
        print(df[col].unique())

In [4]:
check_unique_gender_values(movies_2011_and_beyond)

directorGender
['male' 'female' nan 'transgender female' 'non-binary' 'transgender male']
writerGender
[nan 'male']
screenwriterGender
['male' 'female' nan 'transgender female' 'non-binary']
executiveproducerGender
['male' nan 'female']
producerGender
['male' 'female' 'transgender female' nan]


In [6]:
# investigate writerGender because this should probably be dropped and changed to screenwriter
movies_2011_and_beyond.writerName.unique()

array([nan, 'Slavoj Žižek', 'Rana Abrar'], dtype=object)

In [7]:
movies_2011_and_beyond.loc[movies_2011_and_beyond.writerName == 'Rana Abrar']

Unnamed: 0,movieTitle,directorName,directorGender,writerName,writerGender,screenwriterName,screenwriterGender,executiveproducerName,executiveproducerGender,producerName,producerGender,budgetAmount,boxofficeAmount,imdbId
15766,Black Briefcase: The Nuclear Trigger,Rana Abrar,male,Rana Abrar,male,,,,,,,,,tt7903992
16053,Son Of Kashmir Burhan,Rana Abrar,male,Rana Abrar,male,,,,,,,,,tt7675680
16390,The Evil Marriage,Rana Abrar,male,Rana Abrar,male,,,,,,,,,tt8614336


In [None]:
# Check whether these movies are in the Bechdel dataset. If not, all 3 will be dropped because there is
# not much useful information for them.

In [8]:
# Load the Bechdel test movies CSV and preview its first rows.
bechdel_df = pd.read_csv('my_data/bechdel_test_movies.csv')
bechdel_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,1,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,1,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,1,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,0,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,0,tt2386490,http://us.imdb.com/title/tt2386490/


In [16]:
# could turn this imdb_id finder into a function
# Select the imdbId column by name instead of taking the last element of each
# row, so the result does not depend on the column order of the DataFrame.
rana_abrar_imdb_ids = movies_2011_and_beyond.loc[
    movies_2011_and_beyond.writerName == 'Rana Abrar', 'imdbId'
].tolist()
rana_abrar_imdb_ids

['tt7903992', 'tt7675680', 'tt8614336']

In [14]:
def locate_rows_by_imdb_ids(dataset, name_of_imdb_id_col, imdb_id_list):
    """Look up the rows of ``dataset`` that match each IMDb id.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to search.
    name_of_imdb_id_col : str
        Name of the column in ``dataset`` that holds the IMDb ids.
    imdb_id_list : iterable
        IMDb ids to look for.

    Returns
    -------
    list of pandas.DataFrame
        One frame per requested id, in the order the ids were given. An id
        with no match yields an empty frame.
    """
    return [
        dataset.loc[dataset[name_of_imdb_id_col] == wanted_id]
        for wanted_id in imdb_id_list
    ]

In [19]:
locate_rows_by_imdb_ids(bechdel_df, name_of_imdb_id_col='imdb_id', imdb_id_list=rana_abrar_imdb_ids)

[Empty DataFrame
 Columns: [year, title, score, passing, imdb_id, imdb_link]
 Index: [], Empty DataFrame
 Columns: [year, title, score, passing, imdb_id, imdb_link]
 Index: [], Empty DataFrame
 Columns: [year, title, score, passing, imdb_id, imdb_link]
 Index: []]

In [21]:
# confirm this function is actually saying there are no matching movies in Bechdel df rather than just being a 
# broken function

bechdel_df.loc[bechdel_df.imdb_id == 'tt7903992']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link


In [22]:
bechdel_df.loc[bechdel_df.title == 'The Evil Marriage']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link


In [23]:
# check to see if function works on movies that are definitely in both lists

movies_2011_and_beyond.loc[movies_2011_and_beyond.movieTitle == 'Alita: Battle Angel']

Unnamed: 0,movieTitle,directorName,directorGender,writerName,writerGender,screenwriterName,screenwriterGender,executiveproducerName,executiveproducerGender,producerName,producerGender,budgetAmount,boxofficeAmount,imdbId
8292,Alita: Battle Angel,Robert Rodriguez,male,,,James Cameron,male,,,James Cameron,male,170000000.0,359700000.0,tt0437086
8299,Alita: Battle Angel,Robert Rodriguez,male,,,Laeta Kalogridis,female,,,James Cameron,male,170000000.0,359700000.0,tt0437086
8307,Alita: Battle Angel,Robert Rodriguez,male,,,James Cameron,male,,,Robert Rodriguez,male,170000000.0,359700000.0,tt0437086
8314,Alita: Battle Angel,Robert Rodriguez,male,,,Laeta Kalogridis,female,,,Robert Rodriguez,male,170000000.0,359700000.0,tt0437086
8331,Alita: Battle Angel,Robert Rodriguez,male,,,James Cameron,male,,,Jon Landau,male,170000000.0,359700000.0,tt0437086
8333,Alita: Battle Angel,Robert Rodriguez,male,,,Laeta Kalogridis,female,,,Jon Landau,male,170000000.0,359700000.0,tt0437086


In [24]:
movies_2011_and_beyond.loc[movies_2011_and_beyond.movieTitle == 'Close']

Unnamed: 0,movieTitle,directorName,directorGender,writerName,writerGender,screenwriterName,screenwriterGender,executiveproducerName,executiveproducerGender,producerName,producerGender,budgetAmount,boxofficeAmount,imdbId
15209,Close,Vicky Jewson,female,,,Vicky Jewson,female,,,,,,,tt5316540


In [25]:
def locate_imdb_ids(df, column_to_search, value_list_to_search, imdb_id_column):
    """Collect the IMDb ids of the rows whose ``column_to_search`` matches a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    column_to_search : str
        Name of the column compared against each search value.
    value_list_to_search : iterable
        Values to look for in ``column_to_search``.
    imdb_id_column : str
        Name of the column in ``df`` that holds the IMDb ids.

    Returns
    -------
    list
        IMDb ids of the matching rows, ordered by search value and then by
        row order. Each id appears only once, even when several rows share
        it (for example one row per screenwriter/producer combination).
    """
    imdb_ids = []
    seen = set()
    for val in value_list_to_search:
        # Pick the id column directly from the rows that match this value.
        matching_ids = df.loc[df[column_to_search] == val, imdb_id_column]
        for imdb_id in matching_ids.tolist():
            if imdb_id not in seen:
                seen.add(imdb_id)
                imdb_ids.append(imdb_id)
    return imdb_ids

In [27]:
# okay that didn't really work

In [30]:
def locate_rows_in_df_by_values(df, column_to_search, value_list_to_search):
    """Print the rows of ``df`` matching each value in ``value_list_to_search``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    column_to_search : str
        Name of the column compared against each search value.
    value_list_to_search : iterable
        Values to look for; one block of rows is printed per value, in the
        order given.

    Returns
    -------
    None
        The matching rows are printed rather than returned.
    """
    for wanted in value_list_to_search:
        is_match = df[column_to_search] == wanted
        print(df.loc[is_match])

In [32]:
movies_to_search = ['Close', 'Alita: Battle Angel', 'American Woman']
locate_rows_in_df_by_values(movies_2011_and_beyond, 'movieTitle', movies_to_search)

      movieTitle  directorName directorGender writerName writerGender  \
15209      Close  Vicky Jewson         female        NaN          NaN   

      screenwriterName screenwriterGender executiveproducerName  \
15209     Vicky Jewson             female                   NaN   

      executiveproducerGender producerName producerGender  budgetAmount  \
15209                     NaN          NaN            NaN           NaN   

       boxofficeAmount     imdbId  
15209              NaN  tt5316540  
               movieTitle      directorName directorGender writerName  \
8292  Alita: Battle Angel  Robert Rodriguez           male        NaN   
8299  Alita: Battle Angel  Robert Rodriguez           male        NaN   
8307  Alita: Battle Angel  Robert Rodriguez           male        NaN   
8314  Alita: Battle Angel  Robert Rodriguez           male        NaN   
8331  Alita: Battle Angel  Robert Rodriguez           male        NaN   
8333  Alita: Battle Angel  Robert Rodriguez           mal

In [34]:
test_imdb_ids = ['tt0437086', 'tt5316540']
locate_rows_by_imdb_ids(movies_2011_and_beyond, 'imdbId', test_imdb_ids)

[               movieTitle      directorName directorGender writerName  \
 8292  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 8299  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 8307  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 8314  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 8331  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 8333  Alita: Battle Angel  Robert Rodriguez           male        NaN   
 
      writerGender  screenwriterName screenwriterGender executiveproducerName  \
 8292          NaN     James Cameron               male                   NaN   
 8299          NaN  Laeta Kalogridis             female                   NaN   
 8307          NaN     James Cameron               male                   NaN   
 8314          NaN  Laeta Kalogridis             female                   NaN   
 8331          NaN     James Cameron               male                   N

In [35]:
locate_rows_by_imdb_ids(bechdel_df, 'imdb_id', test_imdb_ids)

[   year                title  score  passing    imdb_id  \
 0  2019  Alita: Battle Angel      3        1  tt0437086   
 
                              imdb_link  
 0  http://us.imdb.com/title/tt0437086/  ,
    year  title  score  passing    imdb_id                            imdb_link
 2  2019  Close      3        1  tt5316540  http://us.imdb.com/title/tt5316540/]

In [33]:
movies_2011_and_beyond.loc[movies_2011_and_beyond.movieTitle == 'American Woman']

Unnamed: 0,movieTitle,directorName,directorGender,writerName,writerGender,screenwriterName,screenwriterGender,executiveproducerName,executiveproducerGender,producerName,producerGender,budgetAmount,boxofficeAmount,imdbId


In [36]:
# well the outputs don't look pretty, but at least they work!