## I. First Names Dataset

In [1]:
import pandas as pd
import numpy as np

In [41]:
name_df = pd.read_excel('SSA_Names_DB.xlsx')
name_df.head()

Unnamed: 0,Name,Gender,Frequency,Include?
0,Emma,F,20355,Yes
1,Olivia,F,19553,Yes
2,Noah,M,19511,Yes
3,Liam,M,18281,Yes
4,Sophia,F,17327,Yes


### Clean name dataset
* Drop `Frequency` and `Include?` columns
* Change genders to numerical values
    * `M` $\rightarrow$ `0`
    * `F` $\rightarrow$ `1`
    * Gender neutral names included in both categories $\rightarrow$ `2`
* Drop duplicated gender neutral names

In [93]:
name_df.Gender.value_counts()

F    18993
M    13959
Name: Gender, dtype: int64

In [94]:
name_df['Include?'].value_counts()

No     27252
Yes     5700
Name: Include?, dtype: int64

In [42]:
# change gender values to binary
name_df.Gender.replace({'M': 0, 'F': 1}, inplace=True)

In [17]:
name_df.head()

Unnamed: 0,Name,Gender,Frequency,Include?
0,Emma,1,20355,Yes
1,Olivia,1,19553,Yes
2,Noah,0,19511,Yes
3,Liam,0,18281,Yes
4,Sophia,1,17327,Yes


In [5]:
# according to the data dictionary, the Include? column has to do with number of social security applications and
# is not needed for this project

In [43]:
name_df.drop('Include?', axis=1, inplace=True)

In [44]:
name_df.head()

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,0,19511
3,Liam,0,18281
4,Sophia,1,17327


In [80]:
name_df.values[0]

array(['Emma', 1, 20355], dtype=object)

In [99]:
female_names = []
male_names = []

for x in name_df.values:
    if x[1] == 1:
        female_names.append(x[0])
    if x[1] == 0:
        male_names.append(x[0])

In [100]:
any(i in female_names for i in male_names)

True

In [101]:
neutral = set(female_names).intersection(set(male_names))

In [102]:
len(neutral)

2492

In [86]:
list(neutral)[:10]

[True,
 'Zyair',
 'Amour',
 'Kentley',
 'Jermiah',
 'Yichen',
 'Truett',
 'Channing',
 'Brody',
 'Christian']

In [8]:
name_df.loc[name_df.Name == 'Olivia']

Unnamed: 0,Name,Gender,Frequency
1,Olivia,1,19553
22217,Olivia,0,8


In [19]:
name_df.loc[name_df.Name == 'Emma']

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
18390,Emma,0,10


In [109]:
len(name_df.loc[name_df.Frequency < 20])

21698

In [112]:
len(name_df.loc[name_df.Frequency < 15])

19037

In [45]:
# try names that appear more than 30 times, because using 15 as the threshold still included some males with the name
# of 'Emily'
common_names = name_df.loc[name_df.Frequency > 30]

In [46]:
common_names.shape

(8128, 3)

In [47]:
female_names = []
male_names = []

for x in common_names.values:
    if x[1] == 1:
        female_names.append(x[0])
    if x[1] == 0:
        male_names.append(x[0])

neutral = set(female_names).intersection(set(male_names))

In [48]:
len(neutral)

469

In [49]:
list(neutral)[:50]

[True,
 'Jai',
 'Noor',
 'Monroe',
 'Chase',
 'Porter',
 'Kyler',
 'Landry',
 'Cypress',
 'Kyree',
 'Jayce',
 'Jordan',
 'Kingsley',
 'Bryar',
 'Cassidy',
 'Cooper',
 'Easton',
 'Brooklyn',
 'Austin',
 'Fallon',
 'Parker',
 'Tory',
 'Sunny',
 'Teagan',
 'Harper',
 'Lennon',
 'Tatum',
 'Daylin',
 'Maddox',
 'Lane',
 'Arrow',
 'Hudson',
 'Jamison',
 'Wisdom',
 'Henley',
 'Jessy',
 'Leighton',
 'Charley',
 'Micaiah',
 'Adrian',
 'Yuri',
 'Ira',
 'London',
 'Rylee',
 'Hendrix',
 'Jesse',
 'Devon',
 'Tenzin',
 'Clarke',
 'Tristyn']

In [59]:
# for x in list(neutral):
#     common_names.loc[common_names.Name == x, 'Gender'] = 2

In [51]:
common_names.head()

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,2,19511
3,Liam,0,18281
4,Sophia,1,17327


In [52]:
common_names.Gender.value_counts()

1    4089
0    3101
2     938
Name: Gender, dtype: int64

In [54]:
common_names[:25]

Unnamed: 0,Name,Gender,Frequency
0,Emma,1,20355
1,Olivia,1,19553
2,Noah,2,19511
3,Liam,0,18281
4,Sophia,1,17327
5,Mason,2,16535
6,Ava,1,16286
7,Jacob,0,15816
8,William,0,15809
9,Isabella,1,15504


In [58]:
# common_names.drop('Frequency', axis=1, inplace=True)
# common_names.drop_duplicates(inplace=True)

In [56]:
common_names.head()

Unnamed: 0,Name,Gender
0,Emma,1
1,Olivia,1
2,Noah,2
3,Liam,0
4,Sophia,1


In [57]:
common_names.Gender.value_counts()

1    4089
0    3101
2     469
Name: Gender, dtype: int64

#### Function to reassign gender neutral names to value 2 and to drop duplicates

In [45]:
# function to reassign gender value 2 to gender neutral names and drop duplicates from single dataframe
# for this dataset, would need to drop Include? and Frequency columns before using this function

# def reassign_gender_neutral_drop_duplicates(df):
    
#     girl_names = []
#     boy_names = []

#     for x in df.values:
#         if x[1] == 1:
#             girl_names.append(x[0])
#         if x[1] == 0:
#             boy_names.append(x[0])
    
#     neutral = set(girl_names).intersection(set(boy_names))
    
#     for x in list(neutral):
#         df.loc[df.name == x, 'gender'] = 2
        
#     df.drop_duplicates(inplace=True)

### Split dataframe into three series
* Female names
* Male names
* Gender-neutral names

In [60]:
male_names = common_names.loc[common_names.Gender == 0]
female_names = common_names.loc[common_names.Gender == 1]
neutral_names = common_names.loc[common_names.Gender == 2]

In [62]:
male_names.head()

Unnamed: 0,Name,Gender
3,Liam,0
7,Jacob,0
8,William,0
10,Ethan,0
13,Alexander,0


In [63]:
female_names.head()

Unnamed: 0,Name,Gender
0,Emma,1
1,Olivia,1
4,Sophia,1
6,Ava,1
9,Isabella,1


In [64]:
neutral_names.head()

Unnamed: 0,Name,Gender
2,Noah,2
5,Mason,2
12,James,2
14,Michael,2
18,Aiden,2


#### Drop gender from each new DataFrame

In [66]:
# for df in male_names, female_names, neutral_names:
#     df.drop('Gender', axis=1, inplace=True)

### Create new .csv files from each Series

In [68]:
male_names.to_csv('male_names.csv', index=False)

In [69]:
# check to see if that worked
males = pd.read_csv('male_names.csv')
males.head()

Unnamed: 0,Name
0,Liam
1,Jacob
2,William
3,Ethan
4,Alexander


In [70]:
female_names.to_csv('female_names.csv', index=False)
neutral_names.to_csv('neutral_names.csv', index=False)

## II. Movie Datasets

### A. Hydra Movies Dataset

In [73]:
hydra_df = pd.read_csv('original_data/Hydra-Movie-Scrape.csv')
hydra_df.head()

Unnamed: 0,Title,Year,Summary,Short Summary,Genres,IMDB ID,Runtime,YouTube Trailer,Rating,Movie Poster,Director,Writers,Cast
0,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...","Patton Oswalt, despite a personal tragedy, pro...",Uncategorized,tt7026230,66,4hZi5QaMBFc,7.4,https://hydramovies.com/wp-content/uploads/201...,Bobcat Goldthwait,Patton Oswalt,Patton Oswalt
1,New York Doll,2005,A recovering alcoholic and recently converted ...,A recovering alcoholic and recently converted ...,Documentary|Music,tt0436629,75,jwD04NsnLLg,7.9,https://hydramovies.com/wp-content/uploads/201...,Greg Whiteley,Arthur Kane,Sylvain Sylvain
2,Mickey's Magical Christmas: Snowed in at the H...,2001,After everyone is snowed in at the House of Mo...,Mickey and all his friends hold their own Chri...,Adventure|Animation|Comedy|Family|Fantasy,tt0300195,65,uCKwHHftrU4,6.8,https://hydramovies.com/wp-content/uploads/201...,Tony Craig,Thomas Hart,Carlos Alazraqui|Wayne Allwine
3,Mickey's House of Villains,2001,The villains from the popular animated Disney ...,The villains from the popular animated Disney ...,Animation|Comedy|Family|Fantasy|Horror,tt0329374,0,JA03ciYt-Ek,6.6,https://hydramovies.com/wp-content/uploads/201...,Jamie Mitchell,Thomas Hart,Tony Anselmo|Wayne Allwine
4,And Then I Go,2017,"In the cruel world of junior high, Edwin suffe...","In the cruel world of junior high, Edwin suffe...",Drama,tt2018111,99,8CdIiD6-iF0,7.6,https://hydramovies.com/wp-content/uploads/201...,Vincent Grashaw,Brett Haley,Arman Darbo|Sawyer Barth


### B. IMDB Movies Dataset

### C. TMDB Movies Dataset