In [1]:
import pandas as pd
from data_structures.Entity import Entity
import pickle as pkl

In [2]:
# Read datasets. RUN EACH RESTART OF KERNEL
df_title_principals = pd.read_csv('data/title.principals.tsv', sep='\t')
df_title_basics = pd.read_csv('data/title.basics.tsv', sep='\t')
df_name_basics = pd.read_csv('data/name.basics.tsv', sep='\t')

In [10]:
# Load saved datasets
with open("data/peopleids_principals.pkl", "rb") as file:
    all_people_per_film = pkl.load(file) # length 9,260,061

with open("data/films.pkl", "rb") as file:
    all_films = pkl.load(file) # length 10,225,586

with open("data/people.pkl", "rb") as file:
    all_people = pkl.load(file) # length 12,915,563

In [None]:
# if necessary
with open("data/df_title_basics_"+str(year)+".pkl", "rb") as file:
    films_year = pkl.load(file) # length 351,842 for 2013

In [285]:
# DESIRED YEAR
year = 2000

## Make films: id->name dict (run once)

In [None]:
# Map all ids to films
films = {}

for index, row in df_title_basics.iterrows():
    title_id = str(row['tconst'])
    title_name = str(row['originalTitle'])
    films[title_id] = title_name

with open("data/films.pkl", 'wb') as file:
    pkl.dump(films, file)

## Make people: id->name dict (run once)

In [None]:
# Map all ids to people
people = {}

for index, row in df_name_basics.iterrows():
    name_id = str(row['nconst'])
    name = str(row['primaryName'])
    people[name_id] = name

with open("data/people.pkl", 'wb') as file:
    pkl.dump(people, file)

## Get people associated with each film (run once)

In [None]:
# Get people names for each film
people_per_film = df_title_principals.groupby("tconst")["nconst"].agg(list).to_dict()
people_per_film

# Dump dictionary
with open("data/peopleids_principals.pkl", 'wb') as file:
    pkl.dump(people_per_film, file)

In [None]:
people_per_film

## Read datasets and set up datasets

In [286]:
# Convert year columns of mixed data into numeric
df_title_basics.loc[df_title_basics["startYear"] == "\\N", "startYear"] = 0
df_title_basics.loc[df_title_basics["endYear"] == "\\N", "endYear"] = 0

df_title_basics["startYear"] = pd.to_numeric(df_title_basics["startYear"])
df_title_basics["endYear"] = pd.to_numeric(df_title_basics["endYear"])

In [287]:
# Extract desired year films
df_title_basics_onlyyear = df_title_basics[df_title_basics['startYear'] == year]
df_title_basics_beforeyear = df_title_basics[df_title_basics['startYear'] < year]
df_title_basics_withinyear = df_title_basics_beforeyear[df_title_basics_beforeyear['endYear'] >= year]

# Print lengths
print(len(df_title_basics_onlyyear))
print(len(df_title_basics_beforeyear))
print(len(df_title_basics_withinyear))

# Combine all films and TV shows in desired year
df_title_basics_year = pd.concat([df_title_basics_onlyyear, df_title_basics_withinyear])

with open("data/df_title_basics_"+str(year)+".pkl", 'wb') as file:
    pkl.dump(df_title_basics_year, file)

print(len(df_title_basics_year)) # 351842

98137
3425998
3750
101887


In [288]:
df_title_basics_year.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
15176,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,0,60,\N
100065,tt0102362,movie,Istota,Istota,0,2000,0,80,"Drama,Romance"
108549,tt0111056,tvSeries,Gensomaden Saiyuki,Gensomaden Saiyuki,0,2000,2001,23,"Action,Adventure,Animation"
109004,tt0111522,video,Twister: A Musical Catastrophe,Twister: A Musical Catastrophe,0,2000,0,128,"Action,Comedy"
110465,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,0,86,"Musical,Romance"
110522,tt0113086,movie,Florentino y el diablo,Florentino y el diablo,0,2000,0,\N,Drama
110528,tt0113092,movie,For the Cause,For the Cause,0,2000,0,100,"Action,Adventure,Drama"
111154,tt0113742,short,Making Change,Making Change,0,2000,0,18,Short
113213,tt0115874,short,Chateaubriand - Cabeça de Paraíba,Chateaubriand - Cabeça de Paraíba,0,2000,0,15,"Drama,Short"
113273,tt0115937,movie,Consequence,Consequence,0,2000,0,91,Drama


In [289]:
df_title_principals.head(10)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
5,tt0000003,1,nm0721526,director,\N,\N
6,tt0000003,2,nm1770680,producer,producer,\N
7,tt0000003,3,nm1335271,composer,\N,\N
8,tt0000003,4,nm5442200,editor,\N,\N
9,tt0000004,1,nm0721526,director,\N,\N


## Make films, people dict for given year

### Load datasets

In [290]:
print(type(all_people_per_film))
print(type(all_films))
print(type(all_people))
print(type(films_year))

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>


In [291]:
print(len(all_people_per_film))
print(len(all_films))
print(len(all_people))
print(len(films_year))

9260061
10225586
12915563
372627


### Get films for desired year

In [292]:
films_people_year = {"Films": [], "People": []}

# Get film names
films_titles_year = list(set(films_year['originalTitle'])) 
films_people_year["Films"] = films_titles_year

len(films_people_year["Films"]) # 224453 items

239800

### Get people for desired year

In [293]:
people_names_year = []

for t_id in set(films_year["tconst"]):
    try:
        people_names_year.extend([all_people[p_id] for p_id in all_people_per_film[t_id]])
    except:
        continue

films_people_year["People"] = people_names_year

print(len(people_names_year)) # 1933615 people

2032713


In [294]:
with open("data/films_people_"+str(year)+".pkl", 'wb') as file:
    pkl.dump(films_people_year, file)

In [295]:
films_people_year["Films"]

['Super Smash Bros. Dev Lore in a minute!',
 'SheWolf V: The Dark Gift',
 'Return to Yesterday',
 'After the Storm',
 'Dva mjeseca nakon poplava',
 'End of Empire',
 'Disco Nap',
 'Snail or the Boy with Blue Converse',
 'S5, Ep 94',
 'Meet the Pegusus',
 'Pathways 5',
 'Feng-Shui',
 'Grounded',
 'Cennetin düsüsü',
 'Les Lions de Cannes 2014: Le meilleur de la pub',
 'Finding Forever in Love',
 'Girl Friend (Kari)',
 'Carolas dag',
 'Oka Romantic Crime Katha',
 'Jeune',
 'Noam Chomsky, How to Ruin an Economy, Some Simple Ways: A Progressive Voice',
 'The Final Illusion',
 'El Colado',
 'Blowout',
 'Southern Hospitality 3',
 'The Mystery of Our Oceans',
 'Alexandria',
 'Eager to Learn',
 'Mountain State Surprise',
 "L'annonce",
 'Negative Energy',
 'SHIMMER Volume 63',
 'Tonari no okusama wa J-cup chônyû niku-kan tsuma: Terashima Shiho',
 'Rapid Eye Movement',
 'Modern Infatuation',
 'Jacques Pepin and the Ultimate Comfort Food: The Egg',
 "No, You Can't Sit with Us",
 'Onvoltooid verled

In [296]:
films_people_year["People"]

['Todd Helbing',
 'Grant Gustin',
 'Candice Patton',
 'Danielle Panabaker',
 'Rick Cosnett',
 'Dermott Downs',
 'Greg Berlanti',
 'Andrew Kreisberg',
 'Geoff Johns',
 'Aaron Helbing',
 'Roger Craig Smith',
 'Nika Futterman',
 'Johannes Oliver Hamm',
 "Colleen O'Shaughnessey",
 'Mike Pollock',
 'Natalys Raut Sieuzac',
 'Alan Denton',
 'Greg Hahn',
 'Tom Pugsley',
 'Cindy Robinson',
 'Bill Woods',
 'Dick Johnson',
 'John French',
 'John Bowe',
 'Nathan Prendergast',
 'Aaron Noonan',
 'Maddox Foster',
 'Levi Ponce',
 'Juan Carlos Saizarbitoria',
 'Mark Dedaj',
 'Amit Poznansky',
 'Sung-jae Choi',
 'Yoon Sang-Hyun',
 'Min-Jung Kim',
 'Sung Dong-il',
 'Lee Joon',
 'Soo-Won Jo',
 'Eum-mi Kwon',
 'Sung-Won Cho',
 'Kim Ji-won',
 'Min Choi',
 'Jazz Twemlow',
 'David Ferrier',
 'Tom Glasson',
 'Mark Humphries',
 'Kara Jensen-Mackinnon',
 'Nich Richardson',
 'Seaton Kay-Smith',
 'Alex Lee',
 'Sean Maguire',
 'Clarke Richards',
 'Jacob Gibbins',
 'Curtis Keene',
 'Justin Loiselle',
 'TJ Walker',
 