In [58]:
import pandas as pd
from data_structures.Entity import Entity
import pickle as pkl

## Read datasets and extract for desired year

In [3]:
# Read datasets
df_title_principals = pd.read_csv('data/title.principals.tsv', sep='\t')
df_title_basics = pd.read_csv('data/title.basics.tsv', sep='\t')
df_name_basics = pd.read_csv('data/name.basics.tsv', sep='\t')

  df_title_basics = pd.read_csv('data/title.basics.tsv', sep='\t')


In [59]:
# Convert year columns of mixed data into numeric
df_title_basics.loc[df_title_basics["startYear"] == "\\N", "startYear"] = 0
df_title_basics.loc[df_title_basics["endYear"] == "\\N", "endYear"] = 0

df_title_basics["startYear"] = pd.to_numeric(df_title_basics["startYear"])
df_title_basics["endYear"] = pd.to_numeric(df_title_basics["endYear"])

In [60]:
# Desired year
year = 2013

# Extract desired year films
df_title_basics_only2013 = df_title_basics[df_title_basics['startYear'] == year]
df_title_basics_before2013 = df_title_basics[df_title_basics['startYear'] < 2013]
df_title_basics_within2013 = df_title_basics_before2013[df_title_basics_before2013['endYear'] >= 2013]

# Print lengths
print(len(df_title_basics_only2013))
print(len(df_title_basics_before2013))
print(len(df_title_basics_within2013))

# Combine all films and TV shows in desired year
df_title_basics_2013 = pd.concat([df_title_basics_only2013, df_title_basics_within2013])

with open("data/df_title_basics_2013.pkl", 'wb') as file:
    pkl.dump(df_title_basics_2013, file)

print(len(df_title_basics_2013)) # 351842

346603
5830980
5239
351842


In [61]:
df_title_basics_2013.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
105819,tt0108270,short,Symphony 92.4 FM,Symphony 92.4 FM,0,2013,0,\N,Short
142619,tt0147076,short,Myszka i kotek,Myszka i kotek,0,2013,0,8,"Animation,Short"
154353,tt0159369,movie,Cooper and Hemingway: The True Gen,Cooper and Hemingway: The True Gen,0,2013,0,180,Documentary
219265,tt0228933,movie,That's Sexploitation!,That's Sexploitation!,0,2013,0,136,Documentary
244952,tt0255820,movie,Return to Babylon,Return to Babylon,0,2013,0,75,"Biography,Comedy,Drama"
291552,tt0304681,movie,The Secret Safari,The Secret Safari,0,2013,0,52,Documentary
339343,tt0354331,tvSeries,STAR News,STAR News,0,2013,0,90,News
340116,tt0355131,tvSeries,Saturday AFL,Saturday AFL,0,2013,0,\N,\N
344830,tt0359950,movie,The Secret Life of Walter Mitty,The Secret Life of Walter Mitty,0,2013,0,114,"Adventure,Comedy,Drama"
353703,tt0369168,tvSeries,SportsDesk,SportsDesk,0,2013,0,30,"Family,Sport"


In [62]:
df_title_principals.head(10)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
5,tt0000003,1,nm0721526,director,\N,\N
6,tt0000003,2,nm1770680,producer,producer,\N
7,tt0000003,3,nm1335271,composer,\N,\N
8,tt0000003,4,nm5442200,editor,\N,\N
9,tt0000004,1,nm0721526,director,\N,\N


## Make films: id->name dict

In [None]:
# Map all ids to films
films = {}

for index, row in df_title_basics.iterrows():
    title_id = str(row['tconst'])
    title_name = str(row['originalTitle'])
    films[title_id] = title_name

with open("data/films.pkl", 'wb') as file:
    pkl.dump(films, file)

## Make people: id->name dict

In [5]:
# Map all ids to people
people = {}

for index, row in df_name_basics.iterrows():
    name_id = str(row['nconst'])
    name = str(row['primaryName'])
    people[name_id] = name

with open("data/people.pkl", 'wb') as file:
    pkl.dump(people, file)

## Make films, people dict for given year

### Get people associated with each film

In [None]:
# Get people names for each film
people_per_film = df_title_principals.groupby("tconst")["nconst"].agg(list).to_dict()
people_per_film

# Dump dictionary
with open("data/peopleids_principals.pkl", 'wb') as file:
    pkl.dump(people_per_film, file)

In [69]:
people_per_film

{'tt0000001': ['nm1588970', 'nm0005690', 'nm0374658'],
 'tt0000002': ['nm0721526', 'nm1335271'],
 'tt0000003': ['nm0721526', 'nm1770680', 'nm1335271', 'nm5442200'],
 'tt0000004': ['nm0721526', 'nm1335271'],
 'tt0000005': ['nm0443482', 'nm0653042', 'nm0005690', 'nm0249379'],
 'tt0000006': ['nm0005690'],
 'tt0000007': ['nm0179163',
  'nm0183947',
  'nm0005690',
  'nm0374658',
  'nm0249379'],
 'tt0000008': ['nm0653028', 'nm0005690', 'nm0374658'],
 'tt0000009': ['nm0063086', 'nm0183823', 'nm1309758', 'nm0085156'],
 'tt0000010': ['nm0525910'],
 'tt0000011': ['nm3692297', 'nm0804434'],
 'tt0000012': ['nm2880396',
  'nm9735580',
  'nm0525900',
  'nm9735581',
  'nm0525908',
  'nm0525910',
  'nm9735579',
  'nm9653419'],
 'tt0000013': ['nm0525908', 'nm1715062', 'nm0525910'],
 'tt0000014': ['nm0166380', 'nm0244989', 'nm0525910'],
 'tt0000015': ['nm0721526'],
 'tt0000016': ['nm0525900', 'nm9735581', 'nm0525910'],
 'tt0000017': ['nm3691272', 'nm3692829', 'nm1587194', 'nm0804434'],
 'tt0000018': ['n

### Load datasets

In [70]:
# Load saved datasets
with open("data/peopleids_principals.pkl", "rb") as file:
    all_people_per_film = pkl.load(file) # length 9,260,061

with open("data/films.pkl", "rb") as file:
    all_films = pkl.load(file) # length 10,225,586

with open("data/people.pkl", "rb") as file:
    all_people = pkl.load(file) # length 12,915,563

with open("data/df_title_basics_2013.pkl", "rb") as file:
    films_2013 = pkl.load(file) # length 351,842 for 2013

In [71]:
print(type(all_people_per_film))
print(type(all_films))
print(type(all_people))
print(type(films_2013))

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>


In [72]:
print(len(all_people_per_film))
print(len(all_films))
print(len(all_people))
print(len(films_2013))

9260061
10225586
12915563
351842


### Get films for desired year

In [74]:
films_people_2013 = {"Films": [], "People": []}

# Get film names
films_titles_2013 = list(set(films_2013['originalTitle'])) 
films_people_2013["Films"] = films_titles_2013

len(films_people_2013["Films"]) # 224453 items

224453

### Get people for desired year

In [87]:
people_names_2013 = []

for t_id in set(films_2013["tconst"]):
    try:
        people_names_2013.extend([all_people[p_id] for p_id in all_people_per_film[t_id]])
    except:
        continue

films_people_2013["People"] = people_names_2013

print(len(people_names_2013)) # 1933615 people

1933615


In [91]:
with open("data/films_people_2013.pkl", 'wb') as file:
    pkl.dump(films_people_2013, file)

In [90]:
films_people_2013["Films"]

['Atarashii Bu o Kessei Shite Shuraba',
 'Episode #19.28',
 'The Slow Mo Show',
 'Stickman',
 'Árshátíðarskaup Réttó 2013',
 'The Yard',
 'Übersteht Julia die Audition?',
 'Episode #1.3654',
 'Bellator MMA 97',
 'Live Results Show 6',
 'New Year Mama Drama',
 'Gefüllter Fisch',
 'Book Club',
 'Aflevering 4831',
 'Párty hrad na Craftcone!!! (Noc 2)',
 'New York Comic Con 2013: Extended Cut',
 'Your Ex Wife Took My Stuff & Vegas or Busted',
 'Erik Brandt',
 'Eight Games, One Memory',
 'Rock & Rock Couple',
 'Pidiendo explicaciones',
 "Mary Shelley's Frankenhole",
 'Day Zero 2',
 'Not Stanley',
 'Down to the wire',
 'Buying for Billionaires',
 'Na Ta Din Bhayo',
 'The Mommy Show',
 "Trilussa - Storia d'amore e di poesia",
 'Wrestled to the floor and cruelly bound and gagged',
 'The Muffin Man - Day 4',
 'Die Insider',
 'Victorian Villains',
 'Death of a Hero',
 'Midnight Special',
 'Ballerina Princess',
 'Potato Chips',
 'Favor for a friend',
 'Zaman Kimseyi Beklemez',
 'Zhong kou wei',
 

## Obsolete

In [6]:
count = 0
for index, row in df_title_basics_certainyear.iterrows(): #
    # Get attributes, mainly for the year
    title_name = films[str(row['tconst'])]
    title_startyear = str(row['startYear'])
    title_endyear = str(row['endYear'])

    # Distinguish TV series from films
    if title_endyear != "\\N": # TV series
        years_to_add = [str(i) for i in range(int(title_startyear), int(title_endyear)+1)]
    elif title_startyear == "\\N": # unknown
        continue
    else: # film
        years_to_add = [title_startyear]

    # Get people in film/series
    df_title_people_ids = df_title_principals[df_title_principals['tconst'] == title_id]
    people_names = []

    for index_people, row_people in df_title_people_ids.iterrows():
        person_name_lst = df_name_basics[df_name_basics['nconst'] == row_people['nconst']]['primaryName'].tolist()
        people_names.append(Entity(person_name_lst[0]))
    
    #print("PEOPLE NAMES: ", people_names)

    # for each year in years_to_add, add those people and the film name
    for year in years_to_add:
        film_entity = Entity(title_name)
        items_for_year["Films"].append(film_entity)
        items_for_year["People"].extend(people_names)

        #print(year, years[year])
    #print(count)
    count += 1

with open("data/"+str(year)+".pkl", 'wb') as file:
    pkl.dump(items_for_year, file)
