In [71]:
!pip install ratelimit



In [0]:
import pandas as pd
import requests
import json
import numpy as np
import networkx as nx
from operator import itemgetter
from ratelimit import limits, sleep_and_retry

In [0]:
api_key = ''
api_url = 'https://api.themoviedb.org/3/'
SearchPeople = pd.DataFrame.from_dict({"data":[]})
SearchMovies = pd.DataFrame.from_dict({"data":[]})

IDPeople = pd.DataFrame.from_dict({"data":[]})
IDMovies = pd.DataFrame.from_dict({"data":[]})

MovieGenres = [] 

In [0]:
# The official limit is 
# 40 request per 10 seconds

TEN_SECONDS = 1
REQUESTS = 3

@sleep_and_retry
@limits(calls=REQUESTS, period=TEN_SECONDS)
def call_api(url):
    response = requests.get(url)
    if response.status_code == 429:
        raise Exception('API response: {}'.format(response.status_code))
    return response
  
MovieGenres = call_api(api_url+"genre/movie/list?api_key="+api_key).json()['genres']

In [75]:
MovieGenres

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 80, 'name': 'Crime'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 18, 'name': 'Drama'},
 {'id': 10751, 'name': 'Family'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 36, 'name': 'History'},
 {'id': 27, 'name': 'Horror'},
 {'id': 10402, 'name': 'Music'},
 {'id': 9648, 'name': 'Mystery'},
 {'id': 10749, 'name': 'Romance'},
 {'id': 878, 'name': 'Science Fiction'},
 {'id': 10770, 'name': 'TV Movie'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 10752, 'name': 'War'},
 {'id': 37, 'name': 'Western'}]

In [0]:
# Search a person
def searchPerson(person):
  global SearchPeople
  try:
    return SearchPeople.loc[person,"data"]
  except: 
    url = api_url+"search/person?api_key="+api_key+"&query=\""+person+"\""
    response = call_api(url)
    data = response.json()
    SearchPeople = SearchPeople.append(pd.Series({'data':data},name=person))
    return data
  
# Search a movie
def searchMovie(movie):
  global SearchMovies
  try:
    return SearchMovies.loc[movie,"data"]
  except:
    page = 1
    data = {}
    url = api_url+"search/movie?api_key="+api_key+"&query=\""+movie+"\"&page="+str(page)
    response = call_api(url).json()
    data[page] = response["results"]
    total_pages = response["total_pages"]
    page += 1
    
    while(page <= total_pages):
      url = api_url+"search/movie?api_key="+api_key+"&query=\""+movie+"\"&page="+str(page)
      response = call_api(url).json()
      data[page] = response["results"]
      page += 1
      
    
    results = [result for page in data.values() for result in page]
    SearchMovies = SearchMovies.append(pd.Series({'data':results},name=movie))
    return results
  
# Search a movie on a year
def searchMovie(movie, year):
  global SearchMovies
  try:
    return SearchMovies.loc[movie+"_"+str(year),"data"]
  except:
    page = 1
    data = {}
    url = api_url+"search/movie?api_key="+api_key+"&query=\""+movie+"\"&page="+str(page)+"&year="+str(year)
    response = call_api(url).json()
    data[page] = response["results"]
    total_pages = response["total_pages"]
    page += 1
    
    while(page <= total_pages):
      url = api_url+"search/movie?api_key="+api_key+"&query=\""+movie+"\"&page="+str(page)+"&year="+str(year)
      response = call_api(url).json()
      data[page] = response["results"]
      page += 1
      
    
    results = [result for page in data.values() for result in page]
    SearchMovies = SearchMovies.append(pd.Series({'data':results},name=movie))
    return results

In [0]:
# Person and movie credits
def getPerson(person_id):
  global IDPeople
  try:
    return IDPeople.loc[person_id,"data"]
  except: 
    url = api_url+"person/"+str(person_id)+"?api_key="+api_key+"&append_to_response=movie_credits"
    response = call_api(url)
    data = response.json()
    IDPeople = IDPeople.append(pd.Series({'data':data},name=person_id))
    return data

# Movie and credits
def getMovie(movie_id):
  global IDMovies
  try:
    return IDMovies.loc[movie_id,"data"]
  except: 
    url = api_url+"movie/"+str(movie_id)+"?api_key="+api_key+"&append_to_response=credits"
    response = call_api(url)
    data = response.json()
    IDMovies = IDMovies.append(pd.Series({'data':data},name=movie_id))
    return data

In [0]:
# Get the genre object from the genre_id
def getGenre(genre_id):
  return [genre for genre in MovieGenres if genre["id"] == genre_id][0]

# get the movies with a genre from a list of movies
def getMoviesWithGenre(genre_id, movies):
  return [movie for movie in movies if getGenre(genre_id) in movie["genres"]]

In [0]:
# List of person with a job in a crew
def getJobFrom(job, crew):
  return [value for value in crew if value["job"] == job]

# List of works of someone in a crew or cast
def getNameFrom(name, crew):
  return [value for value in crew if value["name"] == name]

# Crew of a movie
def getCrew(movie_id):
  movie = getMovie(movie_id)
  try:
    return movie["credits"]["crew"]
  except:
    return movie["status_message"]  
  
# Cast of a movie
def getCast(movie_id):
  movie = getMovie(movie_id)
  try:
    return movie["credits"]["cast"]
  except:
    return movie["status_message"]

In [0]:
# Get the roles of someone in a list of movies
def getRoles(name, movies):
  return [(movie["title"],cast["character"]) for movie in movies for cast in movie["credits"]["cast"] if cast["name"] == name] 

# Get the people that has worked as a character in a list of movies
def getCharacter(char, movies):
  return [(movie["title"],cast["name"], cast["character"]) for movie in movies for cast in movie["credits"]["cast"] if char in cast["character"]] 

# Get the movies that has worked as a character in a list of movies
def getCharacter(char, movies):
  return [(movie["title"],cast["name"], cast["character"]) for movie in movies for cast in movie["credits"]["cast"] if char in cast["character"]] 

In [0]:
SEED = ("Quentin Tarantino", searchPerson("Quentin Tarantino")["results"][0]["id"])
todo_lst = [(0, SEED)] # The SEED is in the layer 0
todo_set = set(SEED) # The SEED itself
done_set = set() # Nothing is done yet

In [0]:
g = nx.Graph()
VALUE = 5.0
layer, person = todo_lst[0]

In [96]:
%%time
while layer < 2:
  # Remove the name page of the current page from the todo_lst, 
  # and add it to the set of processed pages. 
  # If the script encounters this page again, it will skip over it.
  del todo_lst[0]
  done_set.add(person)
  
  # Show progress
  print(layer, person) 
  
  try:
    person_data = getPerson(person[1])
    if(person_data["popularity"] < VALUE):
      try:
        g.remove_node(person)
      except:
        print("not in the graph yet")
      print("popularity under " + str(VALUE) + " skiping...")
      layer, person = todo_lst[0]
      continue
          
    g.add_node(person, nacionality = person_data["place_of_birth"].split(sep=',')[-1][1:])
  except:
    print("Could not load", person)
    layer, person = todo_lst[0]
    continue
    
  for movie in getJobFrom("Director", person_data["movie_credits"]["crew"]):
    movie = getMovie(movie["id"])
    credits = movie["credits"]
    for actor in credits["cast"]:
      pair = (actor["name"],actor["id"])
      #g.add_node(pair, nacionality = actor["place_of_birth"].split(sep=',')[-1][1:])
      if pair not in todo_set and pair not in done_set:
        todo_lst.append((layer + 1, pair))
        todo_set.add(pair)
      g.add_edge(person, pair, relation="director-actor")
    
  
  for movie in person_data["movie_credits"]["cast"]:
    movie = getMovie(movie["id"])
    credits = movie["credits"]
    for actor in credits["cast"]:
      pair = (actor["name"],actor["id"])
      #g.add_node(pair, nacionality = actor["place_of_birth"].split(sep=',')[-1][1:])
      if pair not in todo_set and pair not in done_set:
        todo_lst.append((layer + 1, pair))
        todo_set.add(pair)
      g.add_edge(person, pair, relation="actor-actor")
    directors_of_film = getJobFrom("Director",credits["crew"])
    for actor in directors_of_film:
      pair = (actor["name"],actor["id"])
      #g.add_node(pair, nacionality = actor["place_of_birth"].split(sep=',')[-1][1:], movies=set())
      if pair not in todo_set and pair not in done_set:
        todo_lst.append((layer + 1, pair))
        todo_set.add(pair)
      g.add_edge(person, pair, relation="director-actor")  
    
  layer, person = todo_lst[0]

0 ('Quentin Tarantino', 138)
1 ('Tim Roth', 3129)
1 ('Antonio Banderas', 3131)
1 ('Jennifer Beals', 3130)
1 ('Madonna', 3125)
popularity under 5.0 skiping...
1 ('Marisa Tomei', 3141)
1 ('Bruce Willis', 62)
1 ('Sammi Davis', 3122)
popularity under 5.0 skiping...
1 ('Amanda de Cadenet', 3123)
popularity under 5.0 skiping...
1 ('Valeria Golino', 3124)
popularity under 5.0 skiping...
1 ('Lili Taylor', 3127)
popularity under 5.0 skiping...
1 ('Lawrence Bender', 2545)
popularity under 5.0 skiping...
1 ('Ione Skye', 3126)
popularity under 5.0 skiping...
1 ('Alicia Witt', 3128)
popularity under 5.0 skiping...
1 ('David Proval', 2555)
popularity under 5.0 skiping...
1 ('Lana McKissack', 3132)
popularity under 5.0 skiping...
1 ('Tamlyn Tomita', 3134)
popularity under 5.0 skiping...
1 ('Kathy Griffin', 3138)
popularity under 5.0 skiping...
1 ('Paul Calderon', 3137)
popularity under 5.0 skiping...
1 ('Salma Hayek', 3136)
1 ('Patricia Vonne', 3133)
popularity under 5.0 skiping...
1 ('Danny Verduzco

In [97]:
# filter nodes with degree greater than or equal to 2
core = [node for node, deg in dict(g.degree()).items() if deg >= 2]

# select a subgraph with 'core' nodes
gsub = nx.subgraph(g, core)

print("{} nodes, {} edges".format(len(gsub), nx.number_of_edges(gsub)))

nx.write_graphml(gsub, SEED[0]+".graphml")

22821 nodes, 88459 edges


In [100]:
top_degree = sorted(dict(gsub.degree()).items(),
                      reverse=True, key=itemgetter(1))[:100]
print("\n".join(map(lambda t: "{} {}".format(*reversed(t)), top_degree)))

2651 ('Samuel L. Jackson', 2231)
2405 ('Robert De Niro', 380)
2112 ('Bruce Willis', 62)
1970 ('Steve Buscemi', 884)
1908 ('Martin Scorsese', 1032)
1780 ('Christopher Walken', 4690)
1621 ('Steven Spielberg', 488)
1616 ('Harvey Keitel', 1037)
1594 ('Brad Pitt', 287)
1541 ('Nicolas Cage', 2963)
1465 ('Sylvester Stallone', 16483)
1460 ('Owen Wilson', 887)
1427 ('Bill Hader', 19278)
1409 ('Adam Sandler', 19292)
1396 ('Josh Brolin', 16851)
1386 ('Matthew McConaughey', 10297)
1357 ('Marisa Tomei', 3141)
1354 ('John Travolta', 8891)
1309 ('Ving Rhames', 10182)
1282 ('Clint Eastwood', 190)
1264 ('Al Pacino', 1158)
1242 ('Edward Norton', 819)
1224 ('Ben Affleck', 880)
1174 ('Jeff Bridges', 1229)
1166 ('Michael Keaton', 2232)
1163 ('Robert Forster', 5694)
1158 ('Rob Schneider', 60949)
1144 ('Sharon Stone', 4430)
1142 ('Simon Pegg', 11108)
1124 ('Jonah Hill', 21007)
1120 ('Jackie Chan', 18897)
1106 ('Bruce Dern', 6905)
1105 ('Gina Gershon', 11150)
1105 ('George Clooney', 1461)
1100 ('Rosario Dawso