# Creating the graphml files

Here we fetch the data needed to build the graphs of the network of actors and directors, and save those graphs as GraphML files.

In [None]:
!pip install ratelimit

In [None]:
import pandas as pd
import requests
import json
import numpy as np
import networkx as nx
from operator import itemgetter
from ratelimit import limits, sleep_and_retry
import re

In [None]:
# TMDB API key: paste your own key between the quotes.
api_key = ''

# Base URL that every request in this notebook is built on.
api_url = 'https://api.themoviedb.org/3/'

# In-memory caches. Each frame has a single "data" column that holds one
# JSON string per row:
#   SearchPeople / SearchMovies -> results of searches
#   IDPeople / IDMovies         -> details of people and movies
SearchPeople, SearchMovies, IDPeople, IDMovies = (
    pd.DataFrame.from_dict({"data": []}) for _ in range(4)
)

# If you saved your data in a previous session, load it instead:
# IDPeople = pd.read_csv("IDPeople.csv", index_col=0)
# IDMovies = pd.read_csv("IDMovies.csv", index_col=0)

# Placeholder for the list of movie genres.
MovieGenres = []

In [None]:
# The official limit is 40 requests per 10 seconds; the values below
# allow 3 requests per second, which stays under that limit.
TIME_IN_SECONDS = 1
REQUESTS = 3


@sleep_and_retry
@limits(calls=REQUESTS, period=TIME_IN_SECONDS)
def call_api(url, timeout=30):
    """Perform a rate-limited GET request.

    The ``ratelimit`` decorators cap the call rate at ``REQUESTS`` calls per
    ``TIME_IN_SECONDS`` seconds and make the caller wait when the cap is hit.

    Args:
        url: Complete URL to request, including the query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The ``requests.Response`` object; the status code is not checked
        apart from 429.

    Raises:
        Exception: If the server answers 429 (too many requests).
        requests.RequestException: On network errors, including timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 429:
        raise Exception('API response: {}'.format(response.status_code))
    return response
  
# Download the movie genre list once; getGenre() looks entries up by their "id" key.
MovieGenres = call_api(api_url+"genre/movie/list?api_key="+api_key).json()['genres']

In [None]:
# Show the genre list (notebook cell output).
MovieGenres

In [None]:
def searchPerson(person):
    """Search for people by name, caching the raw response.

    Args:
        person: Name to look for. It is sent as a quoted phrase.

    Returns:
        The decoded JSON body of the ``search/person`` endpoint. No page
        parameter is sent, so only the default page is returned.

    Side effects:
        On a cache miss the response is stored in the global
        ``SearchPeople`` frame under the index ``person``.
    """
    global SearchPeople
    from urllib.parse import quote

    try:
        return json.loads(SearchPeople.loc[person, "data"])
    except (KeyError, TypeError, ValueError):
        # Not cached yet, or the cached entry cannot be decoded: ask the API.
        pass

    # Percent-encode the phrase so that names containing '&', '#' or '+'
    # cannot corrupt the query string.
    query = quote('"' + person + '"', safe='')
    url = api_url + "search/person?api_key=" + api_key + "&query=" + query
    data = call_api(url).json()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement that works on old and new versions alike.
    entry = pd.DataFrame({"data": [json.dumps(data)]}, index=[person])
    SearchPeople = pd.concat([SearchPeople, entry])
    return data
    
def searchMovie(movie):
    """Search for movies by title and return the results of every page.

    Args:
        movie: Title to look for. It is sent as a quoted phrase.

    Returns:
        A flat list with the ``results`` entries of all result pages, in
        page order.

    Side effects:
        On a cache miss the list is stored in the global ``SearchMovies``
        frame under the index ``movie``.

    Raises:
        KeyError: If a response lacks ``results`` / ``total_pages``
            (presumably an API error payload).
    """
    global SearchMovies
    from urllib.parse import quote

    try:
        return json.loads(SearchMovies.loc[movie, "data"])
    except (KeyError, TypeError, ValueError):
        # Not cached yet, or the cached entry cannot be decoded: ask the API.
        pass

    # Percent-encode the phrase so that titles containing '&', '#' or '+'
    # cannot corrupt the query string.
    query = quote('"' + movie + '"', safe='')
    base_url = api_url + "search/movie?api_key=" + api_key + "&query=" + query

    def fetch(page):
        return call_api(base_url + "&page=" + str(page)).json()

    # The first page tells how many pages there are in total.
    first = fetch(1)
    results = list(first["results"])
    for page in range(2, first["total_pages"] + 1):
        results.extend(fetch(page)["results"])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement that works on old and new versions alike.
    entry = pd.DataFrame({"data": [json.dumps(results)]}, index=[movie])
    SearchMovies = pd.concat([SearchMovies, entry])
    return results
    
def searchMovieOnYear(movie, year):
    """Search for movies by title restricted to a year, across all pages.

    Args:
        movie: Title to look for. It is sent as a quoted phrase.
        year: Value of the ``year`` query parameter.

    Returns:
        A flat list with the ``results`` entries of all result pages, in
        page order.

    Side effects:
        On a cache miss the list is stored in the global ``SearchMovies``
        frame under the index ``"<movie>_<year>"``. This is the same key the
        lookup uses, so the entry is found again and never mixed up with the
        results of a plain title search.

    Raises:
        KeyError: If a response lacks ``results`` / ``total_pages``
            (presumably an API error payload).
    """
    global SearchMovies
    from urllib.parse import quote

    cache_key = movie + "_" + str(year)
    try:
        return json.loads(SearchMovies.loc[cache_key, "data"])
    except (KeyError, TypeError, ValueError):
        # Not cached yet, or the cached entry cannot be decoded: ask the API.
        pass

    # Percent-encode the phrase so that titles containing '&', '#' or '+'
    # cannot corrupt the query string.
    query = quote('"' + movie + '"', safe='')
    base_url = api_url + "search/movie?api_key=" + api_key + "&query=" + query

    def fetch(page):
        url = base_url + "&page=" + str(page) + "&year=" + str(year)
        return call_api(url).json()

    # The first page tells how many pages there are in total.
    first = fetch(1)
    results = list(first["results"])
    for page in range(2, first["total_pages"] + 1):
        results.extend(fetch(page)["results"])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement that works on old and new versions alike.
    entry = pd.DataFrame({"data": [json.dumps(results)]}, index=[cache_key])
    SearchMovies = pd.concat([SearchMovies, entry])
    return results

In [None]:
def getPerson(person_id):
    """Return the details of a person together with their movie credits.

    Args:
        person_id: Identifier of the person in the API.

    Returns:
        The decoded JSON body of ``person/<id>`` requested with
        ``append_to_response=movie_credits``. The body is returned and
        cached whatever the HTTP status, so it may be an error payload.

    Side effects:
        On a cache miss the response is stored in the global ``IDPeople``
        frame under the index ``person_id``.
    """
    global IDPeople
    try:
        return json.loads(IDPeople.loc[person_id, "data"])
    except (KeyError, TypeError, ValueError):
        # Not cached yet, or the cached entry cannot be decoded: ask the API.
        pass

    url = api_url + "person/" + str(person_id) + "?api_key=" + api_key + "&append_to_response=movie_credits"
    data = call_api(url).json()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement that works on old and new versions alike.
    entry = pd.DataFrame({"data": [json.dumps(data)]}, index=[person_id])
    IDPeople = pd.concat([IDPeople, entry])
    return data

def getMovie(movie_id):
    """Return the details of a movie together with its credits.

    Args:
        movie_id: Identifier of the movie in the API.

    Returns:
        The decoded JSON body of ``movie/<id>`` requested with
        ``append_to_response=credits``. The body is returned and cached
        whatever the HTTP status, so it may be an error payload.

    Side effects:
        On a cache miss the response is stored in the global ``IDMovies``
        frame under the index ``movie_id``.
    """
    global IDMovies
    try:
        return json.loads(IDMovies.loc[movie_id, "data"])
    except (KeyError, TypeError, ValueError):
        # Not cached yet, or the cached entry cannot be decoded: ask the API.
        pass

    url = api_url + "movie/" + str(movie_id) + "?api_key=" + api_key + "&append_to_response=credits"
    data = call_api(url).json()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement that works on old and new versions alike.
    entry = pd.DataFrame({"data": [json.dumps(data)]}, index=[movie_id])
    IDMovies = pd.concat([IDMovies, entry])
    return data

In [None]:
def getGenre(genre_id):
    """Return the genre object from ``MovieGenres`` whose "id" is ``genre_id``.

    Raises:
        IndexError: If no genre has that id.
    """
    matches = list(filter(lambda entry: entry["id"] == genre_id, MovieGenres))
    return matches[0]

def getMoviesWithGenre(genre_id, movies):
    """Return the movies of ``movies`` whose "genres" list holds the given genre.

    The genre object is looked up with ``getGenre`` for every movie, so an
    unknown ``genre_id`` only raises when ``movies`` is not empty.
    """
    selected = []
    for candidate in movies:
        if getGenre(genre_id) in candidate["genres"]:
            selected.append(candidate)
    return selected

In [None]:
def getJobFrom(job, crew):
    """Return the entries of ``crew`` whose "job" equals ``job``, in order."""
    members = []
    for member in crew:
        if member["job"] == job:
            members.append(member)
    return members

def getNameFrom(name, crew):
    """Return the entries of a crew or cast list whose "name" equals ``name``."""
    return list(filter(lambda entry: entry["name"] == name, crew))

def getCrew(movie_id):
    """Return the crew list of a movie.

    Args:
        movie_id: Identifier passed on to ``getMovie``.

    Returns:
        ``movie["credits"]["crew"]`` when the movie has credits; otherwise
        the movie's "status_message" (presumably the API's error text).

    Raises:
        KeyError: If the movie has neither credits nor a status message.
    """
    movie = getMovie(movie_id)
    try:
        return movie["credits"]["crew"]
    except (KeyError, TypeError):
        # Only the lookup failures are handled; anything else (for example
        # an interrupt) is no longer swallowed.
        return movie["status_message"]
    
def getCast(movie_id):
    """Return the cast list of a movie.

    Args:
        movie_id: Identifier passed on to ``getMovie``.

    Returns:
        ``movie["credits"]["cast"]`` when the movie has credits; otherwise
        the movie's "status_message" (presumably the API's error text).

    Raises:
        KeyError: If the movie has neither credits nor a status message.
    """
    movie = getMovie(movie_id)
    try:
        return movie["credits"]["cast"]
    except (KeyError, TypeError):
        # Only the lookup failures are handled; anything else (for example
        # an interrupt) is no longer swallowed.
        return movie["status_message"]

In [None]:
def getRoles(name, movies):
    """Return ``(title, character)`` for every cast entry named ``name``.

    The movies are scanned in order, and within each movie the cast is
    scanned in order.
    """
    roles = []
    for movie in movies:
        for member in movie["credits"]["cast"]:
            if member["name"] == name:
                roles.append((movie["title"], member["character"]))
    return roles

def getCharacter(char, movies):
    """Find who played a character in a list of movies.

    Args:
        char: Text to look for inside each cast entry's "character".
        movies: Movie objects carrying "title" and "credits" -> "cast".

    Returns:
        A list of ``(title, actor_name, character)`` tuples, one for every
        cast entry whose character contains ``char``. Cast entries without a
        character (``None``) never match instead of raising ``TypeError``.
    """
    found = []
    for movie in movies:
        for member in movie["credits"]["cast"]:
            # A missing character is treated as an empty string.
            if char in (member["character"] or ""):
                found.append((movie["title"], member["name"], member["character"]))
    return found

In [None]:
# Spellings that are rewritten to a short country code, applied in order.
# Longer spellings come first so that a shorter one cannot match inside
# them ("U.S." inside "U.S.A.", "United States" inside
# "United States of America").
_NACIONALITY_ALIASES = (
    ("United States of America", "USA"),
    ("U.S.A.", "USA"),
    ("U.S.", "USA"),
    ("United States", "USA"),
    ("États-Unis", "USA"),
    ("Stati Uniti", "USA"),
    ("United Kingdom", "UK"),
    ("U.K.", "UK"),
)


def getNacionality(person_data):
    """Derive a country label from a person's "place_of_birth".

    The last component of the place (split on ", " or " - ") is taken as the
    country, known spellings are normalised, and a few place names are
    mapped to their country.

    Args:
        person_data: Mapping with a "place_of_birth" string.

    Returns:
        The country label, or "Unknown" when the place of birth is missing,
        not a string, or empty.
    """
    try:
        nacionality = re.split(', | - ', person_data["place_of_birth"])[-1].strip()
    except (KeyError, TypeError):
        return "Unknown"
    if not nacionality:
        return "Unknown"

    for spelling, code in _NACIONALITY_ALIASES:
        nacionality = nacionality.replace(spelling, code)

    if "California" in nacionality or nacionality == "CA":
        return "USA"
    # Checked before the " US" test below, which would otherwise also match
    # strings ending in " USSR".
    if "USSR" in nacionality:
        return "USSR"
    if " US" in nacionality:
        return "USA"
    if "New York" in nacionality or "Long Island" in nacionality or nacionality == "NY":
        return "USA"
    if "England" in nacionality:
        return "UK"
    return nacionality

In [None]:
def create_graph(SEED, LAYER=2):
    """Build the collaboration graph around a person, breadth-first.

    Starting from ``SEED`` (layer 0), every person in a layer below
    ``LAYER`` is expanded: they are linked to the cast of the movies they
    directed, and to the cast and directors of the movies they acted in.
    Linked people whose popularity is at least 5.0 are queued for the next
    layer.

    Args:
        SEED: ``(name, id)`` tuple of the starting person.
        LAYER: Number of layers to expand. The crawl stops at the first
            queued person whose layer reaches this value.

    Returns:
        An ``nx.Graph`` whose nodes are person names carrying a
        "nacionality" attribute and whose edges carry a "relation"
        attribute ("director-actor" or "actor-actor"). When two people are
        linked more than once, the last relation written wins.

    Notes:
        * The crawl also ends cleanly when the queue runs empty before
          ``LAYER`` is reached (this used to raise ``IndexError``).
        * A person is never linked to themselves, although they appear in
          the cast of their own movies.
        * Nodes are keyed by name, so two people with the same name share
          one node.
    """
    from collections import deque

    VALUE = 5.0  # minimum popularity for a person to be expanded

    queue = deque([(0, SEED)])  # (layer, (name, id)); the SEED is in layer 0
    todo_set = {SEED}           # every pair that has ever been queued
    done_set = set()            # pairs already taken from the queue
    g = nx.Graph()

    def connect(source, layer, members, relation):
        """Link ``source`` to each member and queue the popular new ones."""
        for member in members:
            pair = (member["name"], member["id"])
            data = getPerson(member["id"])
            # A payload without popularity counts as not popular.
            is_popular = (data.get("popularity") or 0) >= VALUE
            if is_popular and pair not in todo_set and pair not in done_set:
                queue.append((layer + 1, pair))
                todo_set.add(pair)
            g.add_node(pair[0], nacionality=getNacionality(data))
            if pair != source:
                g.add_edge(source[0], pair[0], relation=relation)

    while queue and queue[0][0] < LAYER:
        layer, person = queue.popleft()
        # Mark as processed so the person is never queued again.
        done_set.add(person)

        # Show progress
        print(layer, person)

        try:
            person_data = getPerson(person[1])
            popularity = person_data["popularity"]
            movie_credits = person_data["movie_credits"]
        except Exception:
            # call_api raises a plain Exception on HTTP 429, so this cannot
            # be narrowed further; KeyboardInterrupt still gets through.
            print("Could not load", person)
            continue

        if popularity < VALUE:
            # Nodes are keyed by name, not by the (name, id) pair.
            if person[0] in g:
                g.remove_node(person[0])
            else:
                print("not in the graph yet")
            print("popularity under " + str(VALUE) + " skiping...")
            continue

        g.add_node(person[0], nacionality=getNacionality(person_data))

        # Movies the person directed: link to the whole cast.
        for movie in getJobFrom("Director", movie_credits["crew"]):
            credits = getMovie(movie["id"]).get("credits")
            if not credits:
                # Skip movies without credits, as create_graph_simple does.
                continue
            connect(person, layer, credits["cast"], "director-actor")

        # Movies the person acted in: link to co-stars, then to directors.
        for movie in movie_credits["cast"]:
            credits = getMovie(movie["id"]).get("credits")
            if not credits:
                continue
            connect(person, layer, credits["cast"], "actor-actor")
            connect(person, layer, getJobFrom("Director", credits["crew"]), "director-actor")

    return g

In [None]:
def create_graph_simple(SEED):
    """Build the graph of a person's direct collaborators.

    First pass: the seed is linked to every sufficiently popular
    (popularity >= 5.0) cast member of the movies they directed, and to
    every sufficiently popular co-star and director of the movies they
    acted in. Second pass: each of those collaborators is linked to the
    other people already in the graph with whom they share a movie.

    Args:
        SEED: ``(name, id)`` tuple of the central person.

    Returns:
        An ``nx.Graph`` whose nodes are person names carrying a
        "nacionality" attribute and whose edges carry a "relation"
        attribute, or ``None`` when the seed cannot be loaded.

    Notes:
        * Nobody is linked to themselves. The seed is registered as
          ``{SEED}``; ``set(SEED)`` produced ``{name, id}``, which never
          matched a ``(name, id)`` pair.
        * Nodes are keyed by name, so two people with the same name share
          one node.
    """
    VALUE = 5.0  # minimum popularity for a collaborator to join the graph

    g = nx.Graph()
    try:
        seed_data = getPerson(SEED[1])
        seed_credits = seed_data["movie_credits"]
        g.add_node(SEED[0], nacionality=getNacionality(seed_data))
    except Exception:
        # call_api raises a plain Exception on HTTP 429, so this cannot be
        # narrowed further; KeyboardInterrupt still gets through.
        print(SEED[0] + " not found")
        return None

    done_set = {SEED}  # everyone who is a node of the graph
    todo_lst = []      # collaborators to cross-link, in discovery order

    def credits_of(movie):
        """Return the credits of a movie, or None if it has none."""
        return getMovie(movie["id"]).get("credits")

    def admit(members, relation):
        """Add popular, not yet known members as neighbours of the seed."""
        for member in members:
            pair = (member["name"], member["id"])
            if pair in done_set:
                continue
            data = getPerson(pair[1])
            # A payload without popularity counts as not popular.
            if (data.get("popularity") or 0) >= VALUE:
                print(pair)
                done_set.add(pair)
                todo_lst.append(pair)
                g.add_node(pair[0], nacionality=getNacionality(data))
                g.add_edge(SEED[0], pair[0], relation=relation)

    def link(person, members, relation, seen):
        """Link ``person`` to the members that are already in the graph.

        ``seen`` ensures each pair is linked only once per kind of
        relation, which also fixes which relation ends up on the edge.
        """
        for member in members:
            pair = (member["name"], member["id"])
            if pair == person or pair not in done_set or pair in seen:
                continue
            seen.add(pair)
            g.add_edge(person[0], pair[0], relation=relation)

    # First pass: collaborators of the seed.
    for movie in getJobFrom("Director", seed_credits["crew"]):
        credits = credits_of(movie)
        if not credits:
            continue
        admit(credits["cast"], "director-actor")

    for movie in seed_credits["cast"]:
        credits = credits_of(movie)
        if not credits:
            continue
        admit(credits["cast"], "actor-actor")
        admit(getJobFrom("Director", credits["crew"]), "director-actor")

    # Second pass: relations among the collaborators themselves.
    for person in todo_lst:
        try:
            person_credits = getPerson(person[1])["movie_credits"]
            print(person[0] + " relations")
        except Exception:
            print(person[0] + " not found")
            continue

        seen_as_director = set()
        for movie in getJobFrom("Director", person_credits["crew"]):
            credits = credits_of(movie)
            if not credits:
                continue
            link(person, credits["cast"], "director-actor", seen_as_director)

        seen_costars = set()
        seen_directors = set()
        for movie in person_credits["cast"]:
            credits = credits_of(movie)
            if not credits:
                continue
            link(person, credits["cast"], "actor-actor", seen_costars)
            link(person, getJobFrom("Director", credits["crew"]), "director-actor", seen_directors)

    return g

In [None]:
def save_core_graph(name, g, min_degree=3):
    """Write the well-connected core of a graph to ``<name>.graphml``.

    Args:
        name: Output file name without the ".graphml" extension.
        g: Graph to filter.
        min_degree: Minimum degree a node needs to be kept. The default is
            3, the threshold the code has always applied (the old comment
            said 2).

    Returns:
        The subgraph of ``g`` induced by the kept nodes, as returned by
        ``nx.subgraph``.
    """
    # Keep the nodes with degree greater than or equal to min_degree.
    core = [node for node, deg in dict(g.degree()).items() if deg >= min_degree]

    # Select the subgraph induced by the 'core' nodes.
    gsub = nx.subgraph(g, core)

    print("{} nodes, {} edges".format(len(gsub), nx.number_of_edges(gsub)))

    nx.write_graphml(gsub, name + ".graphml")
    return gsub

In [None]:
def show_top_degrees(graph):
    """Print the 100 highest-degree nodes, one "<degree> <node>" per line."""
    degrees = dict(graph.degree())
    ranking = sorted(degrees.items(), key=itemgetter(1), reverse=True)
    lines = ["{} {}".format(degree, node) for node, degree in ranking[:100]]
    print("\n".join(lines))

In [None]:
%%time
# Seed the crawl with the first search hit for "Quentin Tarantino", build the
# network around him, then write its core to "Quentin Tarantino.graphml".
# searchPerson(...)["results"][0] raises IndexError if the search returns nothing.
quentin_tarantino = ("Quentin Tarantino", searchPerson("Quentin Tarantino")["results"][0]["id"])
graph = create_graph(quentin_tarantino)
graph = save_core_graph(quentin_tarantino[0], graph)

In [None]:
# (title, id) of the first search hit for "Pulp Fiction".
pulp_fiction = ("Pulp Fiction", searchMovie("Pulp Fiction")[0]["id"])

In [None]:
# Full movie record, including credits, for the id found above.
movie_data = getMovie(pulp_fiction[1])

In [None]:
%%time
# Build and save one collaborator graph per sufficiently popular cast member
# of the movie loaded in movie_data.
for person in movie_data["credits"]["cast"]:
    person_data = getPerson(person["id"])
    if person_data["popularity"] < 5:
        continue
    pair = (person_data["name"], person_data["id"])
    print("Searching " + str(pair))
    graph = create_graph_simple(pair)
    if graph is None:
        # create_graph_simple returns None when the person cannot be loaded;
        # writing None with nx.write_graphml would abort the whole loop.
        print("Skipped " + str(pair))
        continue
    nx.write_graphml(graph, pair[0]+".graphml")
    print("Saved! " + str(pair))

In [None]:
# Save the movie and people caches to CSV so that a later session can reload
# them with pd.read_csv (see the commented-out lines near the top of the
# notebook). The search caches (SearchPeople, SearchMovies) are not saved here.
IDMovies.to_csv("IDMovies.csv")
IDPeople.to_csv("IDPeople.csv")