In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import re
import io
import matplotlib.pylab as plt
from fa2 import ForceAtlas2
import json
import ast
from collections import Counter
import pickle
import time, sys
from IPython.display import clear_output

We will now create the network of directors. We do this by running through the credits data file, which contains information about directors and actors of each movie. 

In [214]:
# progress bar
def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Parameters
    ----------
    progress : int or float
        Fraction of the work that is done. Integers are converted to float,
        any other non-float value is treated as 0, and the result is clamped
        so that values below 0 show as 0% and values of 1 or more as 100%.

    The previous cell output is cleared (deferred until the new text is
    available, via ``wait=True``) and a line such as
    ``Progress: [#####---------------] 25.0%`` is printed.
    """
    width = 20

    # Normalise the input: ints become floats, everything else non-float counts as "no progress".
    fraction = float(progress) if isinstance(progress, int) else progress
    if not isinstance(fraction, float):
        fraction = 0
    elif fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(width * fraction))
    bar = "#" * filled + "-" * (width - filled)

    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [215]:
# Load the credits table; the number of rows is used later as the movie count
df = pd.read_csv('./../../dataset/credits_short.csv', encoding ='utf-8')
number_of_movies = df.shape[0]

In [7]:
# helper functions

# get all actors of a movie
def get_actors(movie_cast):
    """Return the ``'name'`` of every entry in ``movie_cast``, in order.

    ``movie_cast`` is an iterable of dict-like cast entries; an entry without
    a ``'name'`` key raises ``KeyError``.
    """
    return [member['name'] for member in movie_cast]

# get the director of a movie
def get_director(movie_crew):
    """Return the name of the first crew entry whose ``'job'`` is ``'Director'``.

    ``movie_crew`` is an iterable of dict-like crew entries. Returns ``None``
    when no entry has the job ``'Director'``.
    """
    director_names = (
        member['name'] for member in movie_crew if member['job'] == 'Director'
    )
    # Only the first match is consumed; later entries are never inspected.
    return next(director_names, None)

In [8]:
# Create director network where directors are connected if they employed the same actors
# Warning! Slow on large inputs: every pair of movies is compared (quadratic in the number of movies).
#
# Node: a director name.
# Edge: the two directors have at least one pair of movies that share an actor.
#       'weight' counts those movie pairs and 'movies' lists them as
#       (movie_id, other_movie_id, set_of_shared_actor_names).
# Movies whose cast or crew cannot be read, or that list no director, are skipped
# and recorded in directors_errors as ['error in movie with id: <id>'].
directors_network = nx.Graph()
directors_errors = []

# Step 1: read and parse every movie exactly once (instead of once per movie pair).
parsed_movies = []  # (movie_id, director, set_of_actor_names) for every usable movie, in file order
for movie_id in df['id']:
    try:
        movie_row = df.loc[df['id'] == movie_id]
        # .item() raises unless exactly one row matches this id
        movie_actors = set(get_actors(ast.literal_eval(movie_row['cast'].item())))
        movie_director = get_director(ast.literal_eval(movie_row['crew'].item()))
        if movie_director is None:
            # get_director found no crew member with the job 'Director';
            # None must not become a node of the graph
            raise ValueError('no director listed')
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel still works
        directors_errors.append(['error in movie with id: ' + str(movie_id)])
    else:
        parsed_movies.append((movie_id, movie_director, movie_actors))

# Step 2: compare every pair of usable movies once.
for i, (movie_id, director, actors) in enumerate(parsed_movies):
    update_progress(i / len(parsed_movies))

    # add director to network (also keeps directors without any shared actor as isolated nodes)
    if director not in directors_network:
        directors_network.add_node(director)

    # only look at the movies after this one, so each pair is visited exactly once
    for other_movie, other_director, other_actors in parsed_movies[i + 1:]:
        common_actors = actors & other_actors
        if not common_actors or director == other_director:
            continue

        if directors_network.has_edge(other_director, director):
            # edge exists: one more movie pair links these two directors
            edge_data = directors_network[other_director][director]
            edge_data['weight'] += 1
            edge_data['movies'].append((movie_id, other_movie, common_actors))
        else:
            # add_edge also creates other_director as a node if it is not in the graph yet
            directors_network.add_edge(
                other_director,
                director,
                weight=1,
                movies=[(movie_id, other_movie, common_actors)],
            )
update_progress(1)

Progress: [####################] 100.0%


In [64]:
# add more attributes to the nodes
# (we could of course have done this when we created the network too)
#
# Every director node receives:
#   first_movie / last_movie : smallest / largest release_date among the director's movies
#                              (None when no movie has a release date)
#   languages                : original_language of each movie (one entry per movie)
#   genres                   : genre names of all movies (one entry per movie and genre)
#   movies                   : the director's 'work' list from director_popularity
#   rating / rating_count    : 'vote_average' / 'vote_count' from director_popularity

#Read files
director_popularity = pd.read_csv('../../director_popularity.csv', encoding ='utf-8')
metadata = pd.read_csv('../../dataset/movies_metadata_short.csv', encoding ='utf-8')

# list(...) takes a snapshot of the nodes before their attributes are updated
for node in list(directors_network.nodes()):
    # look the director up once; .item() raises unless exactly one row matches
    director_row = director_popularity.loc[director_popularity['director'] == node]
    movies = ast.literal_eval(director_row['work'].item())
    rating = director_row['vote_average'].item()
    rating_count = director_row['vote_count'].item()

    languages = []
    genres = []
    # reset per director, so a director without (dated) movies never inherits
    # the dates of the previously processed director
    first_movie = None
    last_movie = None

    for m in movies:
        movie_row = metadata.loc[metadata['id'] == m]
        release_date = movie_row['release_date'].item()
        languages.append(movie_row['original_language'].item())

        # get the names of each genre (they are originally saved as objects)
        movie_genres = ast.literal_eval(movie_row['genres'].item())
        genres += [g['name'] for g in movie_genres]

        # a missing release date cannot be compared with the others, so it is ignored here
        if pd.isnull(release_date):
            continue
        if first_movie is None:
            # first dated movie initializes both bounds
            first_movie = release_date
            last_movie = release_date
        else:
            # update first and last movie if necessary
            first_movie = min(first_movie, release_date)
            last_movie = max(last_movie, release_date)

    # add_node on an existing node only updates its attributes; unlike Graph.node[...]
    # (removed in NetworkX 2.4) this call is available in every NetworkX version
    directors_network.add_node(
        node,
        first_movie=first_movie,
        last_movie=last_movie,
        languages=languages,
        genres=genres,
        movies=movies,
        rating=rating,
        rating_count=rating_count,
    )

In [65]:
# Save the graph as a pickle file. nx.write_gpickle was removed in NetworkX 3.0;
# dumping the graph with pickle directly writes the same kind of file, which can be
# loaded again with pickle.load (only load pickle files from a trusted source).
with open("popular_directors_network.gpickle", "wb") as gpickle_file:
    pickle.dump(directors_network, gpickle_file, pickle.HIGHEST_PROTOCOL)