In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from itertools import combinations

pd.options.mode.chained_assignment = None

In [2]:
data_path = "./data"
if os.listdir(data_path) != ['tmdb_5000_credits.csv', 'tmdb_5000_movies.csv']:
    print("[ERROR] Please download and unzip the dataset in a subdirectory './data'.")
else:
    print("[INFO] The dataset is correctly placed.")

[INFO] The dataset is correctly placed.


### Loading the data

In [3]:
tmdb_credits = pd.read_csv(os.path.join(data_path, "tmdb_5000_credits.csv"))
tmdb_credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
tmdb_movies = pd.read_csv(os.path.join(data_path, "tmdb_5000_movies.csv"))
tmdb_movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [5]:
def mapping_casts(list_casts, useful_keys = {"id", "gender", "name"}):
    """Mapping casts to keep useful information."""
    result = []
    for cast in list_casts:
        filtering = {k:v for k,v in cast.items() if k in useful_keys}
        if len(filtering) == len(useful_keys):
            result.append(filtering)
    return result
mapping_casts(literal_eval(tmdb_credits.cast[0]))[:5]

[{'gender': 2, 'id': 65731, 'name': 'Sam Worthington'},
 {'gender': 1, 'id': 8691, 'name': 'Zoe Saldana'},
 {'gender': 1, 'id': 10205, 'name': 'Sigourney Weaver'},
 {'gender': 2, 'id': 32747, 'name': 'Stephen Lang'},
 {'gender': 1, 'id': 17647, 'name': 'Michelle Rodriguez'}]

In [6]:
tmdb_credits_clean = tmdb_credits.copy()
tmdb_credits_clean['cast'] = tmdb_credits_clean['cast'].apply(lambda x: mapping_casts(literal_eval(x)))
tmdb_credits_clean.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'gender': 2, 'id': 65731, 'name': 'Sam Worth...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{'gender': 2, 'id': 85, 'name': 'Johnny Depp'...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{'gender': 2, 'id': 8784, 'name': 'Daniel Cra...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{'gender': 2, 'id': 3894, 'name': 'Christian ...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{'gender': 2, 'id': 60900, 'name': 'Taylor Ki...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [7]:
tmdb_credits_clean_cast = tmdb_credits_clean[["movie_id", "title", "cast"]].explode("cast")


def get_gender(cast):
    splitted = str(cast).split()
    if len(splitted)>1:
        return splitted[1][0]
    else:
        -1
        
def get_id(cast):
    splitted = str(cast).split()
    if len(splitted)>1:
        return splitted[3].replace(",", "")
    else:
        -1
        
def get_name(cast):
    splitted = str(cast).split(", ")
    if len(splitted)>1:
        return splitted[2].replace("'", "").replace(",", "").replace("name: ", "").replace("}", "")
    else:
        -1

tmdb_credits_clean_cast['gender'] = tmdb_credits_clean_cast.cast.apply(get_gender)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.gender != -1]

tmdb_credits_clean_cast['person_id'] = tmdb_credits_clean_cast.cast.apply(get_id)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.person_id != -1]

tmdb_credits_clean_cast['name'] = tmdb_credits_clean_cast.cast.apply(get_name)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.name != -1]

tmdb_credits_clean_cast.drop(columns=['cast'], inplace=True)
tmdb_credits_clean_cast.head()

Unnamed: 0,movie_id,title,gender,person_id,name
0,19995,Avatar,2,65731,Sam Worthington
0,19995,Avatar,1,8691,Zoe Saldana
0,19995,Avatar,1,10205,Sigourney Weaver
0,19995,Avatar,2,32747,Stephen Lang
0,19995,Avatar,1,17647,Michelle Rodriguez


In [8]:
create_pair = lambda x : pd.DataFrame(list(combinations(x.values,2)), 
                            columns=['actor_1','actor_2'])

combo = (tmdb_credits_clean_cast.groupby('movie_id')['name'].apply(create_pair)
                               .reset_index(level=1, drop=True)
                               .reset_index())
combo['check_string'] = combo.apply(lambda row: ''.join(sorted([row['actor_1'], row['actor_2'], str(row['movie_id'])])), axis=1)
combo.drop_duplicates('check_string', inplace=True)
combo.drop(columns=['check_string'], inplace=True)

In [9]:
combo = combo[combo.actor_1 != combo.actor_2]
combo.head()

Unnamed: 0,movie_id,actor_1,actor_2
0,5,Tim Roth,Antonio Banderas
1,5,Tim Roth,Jennifer Beals
2,5,Tim Roth,Madonna
3,5,Tim Roth,Marisa Tomei
4,5,Tim Roth,Bruce Willis


In [10]:
pair = combo.groupby(by=['actor_1', 'actor_2'])['movie_id'].apply(list)
pair = pair.reset_index()
pair.head()

Unnamed: 0,actor_1,actor_2,movie_id
0,Jorge de los Reyes,Anna Haack,[242575]
1,Jorge de los Reyes,Esther Maria Pietsch,[242575]
2,Jorge de los Reyes,Frank W. Rima,[242575]
3,Jorge de los Reyes,Jorge Ramírez Suárez,[242575]
4,Jorge de los Reyes,Karl Friedrich,[242575]


In [30]:
def filter_num_common_movies(n):
    """ Return pair with at least n common movies. """
    temp_df = pair[pair['movie_id'].map(len) >= n]
    remove_qm = lambda serie: serie.apply(lambda name: name.replace('"', ''))
    temp_df['actor_1'] = remove_qm(temp_df['actor_1'])
    temp_df['actor_2'] = remove_qm(temp_df['actor_2'])
    return temp_df[temp_df['movie_id'].map(len) >= n]
filter_num_common_movies(3).head()

Unnamed: 0,actor_1,actor_2,movie_id
2131,Brian OHalloran,Jeff Anderson,"[1832, 2292, 2295]"
2155,Brian OHalloran,Scott Mosier,"[1832, 2292, 2293, 2295]"
2163,Brian OHalloran,Walt Flanagan,"[1832, 2292, 2293, 2295]"
3050,Catherine OHara,Bob Balaban,"[13785, 14799, 16448]"
3079,Catherine OHara,Don Lake,"[13370, 14799, 16448]"


In [39]:
def get_unique_movies(n):
    """ Return movies of actors with at least n common movies. """
    slist =[]
    for x in filter_num_common_movies(n)['movie_id']:
        slist.extend(x)
    return sorted(list(set(slist)))
get_unique_movies(3)[:3]

[5, 11, 12]

In [40]:
def parse_genres(genres):
    L=[]
    for genre in genres:
        L.append(genre['name'])
    if len(L) == 0:
        return np.nan
    return L

def parse_production_companies(prod_companies):
    L=[]
    for company in prod_companies:
        L.append(company['name'])
    if len(L) == 0:
        return np.nan
    return L

def get_movies_info(n):
    """ Return info for movies with n common movies. """
    movies_to_keep = get_unique_movies(n)
    temp_df = tmdb_movies[tmdb_movies['id'].isin(movies_to_keep)]
    temp_df.drop(columns=['original_language', 'keywords', 'production_countries', 'spoken_languages', 'status', 'original_title'], inplace=True)
    temp_df["genres"] = temp_df.genres.apply(lambda x: parse_genres(literal_eval(x)))
    temp_df["production_companies"] = temp_df.production_companies.apply(lambda x: parse_genres(literal_eval(x)))
    return temp_df
get_movies_info(3).head(2)

Unnamed: 0,budget,genres,homepage,id,overview,popularity,production_companies,release_date,revenue,runtime,tagline,title,vote_average,vote_count
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",2007-05-19,961000000,169.0,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B24]",2015-10-26,880674609,148.0,A Plan No One Escapes,Spectre,6.3,4466


In [57]:
def make_nodes(n):
    """ Return nodes field. """
    df = filter_num_common_movies(n)
    gender_df = tmdb_credits_clean_cast[['name', 'gender', 'person_id']].drop_duplicates(subset='name').dropna()
    gender_df.name = gender_df.name.apply(lambda name: name.replace('"', ''))
    l = df['actor_1'].append(df['actor_2'], ignore_index=True).unique()
    actors = pd.DataFrame(l,columns=['name'])
    joined = actors.join(gender_df.set_index('name'), on='name', how='left').reset_index()
    joined.fillna(0, inplace=True)

    mapper = lambda row: {"id":row.values[1], "gender":row.values[2], "person_id":row.values[3]}
    nodes = []
    for i, row in joined.iterrows():
        nodes.append(mapper(row))
    return nodes
make_nodes(3)[0]

{'id': 'Brian OHalloran', 'gender': '2', 'person_id': '23629'}

In [58]:
def make_links(n):
    df = filter_num_common_movies(n)
    mapper = lambda row: {"source": row['actor_1'], "target": row['actor_2'], "movie_id": row['movie_id']}
    links = []
    for i, row in df.iterrows():
        links.append(mapper(row))
    return links
make_links(3)[0]

{'source': 'Brian OHalloran',
 'target': 'Jeff Anderson',
 'movie_id': [1832, 2292, 2295]}

In [59]:
def make_movie_infos(n):
    df = get_movies_info(n).fillna("")
    movies_info = []
    print(df)
    #df.dropna(inplace=True)
    mapper = lambda row: {"movie_id": row.id,\
                          "genres": row.genres,\
                          "homepage": row.homepage,\
                          "title": row.title,\
                          "popularity": row.popularity,\
                          "release_date": row.release_date,\
                          "vote_average": row.vote_average,\
                          "production_companies": row.production_companies,\
                          "overview": row.overview,\
                          "revenue": row.revenue,\
                          "budget": row.budget}
    for i, row in df.iterrows():
        movies_info.append(mapper(row))
    return movies_info
make_movie_infos(3)[0]

         budget                            genres  \
1     300000000      [Adventure, Fantasy, Action]   
2     245000000        [Action, Adventure, Crime]   
3     250000000  [Action, Crime, Drama, Thriller]   
5     258000000      [Fantasy, Action, Adventure]   
6     260000000               [Animation, Family]   
...         ...                               ...   
4697          0                 [Comedy, Romance]   
4738      60000        [Mystery, Drama, Thriller]   
4772      31192           [Drama, Action, Comedy]   
4773      27000                          [Comedy]   
4802          0                     [Documentary]   

                                             homepage      id  \
1        http://disney.go.com/disneypictures/pirates/     285   
2         http://www.sonypictures.com/movies/spectre/  206647   
3                  http://www.thedarkknightrises.com/   49026   
5     http://www.sonypictures.com/movies/spider-man3/     559   
6        http://disney.go.com/disneypi

{'movie_id': 285,
 'genres': ['Adventure', 'Fantasy', 'Action'],
 'homepage': 'http://disney.go.com/disneypictures/pirates/',
 'title': "Pirates of the Caribbean: At World's End",
 'popularity': 139.082615,
 'release_date': '2007-05-19',
 'vote_average': 6.9,
 'production_companies': ['Walt Disney Pictures',
  'Jerry Bruckheimer Films',
  'Second Mate Productions'],
 'overview': 'Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.',
 'revenue': 961000000,
 'budget': 300000000}

In [60]:
import json

def write_json(n):
    """ Write a json out of nodes, links, movie_info fields with num_common being the number n of common movies. """
    data = {}
    data['nodes'] = make_nodes(n)
    data['links'] = make_links(n)
    data['movies_info'] = make_movie_infos(n)
    with open('dataset_{}_common_movies.json'.format(n), 'w') as outfile:
        json.dump(data, outfile)
for n in range(3,11):
    write_json(n)

         budget                            genres  \
1     300000000      [Adventure, Fantasy, Action]   
2     245000000        [Action, Adventure, Crime]   
3     250000000  [Action, Crime, Drama, Thriller]   
5     258000000      [Fantasy, Action, Adventure]   
6     260000000               [Animation, Family]   
...         ...                               ...   
4697          0                 [Comedy, Romance]   
4738      60000        [Mystery, Drama, Thriller]   
4772      31192           [Drama, Action, Comedy]   
4773      27000                          [Comedy]   
4802          0                     [Documentary]   

                                             homepage      id  \
1        http://disney.go.com/disneypictures/pirates/     285   
2         http://www.sonypictures.com/movies/spectre/  206647   
3                  http://www.thedarkknightrises.com/   49026   
5     http://www.sonypictures.com/movies/spider-man3/     559   
6        http://disney.go.com/disneypi

         budget                                genres  \
5     258000000          [Fantasy, Action, Adventure]   
7     280000000  [Action, Adventure, Science Fiction]   
8     250000000          [Adventure, Fantasy, Family]   
13    255000000          [Action, Adventure, Western]   
16    220000000  [Science Fiction, Action, Adventure]   
...         ...                                   ...   
4359          0                               [Drama]   
4593          0                         [Documentary]   
4631     250000              [Comedy, Drama, Romance]   
4697          0                     [Comedy, Romance]   
4773      27000                              [Comedy]   

                                               homepage      id  \
5       http://www.sonypictures.com/movies/spider-man3/     559   
7     http://marvel.com/movies/movie/193/avengers_ag...   99861   
8     http://harrypotter.warnerbros.com/harrypottera...     767   
13                http://disney.go.com/the-lone

         budget                                genres  \
7     280000000  [Action, Adventure, Science Fiction]   
8     250000000          [Adventure, Fantasy, Family]   
16    220000000  [Science Fiction, Action, Adventure]   
26    250000000  [Adventure, Action, Science Fiction]   
31    200000000  [Action, Adventure, Science Fiction]   
...         ...                                   ...   
3884    2500000         [Adventure, Action, Thriller]   
4071    2000000         [Action, Thriller, Adventure]   
4339     950000         [Adventure, Action, Thriller]   
4359          0                               [Drama]   
4593          0                         [Documentary]   

                                               homepage      id  \
7     http://marvel.com/movies/movie/193/avengers_ag...   99861   
8     http://harrypotter.warnerbros.com/harrypottera...     767   
16                    http://marvel.com/avengers_movie/   24428   
26             http://marvel.com/captainamerica

         budget                                          genres  \
121   150000000    [Adventure, Fantasy, Action, Comedy, Family]   
195   127000000            [Adventure, Comedy, Fantasy, Family]   
243   110000000    [Action, Adventure, Comedy, Family, Fantasy]   
284   100000000                               [Comedy, Romance]   
361    90000000                                [Comedy, Action]   
362    88000000               [Action, Comedy, Science Fiction]   
390    85000000            [Animation, Comedy, Family, Fantasy]   
416    82500000               [Comedy, Drama, Fantasy, Romance]   
434    82000000                                 [Comedy, Drama]   
436    80000000                                        [Comedy]   
443    80000000              [Fantasy, Comedy, Family, Romance]   
445    80000000                               [Romance, Comedy]   
460    80000000                       [Comedy, Romance, Family]   
464    80000000                     [Animation, Comedy, Family

         budget                                          genres  \
121   150000000    [Adventure, Fantasy, Action, Comedy, Family]   
195   127000000            [Adventure, Comedy, Fantasy, Family]   
243   110000000    [Action, Adventure, Comedy, Family, Fantasy]   
284   100000000                               [Comedy, Romance]   
362    88000000               [Action, Comedy, Science Fiction]   
390    85000000            [Animation, Comedy, Family, Fantasy]   
436    80000000                                        [Comedy]   
443    80000000              [Fantasy, Comedy, Family, Romance]   
445    80000000                               [Romance, Comedy]   
460    80000000                       [Comedy, Romance, Family]   
464    80000000                     [Animation, Comedy, Family]   
471    85000000                      [Comedy, Fantasy, Romance]   
499    79000000                                        [Comedy]   
515    75000000                               [Comedy, Romance