# Intermovie

## Imports and useful functions

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from modules.loader import IntermovieDataLoader 

CURATED_LOCAL_PATH = '../data/CURATED/'
RAW_LOCAL_PATH = '../data/RAW/'

data_loader = IntermovieDataLoader()

In [None]:
data_loader.split_data('title.principals.tsv', 'category', ['actor', 'actress', 'director'])
data_loader.split_data('title.akas.tsv', 'region', ['US'])

In [None]:
global_df_names = pd.read_csv(f'{RAW_LOCAL_PATH}name.basics.tsv', sep='\t', usecols=["nconst", "primaryName"], encoding='utf-8')
global_df_title_ratings = pd.read_csv(f'{RAW_LOCAL_PATH}title.ratings.tsv', sep='\t', usecols=['tconst', 'averageRating'], index_col='tconst', encoding='utf-8')

In [12]:
global_df_title_basics = pd.read_csv(f'{RAW_LOCAL_PATH}title.basics.tsv', sep='\t', usecols=['tconst', 'titleType', 'originalTitle', 'genres', ], index_col='tconst', encoding='utf-8')


## 1. La liste des *acteurs* par *film*

In [None]:
# Concatenate actors and actress
df_actors = pd.concat([pd.read_csv(f'{CURATED_LOCAL_PATH}actor.csv', usecols=['tconst', 'nconst'], encoding='utf-8'), pd.read_csv(f'{CURATED_LOCAL_PATH}actress.csv', usecols=['tconst', 'nconst'], encoding='utf-8')])

# Get actors name
df_actors = df_actors.merge(global_df_names)

cast = df_actors.groupby('tconst')['primaryName'].apply(', '.join)

In [29]:
df_movies = global_df_title_basics[global_df_title_basics['titleType'] == 'movie']
df_movies.drop(columns=['titleType'], inplace=True)
df_actors_by_movie = cast.to_frame().merge(df_movies, on='tconst')
df_actors_by_movie.rename(columns={'primaryName': 'cast'}, inplace='True')
df_actors_by_movie.to_csv(CURATED_LOCAL_PATH +'cast.movies.csv', columns=['originalTitle', 'cast'])
df_actors_by_movie

Unnamed: 0_level_0,cast,originalTitle,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000009,"William Courtenay, Chauncey Depew, Blanche Bay...",Miss Jerry,[Romance]
tt0000335,"Harold Graham, Mr. Graham, John Jones, Orrie P...",Soldiers of the Cross,"[Biography, Drama]"
tt0000502,"Antonio del Pozo, El Mochuelo",Bohemios,[\N]
tt0000574,"John Tait, Norman Campbell, Elizabeth Tait, Be...",The Story of the Kelly Gang,"[Biography, Crime, Drama]"
tt0000615,"Jim Gerald, George Merriman, Lance Vane, Willi...",Robbery Under Arms,[Drama]
...,...,...,...
tt9916428,"Kenan Heppe, Vincent Matile, Shenyang Xiao, A....",The Secret of China,"[Adventure, History, War]"
tt9916538,Sahil Shah,Kuambil Lagi Hatiku,[Drama]
tt9916622,Oldair Soares Ammom,Rodolpho Teóphilo - O Legado de um Pioneiro,[Documentary]
tt9916706,"Vijay Patkar, Makarand Anaspure, Sandip Pathak",Dankyavar Danka,[Comedy]


## 2. La liste des *films Américains* (en gardant leur *nom en français*) et *leur note moyenne*

### Titres Américains

In [17]:
df_us = pd.read_csv(f'{CURATED_LOCAL_PATH}US.csv', usecols=['titleId', 'region'], index_col='titleId', encoding='utf-8')
df_us = df_us[~df_us.index.duplicated(keep='first')]
df_us

Unnamed: 0_level_0,region
titleId,Unnamed: 1_level_1
tt0000001,US
tt0000002,US
tt0000005,US
tt0000006,US
tt0000007,US
...,...
tt9914458,US
tt9914772,US
tt9915188,US
tt9915436,US


### En gardant le nom français

In [22]:
df_movies_us = df_movies.merge(df_us, left_index=True, left_on='tconst', right_index=True, right_on='titleId')
df_movies_us_ratings = df_movies_us.merge(global_df_title_ratings, left_on='tconst', right_index=True, right_on='tconst', how='left')
df_movies_us_ratings.to_csv(f'{CURATED_LOCAL_PATH}movies.us.ratings.csv', index=False)
df_movies_us_ratings

Unnamed: 0,tconst,originalTitle,region,averageRating
tt0000009,tt0000009,Miss Jerry,US,5.4
tt0000147,tt0000147,The Corbett-Fitzsimmons Fight,US,5.2
tt0000630,tt0000630,Amleto,US,2.7
tt0000679,tt0000679,The Fairylogue and Radio-Plays,US,4.8
tt0000886,tt0000886,Hamlet,US,5.2
...,...,...,...,...
tt9904328,tt9904328,Christmas in New England,US,
tt9906644,tt9906644,Manoharam,US,7.6
tt9908592,tt9908592,Filmmakers Unite (FU),US,
tt9909228,tt9909228,Ximei,US,5.5


## Les notes moyennes des différents genres


In [18]:
global_df_title_basics['genres'] = global_df_title_basics['genres'].str.split(',')
df_genre_ratings = global_df_title_basics.merge(global_df_title_ratings, on='tconst')
df_genre_ratings = df_genre_ratings.explode('genres').groupby('genres').mean()
df_genre_ratings.to_csv(CURATED_LOCAL_PATH +'genres.ratings.csv')
df_genre_ratings

Unnamed: 0_level_0,averageRating
genres,Unnamed: 1_level_1
Action,6.951029
Adult,6.331053
Adventure,7.05673
Animation,7.046786
Biography,7.180115
Comedy,6.919199
Crime,7.165008
Documentary,7.241741
Drama,7.018454
Family,6.989731


## La note moyenne de chaque acteur par rapport aux films dans lesquels il apparaît

In [21]:
df_movies_ratings = df_movies.merge(global_df_title_ratings, left_on='tconst', right_index=True, right_on='tconst', how='left')
df_actors_ratings = df_actors.merge(df_movies_ratings, on='tconst')
df_actors_ratings = df_actors_ratings.groupby(['nconst', 'primaryName'])['averageRating'].mean()
df_actors_ratings.to_csv(CURATED_LOCAL_PATH +'actors.ratings.csv')
df_actors_ratings

nconst     primaryName              
nm0000001  Fred Astaire                 6.802778
nm0000004  John Belushi                 6.371429
nm0000005  Ingmar Bergman               7.033333
nm0000007  Humphrey Bogart              6.912500
nm0000008  Marlon Brando                6.738462
                                          ...   
nm9993567  Hachiro Chiba                     NaN
nm9993616  Ryan Mac Lennan              6.500000
nm9993636  Adam French                       NaN
nm9993650  Marcin Balcerak              5.200000
nm9993680  Christopher-Lawson Palmer         NaN
Name: averageRating, Length: 359686, dtype: float64

## Bonus

Content-based recommenders: suggest similar items based on a particular item. This system uses item metadata, such as genre, director, description, actors, etc. for movies, to make these recommendations. The general idea behind these recommender systems is that if a person likes a particular item, he or she will also like an item that is similar to it. And to recommend that, it will make use of the user's past item metadata. A good example could be YouTube, where based on your *history*, it suggests you new videos that you could potentially watch.

### Problem formulation
To build a recommender system that recommends movies based on the genre, cast and crew of a previously watched movie.

In [47]:
df_cast = cast.to_frame()
df_cast.rename(columns={'primaryName': 'cast'}, inplace='True')

In [48]:
df_directors = pd.read_csv(f'{CURATED_LOCAL_PATH}director.csv', usecols=['tconst', 'nconst'], encoding='utf-8')
df_directors = df_directors.merge(global_df_names)
directors = df_directors.groupby('tconst')['primaryName'].apply(', '.join)

In [49]:
df_directors = directors.to_frame()
df_directors.rename(columns={'primaryName': 'crew'}, inplace='True')

In [50]:
df_cast_crew = df_cast.merge(df_directors, on='tconst')

In [51]:
def get_list(x):
    if isinstance(x, list):
        names = [i for i in x]
        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    # Return empty list in case of missing/malformed data
    return []

df_imdb = df_cast_crew.merge(df_movies, on='tconst')
# Remove space
df_imdb['cast'] = df_imdb['cast'].apply(lambda x: str(x).replace(' ', ''))
df_imdb['crew'] = df_imdb['crew'].apply(lambda x: str(x).replace(' ', ''))
df_imdb['cast'] = df_imdb['cast'].apply(lambda x: x.split(','))
df_imdb['crew'] = df_imdb['crew'].apply(lambda x: x.split(','))
df_imdb['metadata'] = df_imdb.apply(lambda x : ' '.join(x['genres']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['crew']), axis = 1)
df_imdb['cast'] = df_imdb['cast'].apply(get_list)
df_imdb

Unnamed: 0_level_0,cast,crew,originalTitle,genres,metadata
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0000009,"[WilliamCourtenay, ChaunceyDepew, BlancheBayliss]",[AlexanderBlack],Miss Jerry,[Romance],Romance WilliamCourtenay ChaunceyDepew Blanche...
tt0000335,"[HaroldGraham, Mr.Graham, JohnJones]","[HerbertBooth, JosephPerry]",Soldiers of the Cross,"[Biography, Drama]",Biography Drama HaroldGraham Mr.Graham JohnJon...
tt0000502,"[AntoniodelPozo, ElMochuelo]",[RicardodeBaños],Bohemios,[\N],\N AntoniodelPozo ElMochuelo RicardodeBaños
tt0000574,"[JohnTait, NormanCampbell, ElizabethTait]",[CharlesTait],The Story of the Kelly Gang,"[Biography, Crime, Drama]",Biography Crime Drama JohnTait NormanCampbell ...
tt0000615,"[JimGerald, GeorgeMerriman, LanceVane]",[CharlesMacMahon],Robbery Under Arms,[Drama],Drama JimGerald GeorgeMerriman LanceVane Willi...
...,...,...,...,...,...
tt0004302,"[FredMontague, MaxFigman, HarryFisher]","[OscarApfel, CecilB.DeMille]",The Man on the Box,"[Comedy, Drama]",Comedy Drama FredMontague MaxFigman HarryFishe...
tt0004303,"[HenryWeaver, LucienMuratore, WilliamL.Abingdon]",[HerbertHallWinslow],Manon Lescaut,"[Drama, Romance]",Drama Romance HenryWeaver LucienMuratore Willi...
tt0004306,"[WellingtonA.Playter, GeorgeMoss, HalClarendon]",[J.SearleDawley],Marta of the Lowlands,[Drama],Drama WellingtonA.Playter GeorgeMoss HalClaren...
tt0004307,"[RayMyers, HerbertRawlinson, LawrencePeyton]",[HobartBosworth],Martin Eden,[Drama],Drama RayMyers HerbertRawlinson LawrencePeyton...


In [58]:
count_vec = CountVectorizer()
count_vec_matrix = count_vec.fit_transform(df_imdb['metadata'])
cosine_sim_matrix = cosine_similarity(count_vec_matrix, count_vec_matrix)
df_imdb = df_imdb.reset_index()
mapping = pd.Series(df_imdb.index, index = df_imdb['originalTitle'])

In [65]:
#recommender function to recommend movies based on metadata
def recommend_movies_based_on_metadata(movie_input):
    movie_index = mapping[movie_input]
    #get similarity values with other movies
    similarity_score = list(enumerate(cosine_sim_matrix[movie_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    # Get the scores of the 15 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:15]
    movie_indices = [i[0] for i in similarity_score]
    return (df_imdb['originalTitle'].iloc[movie_indices])

In [66]:
recommend_movies_based_on_metadata('Germinal')

41     Les misérables - Époque 1: Jean Valjean
155         Les misérables - Époque 2: Fantine
5                                       Amleto
8                                Andreas Hofer
288                         A Factory Magdalen
317                                     Amleto
6              Don Álvaro o la fuerza del sino
10                                      Hamlet
15                                      Amleto
48                                      Anfisa
101                Buried Alive in a Coal Mine
140             Kleiner Svend und seine Mutter
148                       The Lure of New York
160             One Hundred Years of Mormonism
Name: originalTitle, dtype: object