# Import packages

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import Levenshtein
import time

# Read in data

## Read in Mubi movie data

In [2]:
# Load the Mubi movie metadata from CSV into a dataframe.
mubi_movie = pd.read_csv('mubi_movie_data.csv')

In [3]:
# Preview the first rows to check which columns the Mubi data provides.
mubi_movie.head()

Unnamed: 0,movie_id,movie_title,movie_release_year,movie_url,movie_title_language,movie_popularity,movie_image_url,director_id,director_name,director_url
0,1,La Antena,2007.0,http://mubi.com/films/la-antena,en,105,https://images.mubicdn.net/images/film/1/cache...,131,Esteban Sapir,http://mubi.com/cast/esteban-sapir
1,2,Elementary Particles,2006.0,http://mubi.com/films/elementary-particles,en,23,https://images.mubicdn.net/images/film/2/cache...,73,Oskar Roehler,http://mubi.com/cast/oskar-roehler
2,3,It's Winter,2006.0,http://mubi.com/films/its-winter,en,21,https://images.mubicdn.net/images/film/3/cache...,82,Rafi Pitts,http://mubi.com/cast/rafi-pitts
3,4,Kirikou and the Wild Beasts,2005.0,http://mubi.com/films/kirikou-and-the-wild-beasts,en,46,https://images.mubicdn.net/images/film/4/cache...,"89, 90","Michel Ocelot, Bénédicte Galup","http://mubi.com/cast/michel-ocelot, http://mub..."
4,5,Padre Nuestro,2007.0,http://mubi.com/films/padre-nuestro,en,7,https://images.mubicdn.net/images/film/5/cache...,92,Christopher Zalla,http://mubi.com/cast/christopher-zalla


## Read in IMDb movie data

In [4]:
# Load the IMDb title table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what triggers the
# "columns have mixed types" DtypeWarning and can leave a column such as
# startYear holding a mix of ints and strings.
imdb_movie = pd.read_csv('title_basic.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the first rows to check which columns the IMDb data provides.
imdb_movie.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Separate Imdb movie data

### Movies without a start year

In [6]:
# Titles whose startYear holds the literal '\N' placeholder, i.e. no year given.
has_no_year = imdb_movie['startYear'].eq('\\N')
imdb_no_year = imdb_movie.loc[has_no_year]

In [7]:
# Preview the titles whose startYear is the '\N' placeholder.
imdb_no_year.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
65778,tt0067098,tvEpisode,Willi Forst,Willi Forst,0,\N,\N,55,\N
69722,tt0071158,tvEpisode,The Arcata Promise,The Arcata Promise,0,\N,\N,\N,\N
70395,tt0071844,movie,Till Marriage Do Us Part,"Mio Dio, come sono caduta in basso!",0,\N,\N,110,Comedy
83832,tt0085677,tvEpisode,High Country,High Country,0,\N,\N,\N,Sport
84395,tt0086249,tvEpisode,Pilot,Pilot,0,\N,\N,\N,"Adventure,Comedy,Romance"


### Movies with a start year

In [8]:
# Keep only the titles whose startYear is not the '\N' placeholder, then take
# an independent copy so later column assignments do not touch imdb_movie.
has_year = imdb_movie['startYear'].ne('\\N')
tmp = imdb_movie.loc[has_year]
imdb_has_year = tmp.copy()

In [9]:
# Convert startYear to a numeric dtype
imdb_has_year['startYear'] = pd.to_numeric(tmp['startYear'])

In [10]:
# Preview the dated titles.
imdb_has_year.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Create sets of unique IMDb movie titles

## Create whole movie list

In [11]:
# Every distinct primaryTitle in the full IMDb table, as a set for fast lookup.
distinct_titles = imdb_movie['primaryTitle'].unique()
whole_movie_set = set(distinct_titles)

In [12]:
# Remove missing titles (NaN).
# NaN never compares equal to itself, so set.remove(np.nan) only succeeds when
# the set holds the very same NaN object, and it raises KeyError when there is
# nothing to remove (for example when this cell is run a second time).
# Filtering with pd.notna is identity-independent and safe to re-run.
whole_movie_set = {title for title in whole_movie_set if pd.notna(title)}

## Create movie list and movie dataframe by year

In [13]:
# Distinct start years present in the dated IMDb titles, in ascending order.
years = np.sort(imdb_has_year['startYear'].unique())

In [14]:
# Map each start year to the dataframe of IMDb titles released in that year.
# A single groupby pass replaces one full boolean scan of imdb_has_year per
# distinct year. Rows keep their original order and index labels within each
# group, so positional and label lookups on the per-year frames are unchanged.
movie_df_dict = {
    year: year_df for year, year_df in imdb_has_year.groupby('startYear')
}

In [15]:
# Map each start year to the set of distinct titles released in that year.
movie_list_dict = {
    year: set(movie_df_dict[year]['primaryTitle'].unique())
    for year in years
}

## Create movie list with no start year

In [16]:
# Distinct titles among the IMDb entries that have no start year.
undated_titles = imdb_no_year['primaryTitle'].unique()
movie_list_no_year = set(undated_titles)

# Create final movie genre dataframe

In [43]:
# Pull out the three Mubi columns used to seed the result table.
mubi_movie_list = mubi_movie['movie_title']
mubi_movie_list_year = mubi_movie['movie_release_year']
mubi_movie_list_id = mubi_movie['movie_id']

In [44]:
# Empty result table with one row per Mubi movie; every cell starts as NaN.
movie_df = pd.DataFrame(
    np.nan,
    index=range(len(mubi_movie_list)),
    columns=['mubi_id', 'mubi_release_year', 'mubi_title',
             'imdb_title', 'title_similarity', 'genres'],
)
# The scalar-NaN constructor makes every column float64. The title and genre
# columns will hold strings, and writing a string into a float64 column is
# deprecated upcasting in recent pandas, so give them object dtype up front.
movie_df = movie_df.astype(
    {'mubi_title': object, 'imdb_title': object, 'genres': object}
)

In [45]:
# Fill the Mubi-side columns; a missing release year is stored as 0.
movie_df['mubi_id'] = mubi_movie_list_id
movie_df['mubi_release_year'] = mubi_movie_list_year.fillna(0)
movie_df['mubi_title'] = mubi_movie_list

In [46]:
# Preview the last rows: the Mubi columns are filled, the IMDb columns are still empty.
movie_df.tail()

Unnamed: 0,mubi_id,mubi_release_year,mubi_title,imdb_title,title_similarity,genres
226570,263706,2020.0,Kill Chain: The Cyber War on America's Elections,,,
226571,263707,2020.0,We're Here,,,
226572,263708,1962.0,The Weasel,,,
226573,263709,1931.0,Scenes from the Family Life,,,
226574,263710,2001.0,Karine,,,


## Find Mubi movies that also exist in the IMDb movie dataframe

### Perfect match

In [47]:
# Preview the first rows before matching.
movie_df.head()

Unnamed: 0,mubi_id,mubi_release_year,mubi_title,imdb_title,title_similarity,genres
0,1,2007.0,La Antena,,,
1,2,2006.0,Elementary Particles,,,
2,3,2006.0,It's Winter,,,
3,4,2005.0,Kirikou and the Wild Beasts,,,
4,5,2007.0,Padre Nuestro,,,


In [60]:
# Exact-title matching: for every Mubi movie, look for an IMDb title with the
# identical spelling released in the same year, the year before, or the year
# after (tried in that order). On the first hit, record the title, a
# similarity of 1, and the genres of the first IMDb row carrying that title.

# Strings are about to be written into these columns, so make sure they are not
# float64 (writing a string into a float column is deprecated upcasting).
movie_df = movie_df.astype({'imdb_title': object, 'genres': object})

# year -> {title: genres of the first IMDb row with that title}.
# Built lazily, once per year, so a hit costs a dict lookup instead of a full
# scan of that year's dataframe.
genres_by_title = {}

for idx in tqdm(movie_df.index):
    mv = movie_df.loc[idx, 'mubi_title']
    if pd.isna(mv):
        continue  # a missing title cannot match anything
    tmp_year = movie_df.loc[idx, 'mubi_release_year']
    # Give the release year a tolerance of one year either way.
    for y in (tmp_year, tmp_year - 1, tmp_year + 1):
        if y not in movie_list_dict or mv not in movie_list_dict[y]:
            continue
        if y not in genres_by_title:
            first_rows = movie_df_dict[y].drop_duplicates(subset='primaryTitle')
            genres_by_title[y] = dict(zip(first_rows['primaryTitle'],
                                          first_rows['genres']))
        # Write back to the row that was read (idx), not to a separate counter.
        movie_df.loc[idx, 'imdb_title'] = mv
        movie_df.loc[idx, 'title_similarity'] = 1
        movie_df.loc[idx, 'genres'] = genres_by_title[y][mv]
        break

100%|██████████| 226575/226575 [1:00:38<00:00, 62.28it/s]


In [71]:
# Checkpoint the exact-match results to disk.
movie_df.to_csv('movie_genre_perfect_match.csv')

In [187]:
# Work on a copy so the fuzzy-matching step does not modify movie_df.
movie_after_perfect_match = movie_df.copy()

### Fuzzy match

In [188]:
# Index labels of the movies that still have no IMDb title after exact matching
still_unmatched = movie_df['imdb_title'].isna()
unmatched_index = movie_df.loc[still_unmatched].index

In [201]:
# Fuzzy matching: for every Mubi movie without an exact match, score its title
# against the IMDb titles released in the same year and keep the candidate with
# the highest case-insensitive Levenshtein ratio. When the release year has no
# IMDb entries at all (including the 0 used for a missing Mubi release year),
# search the whole IMDb table instead.

# Strings are about to be written into these columns, so make sure they are not
# float64 (which they still are if no exact match was ever recorded).
movie_after_perfect_match = movie_after_perfect_match.astype(
    {'imdb_title': object, 'genres': object}
)

# search-pool key -> lower-cased, non-missing IMDb titles of that pool.
# Lower-casing once per pool avoids redoing it for every Mubi title.
lowered_titles = {}

for idx in tqdm(unmatched_index):
    mv = movie_after_perfect_match.loc[idx, 'mubi_title']
    if not isinstance(mv, str):
        continue  # missing title: nothing to compare
    tmp_year = movie_after_perfect_match.loc[idx, 'mubi_release_year']

    # Choose the candidate pool by membership rather than by a hard-coded list
    # of years, so any year absent from IMDb falls back instead of raising
    # KeyError. The fallback reads imdb_movie directly; no per-row copy needed.
    if tmp_year in movie_df_dict:
        pool_key, tmp_df = tmp_year, movie_df_dict[tmp_year]
    else:
        pool_key, tmp_df = 'all', imdb_movie

    if pool_key not in lowered_titles:
        lowered_titles[pool_key] = (
            tmp_df['primaryTitle'].dropna().astype(str).str.lower()
        )
    candidates = lowered_titles[pool_key]
    if candidates.empty:
        continue  # no usable IMDb titles in this pool

    target = mv.lower()
    k = candidates.apply(lambda x: Levenshtein.ratio(x, target))
    tmp_max_score = k.max()
    tmp_idx = k.idxmax()  # label of the first best-scoring IMDb row
    movie_after_perfect_match.loc[idx, 'title_similarity'] = tmp_max_score
    movie_after_perfect_match.loc[idx, 'imdb_title'] = tmp_df.loc[tmp_idx, 'primaryTitle']
    movie_after_perfect_match.loc[idx, 'genres'] = tmp_df.loc[tmp_idx, 'genres']

100%|██████████| 67115/67115 [04:02<00:00, 277.22it/s]   


In [207]:
# Snapshot the combined result: movie_after_perfect_match has been updated in
# place by the fuzzy-matching loop, so it now holds both kinds of match.
movie_after_all_match = movie_after_perfect_match.copy()

In [223]:
# Write the final Mubi-to-IMDb genre table to disk.
movie_after_all_match.to_csv('movie_genres_after_all_match.csv')