In [1]:
# Import libraries and datasets
import pandas as pd
import numpy as np

film_rating = pd.read_csv('movie_rating_df.csv')
dir_wri = pd.read_csv('directors_writers.csv')
actor = pd.read_csv('actor_name.csv')

# film_rating dataset preview and information about the dataset
print('film_rating dataset preview\n')
print(film_rating.head(),'\n')
print('film_rating dataset info\n')
print(film_rating.info(),'\n')

# director and writer name dataset preview and information
print('dir_wri dataset preview\n')
print(dir_wri.head(),'\n')
print('dir_wri dataset info\n')
print(dir_wri.info(),'\n')

# actor and actress names dataset preview and information
print('actor dataset preview\n')
print(actor.head(),'\n')
print('actor dataset info\n')
print(actor.info(),'\n')

film_rating dataset preview

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

   isAdult  startYear  endYear  runtimeMinutes                    genres  \
0        0     1894.0      NaN             1.0         Documentary,Short   
1        0     1892.0      NaN             5.0           Animation,Short   
2        0     1892.0      NaN             4.0  Animation,Comedy,Romance   
3        0     1892.0      NaN            12.0           Animation,Short   
4        0     1893.0      NaN             1.0              Comedy,Short   

   averageRating  numVotes  
0            5.6      1608  
1          

In [2]:
# Splitting director and writer names
for a in ['director_name', 'writer_name']:
    dir_wri[a] = dir_wri[a].apply(lambda x: x.split(','))
print(dir_wri.head())

      tconst                      director_name  \
0  tt0011414                   [David Kirkland]   
1  tt0011890                [Roy William Neill]   
2  tt0014341  [Buster Keaton, John G. Blystone]   
3  tt0018054                 [Cecil B. DeMille]   
4  tt0024151                      [James Cruze]   

                                         writer_name  
0                         [John Emerson, Anita Loos]  
1   [Arthur F. Goodrich, Burns Mantle, Mary Murillo]  
2  [Jean C. Havez, Clyde Bruckman, Joseph A. Mitc...  
3                                [Jeanie Macpherson]  
4               [Max Miller, Wells Root, Jack Jevne]  


In [3]:
# Rearrange actor dataset, each data point consists of 1 movie title (knownForTitle column)
actor['knownForTitles'] = actor['knownForTitles'].apply(lambda x: x.split(','))
for b in ['knownForTitles']:
    idx = actor.index.repeat(actor['knownForTitles'].str.len())
    repeated_index = pd.DataFrame({b:np.concatenate(actor[b].values)}, index = idx)
unnested = repeated_index.join(actor.drop(columns = 'knownForTitles', axis = 1), how = 'left')
unnested = unnested[actor.columns.to_list()]
print(unnested.head(),'\n')

       nconst        primaryName birthYear deathYear  \
0   nm1774132  Nathan McLaughlin      1973        \N   
0   nm1774132  Nathan McLaughlin      1973        \N   
0   nm1774132  Nathan McLaughlin      1973        \N   
0   nm1774132  Nathan McLaughlin      1973        \N   
1  nm10683464      Bridge Andrew        \N        \N   

                    primaryProfession knownForTitles  
0  special_effects,make_up_department      tt0417686  
0  special_effects,make_up_department      tt1713976  
0  special_effects,make_up_department      tt1891860  
0  special_effects,make_up_department      tt0454839  
1                               actor      tt7718088   



In [4]:
# Merge all datasets into single dataframe as base of movie recommender system
unnested_drop = unnested.drop(columns = 'nconst', axis = 1)
for c in ['primaryName']:
    df_ud = unnested_drop.groupby('knownForTitles')[c].apply(list).reset_index()
df_ud = df_ud.rename(columns = {'primaryName':'cast_name'})
base_df = pd.merge(left = df_ud, right = film_rating, left_on = 'knownForTitles', right_on = 'tconst', how = 'inner')
base_df = pd.merge(left = base_df, right = dir_wri, left_on = 'tconst', right_on = 'tconst', how = 'left')
print(base_df.head())

  knownForTitles           cast_name     tconst titleType  \
0      tt0011414  [Natalie Talmadge]  tt0011414     movie   
1      tt0011890  [Natalie Talmadge]  tt0011890     movie   
2      tt0014341  [Natalie Talmadge]  tt0014341     movie   
3      tt0018054     [Reeka Roberts]  tt0018054     movie   
4      tt0024151     [James Hackett]  tt0024151     movie   

             primaryTitle           originalTitle  isAdult  startYear  \
0         The Love Expert         The Love Expert        0     1920.0   
1               Yes or No               Yes or No        0     1920.0   
2         Our Hospitality         Our Hospitality        0     1923.0   
3       The King of Kings       The King of Kings        0     1927.0   
4  I Cover the Waterfront  I Cover the Waterfront        0     1933.0   

   endYear  runtimeMinutes                   genres  averageRating  numVotes  \
0      NaN            60.0           Comedy,Romance            4.9       136   
1      NaN            72.0        

In [8]:
base_df_drop = base_df.drop(columns = 'tconst', axis = 1)

# Missing value checking
print(base_df_drop.isna().sum(),'\n')

# Handling missing value
base_df_drop['genres'] = base_df_drop['genres'].fillna('unknown')
base_df_drop[['director_name','writer_name']] = base_df_drop[['director_name','writer_name']].fillna('unknown')
print(base_df_drop.isna().sum(),'\n')

# Preview base_df_drop dataset
print(base_df_drop.head(),'\n')

# Splitting genres into list
base_df_drop['genres'] = base_df_drop['genres'].apply(lambda x: x.split(','))

# Drop knownForTitles, endYear, isAdult, originalTitle. Rename some of remaining columns
base_df_drop_2 = base_df_drop.drop(columns = ['knownForTitles','endYear','isAdult','originalTitle'], axis = 1)
base_df_drop_2 = base_df_drop_2.rename(columns = {'titleType':'type', 'primaryTitle':'title',
                                                 'startYear':'year', 'runtimeMinutes':'duration',
                                                 'averageRating':'rating', 'numVotes':'votes',})

print('base_df_drop dataframe preview after columns drop and renaming columns\n')
print(base_df_drop_2.head())

knownForTitles      0
cast_name           0
titleType           0
primaryTitle        0
originalTitle       0
isAdult             0
startYear           0
endYear           950
runtimeMinutes      0
genres            315
averageRating       0
numVotes            0
director_name      74
writer_name        74
dtype: int64 

knownForTitles      0
cast_name           0
titleType           0
primaryTitle        0
originalTitle       0
isAdult             0
startYear           0
endYear           950
runtimeMinutes      0
genres              0
averageRating       0
numVotes            0
director_name       0
writer_name         0
dtype: int64 

  knownForTitles           cast_name titleType            primaryTitle  \
0      tt0011414  [Natalie Talmadge]     movie         The Love Expert   
1      tt0011890  [Natalie Talmadge]     movie               Yes or No   
2      tt0014341  [Natalie Talmadge]     movie         Our Hospitality   
3      tt0018054     [Reeka Roberts]     movie       The K

In [35]:
# Making metadata classification based on movie genre, name of casts, directors name, and writers name
metadata = ['genres','cast_name','director_name','writer_name']
metadata_df = base_df_drop_2[metadata]
metadata_df = pd.concat([base_df_drop_2['title'], metadata_df], axis = 1) 
def meta_soup_material(cols):
    try:
        if isinstance(cols, list):
            return [col.replace(' ','').lower() for col in cols]
        else:
            return [cols.replace(' ','').lower()]
    except:
        print(cols)
for d in metadata:
    metadata_df[d] = metadata_df[d].apply(meta_soup_material)
print(metadata_df.head())

                    title                       genres          cast_name  \
0         The Love Expert            [comedy, romance]  [natalietalmadge]   
1               Yes or No                    [unknown]  [natalietalmadge]   
2         Our Hospitality  [comedy, romance, thriller]  [natalietalmadge]   
3       The King of Kings  [biography, drama, history]     [reekaroberts]   
4  I Cover the Waterfront             [drama, romance]     [jameshackett]   

                    director_name  \
0                 [davidkirkland]   
1               [roywilliamneill]   
2  [busterkeaton, johng.blystone]   
3                [cecilb.demille]   
4                    [jamescruze]   

                                      writer_name  
0                        [johnemerson, anitaloos]  
1    [arthurf.goodrich, burnsmantle, marymurillo]  
2  [jeanc.havez, clydebruckman, josepha.mitchell]  
3                              [jeaniemacpherson]  
4               [maxmiller, wellsroot, jackjevne]  


In [37]:
# Making metadata soup for recommender system
def meta_soup(col_element):
    return (' '.join(col_element['cast_name']) + ' ' +
            ' '.join(col_element['genres']) + ' ' +
            ' '.join(col_element['director_name']) + ' '+
            ' '.join(col_element['writer_name']) + ' ')
metadata_df['soup'] = metadata_df.apply(meta_soup, axis = 1)
print('metadata_df dataframe preview after making metadata soup in it\n')
print(metadata_df.head())

metadata_df dataframe preview after making metadata soup in it

                    title                       genres          cast_name  \
0         The Love Expert            [comedy, romance]  [natalietalmadge]   
1               Yes or No                    [unknown]  [natalietalmadge]   
2         Our Hospitality  [comedy, romance, thriller]  [natalietalmadge]   
3       The King of Kings  [biography, drama, history]     [reekaroberts]   
4  I Cover the Waterfront             [drama, romance]     [jameshackett]   

                    director_name  \
0                 [davidkirkland]   
1               [roywilliamneill]   
2  [busterkeaton, johng.blystone]   
3                [cecilb.demille]   
4                    [jamescruze]   

                                      writer_name  \
0                        [johnemerson, anitaloos]   
1    [arthurf.goodrich, burnsmantle, marymurillo]   
2  [jeanc.havez, clydebruckman, josepha.mitchell]   
3                              [jeanie

In [43]:
# import another scikit library to use count vectorizer function and cosine similarity function on metadata_df dataframe
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vocab_count = CountVectorizer(stop_words = 'english')
vocab_vector = vocab_count.fit_transform(metadata_df['soup'])
print('shape of vocabulary vector consists of {} rows and {} columns\n'.format(vocab_vector.shape[0], vocab_vector.shape[1]))
print('matrix of vocabulary vector\n')
print(vocab_vector,'\n')

# search the similarities in every data point
similarity_matrix = cosine_similarity(vocab_vector, vocab_vector)
print('matrix of movie similarities\n')
print(similarity_matrix)

shape of vocabulary vector consists of 1060 rows and 10026 columns

matrix of vocabulary vector

  (0, 6898)	1
  (0, 1843)	1
  (0, 8261)	1
  (0, 2172)	1
  (0, 4620)	1
  (0, 517)	1
  (1, 6898)	1
  (1, 9570)	1
  (1, 8342)	1
  (1, 665)	1
  (1, 3364)	1
  (1, 1249)	1
  (1, 6274)	1
  (2, 6898)	1
  (2, 1843)	1
  (2, 8261)	1
  (2, 9304)	1
  (2, 1253)	1
  (2, 4633)	1
  (2, 1023)	1
  (2, 4215)	1
  (2, 3623)	1
  (2, 1814)	1
  (2, 4844)	1
  (2, 6789)	1
  :	:
  (1056, 1895)	1
  (1056, 6858)	1
  (1056, 9584)	1
  (1056, 758)	1
  (1056, 5620)	1
  (1056, 282)	1
  (1056, 782)	1
  (1057, 2538)	1
  (1057, 1895)	1
  (1057, 4786)	1
  (1057, 7830)	2
  (1058, 9304)	1
  (1058, 3771)	1
  (1058, 8467)	1
  (1058, 9060)	2
  (1058, 7794)	1
  (1058, 7733)	1
  (1058, 3465)	1
  (1058, 9068)	1
  (1059, 1843)	1
  (1059, 3771)	1
  (1059, 2884)	1
  (1059, 5463)	1
  (1059, 3462)	1
  (1059, 4575)	2 

matrix of movie similarities

[[1.         0.15430335 0.35355339 ... 0.         0.         0.13608276]
 [0.15430335 1.       

In [49]:
# Building the recommender system
indices = pd.Series(metadata_df.index, index = metadata_df['title']).drop_duplicates()
def recommender_system(film_title):
    # Number of index based on specified movie title
    idx = indices[film_title]
    print('Movie with title "{}" is in {}th index\n'.format(film_title, idx))
    
    # cosine similarity array
    score_similar = list(enumerate(similarity_matrix[idx]))
    
    # sort similarity from second highest score to lowest score
    score_similar = sorted(score_similar, key = lambda x: x[1], reverse = True)
    
    # return the index number
    movie_sim_score = score_similar[1:11]
    movie_sim_index = [i[0] for i in movie_sim_score]
    
    # locate the movie according to the specified index number
    back_to_base = base_df.iloc[movie_sim_index]
    print('top 10 movies that is similar to "{}"\n'.format(film_title))
    print(back_to_base)

# Call the recommender system function
recommender_system('The Lion King')

Movie with title "The Lion King" is in 974th index

top 10 movies that is similar to "The Lion King"

     knownForTitles                   cast_name      tconst titleType  \
848       tt3040964  [Cristina Carrión Márquez]   tt3040964     movie   
383       tt0286336          [Francisco Bretas]   tt0286336  tvSeries   
1002      tt7222086          [Hiroki Matsukawa]   tt7222086  tvSeries   
73        tt0075147             [Joaquín Parra]   tt0075147     movie   
232       tt0119051            [Chris Kosloski]   tt0119051     movie   
556      tt10068158          [Hiroki Matsukawa]  tt10068158     movie   
9         tt0028657            [Bernard Loftus]   tt0028657     movie   
191       tt0107875               [Simon Mayal]   tt0107875     movie   
803       tt2356464               [Sina Müller]   tt2356464     movie   
983       tt6270328                   [Jo Boag]   tt6270328  tvSeries   

                                         primaryTitle  \
848                                  