# Film recommender system - NLP style


Jo and I can never decide which film to watch together. To help with this, I've made a list of all the films we've watched together and had her rate each one on a simple scale: 0 = bad, 1 = ok, 2 = good. I hope to build a content-based recommender system that we can use to find future films.

The key ingredient is a massive film database that I found on Kaggle:

https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset

Let's take a look at it.

In [1]:
# import modules and data

import pandas as pd
import numpy as np
import re

# Load the IMDb dump and rename its id column to 'Id' straight away, so it
# lines up with the key used for merging later on.

films = pd.read_csv('IMDb movies.csv').rename(columns={'imdb_title_id': 'Id'})

# The similarity step further down runs into memory issues, so only films
# released after 1980 are kept.


def first_year(raw_value):
    """Return the first run of four digits found in ``raw_value`` as an int."""
    # str() first, so the regex works whatever type the raw cell has
    four_digit_runs = re.findall("[0-9]{4}", str(raw_value))
    return int(four_digit_runs[0])


films['year'] = films['year'].apply(first_year)

released_after_1980 = films['year'] > 1980
films = films[released_after_1980]

print(films.shape)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(63432, 22)


In [2]:
# Only the id, the title and the plot description are needed from here on.
# reset_index() renumbers the rows 0..n-1 and keeps the old row labels in a
# new 'index' column.

wanted_columns = ['Id', 'original_title', 'description']
films = films.loc[:, wanted_columns].reset_index()

films.head()

Unnamed: 0,index,Id,original_title,description
0,560,tt0017938,La glace à trois faces,Psychological narrative avantgarde film about ...
1,4334,tt0035423,Kate & Leopold,An English Duke from 1876 is inadvertedly drag...
2,4653,tt0036606,"Another Time, Another Place","Set in 1943 Scotland during World War II, Jani..."
3,13237,tt0062181,Rece do góry,"Censored by the Polish authorities, this movie..."
4,14127,tt0064730,Nihon boryoku-dan: Kumicho,Coming out of jail and hoping for a quiet life...


In [3]:
# count the missing (NaN) values in each column

films.isna().sum()

index                0
Id                   0
original_title       0
description       1759
dtype: int64

In [4]:
# Swap every missing value for an empty string; per the count above only the
# description column has any.

films = films.fillna(value='')

# confirm that nothing is missing any more

films.isna().sum()

index             0
Id                0
original_title    0
description       0
dtype: int64

In [5]:
# import the vectorizer module

from sklearn.feature_extraction.text import TfidfVectorizer

# turn every description into a row of TF-IDF weights (a sparse matrix)

vectorizer = TfidfVectorizer(
    analyzer='word',          # features are built from word tokens
    token_pattern=r'\w{1,}',  # a token is any run of one or more word characters
    strip_accents='unicode',  # fold accented characters to their plain form
    stop_words='english',     # drop common English filler words such as "and"
    ngram_range=(1, 3),       # use single words plus 2- and 3-word phrases
    min_df=3,                 # drop terms that appear in fewer than 3 descriptions
    max_features=None,        # no cap on the number of columns
)

descriptions = films['description']
X = vectorizer.fit_transform(descriptions)

X

<63432x60007 sparse matrix of type '<class 'numpy.float64'>'
	with 1138068 stored elements in Compressed Sparse Row format>

In [6]:
# import the sigmoid (tanh) kernel to use as a similarity rating

from sklearn.metrics.pairwise import sigmoid_kernel

# compute the similarity matrix: the TF-IDF row of every description is
# compared against every other one
# NOTE(review): with one row and one column per film this is presumably a
# dense n_films x n_films array, which would explain the memory issues
# mentioned in the first cell - confirm

sim_mat = sigmoid_kernel(X, X)

In [7]:
# running score per film, all starting at zero (one entry per row of films)

n_films = len(films)
final_series = pd.Series([0 for _ in range(n_films)])

# lookup table to quickly go from an IMDb Id to that film's row label

indicies = pd.Series(data=films.index, index=films['Id'])

indicies.head()  # the labels run 0, 1, 2, ... because films was renumbered with reset_index()

Id
tt0017938    0
tt0035423    1
tt0036606    2
tt0062181    3
tt0064730    4
dtype: int64

In [8]:
def add_film_ratings(Id, multiplier, similarities=None, id_lookup=None):
    """Score every film by how similar it is to one rated film.

    All films are ranked by descending similarity to the film ``Id``
    (rank 0 = most similar) and each rank is scaled by ``multiplier``.
    The caller sums these series and sorts ascending, so a *lower* total
    means a better recommendation: a positive multiplier favours films
    similar to ``Id``, a negative one favours films unlike it, and zero
    leaves the totals unchanged.

    Parameters
    ----------
    Id : label
        Key of the rated film in ``id_lookup`` (an IMDb title id here).
    multiplier : number
        Weight applied to every rank.
    similarities : 2-D array, optional
        Pairwise similarity matrix; defaults to the notebook-level
        ``sim_mat``.
    id_lookup : pandas.Series, optional
        Maps film ids to row numbers of ``similarities``; defaults to the
        notebook-level ``indicies``.

    Returns
    -------
    pandas.Series
        Indexed by film row number (most similar first), holding
        ``multiplier * rank``.

    Raises
    ------
    KeyError
        If ``Id`` is not present in ``id_lookup``.
    """
    if similarities is None:
        similarities = sim_mat
    if id_lookup is None:
        id_lookup = indicies

    # similarity of the rated film to every film in the table
    row = np.asarray(similarities[id_lookup[Id]])

    # A stable sort on the negated scores gives descending order while ties
    # keep their original row order - the same ordering as
    # sorted(..., reverse=True), without a Python-level loop over every film.
    ranked_rows = np.argsort(-row, kind='stable')

    # value = position in the ranking, index = the film's row number
    ranks = pd.Series(np.arange(ranked_rows.size, dtype=np.int64), index=ranked_rows)

    return multiplier * ranks

In [9]:
# Load Jo's rated films and discard any row that has a missing value.

Jo = pd.read_csv('JosFilms.csv').dropna()

Jo.head()

Unnamed: 0,Film,Response,Id
0,Shutter Island,2,tt1130884
3,Green Book,2,tt6966692
4,The Silence of the Lambs,2,tt0102926
5,Paranormal Activity,0,tt1179904
6,Saw,1,tt0387564


In [10]:
# fold every film Jo rated into final_series

# Pair each film's Id with its multiplier: the 0/1/2 responses minus one
# become -1 (bad), 0 (ok) and +1 (good).

Jo_iter = list(zip(Jo['Id'], Jo['Response'] - 1))

# add each rated film's contribution to the running totals

for film_id, multiplier in Jo_iter:
    final_series += add_film_ratings(film_id, multiplier)

In [11]:
# Order the films by accumulated score, smallest first; the cells below read
# their recommendations from the front of this series.

final_series = final_series.sort_values(ascending=True)

In [12]:
# row labels of films in ranked order (smallest accumulated score first)
final_series.index.values

array([  566,  2201,   611, ..., 61516, 61927, 62243], dtype=int64)

In [18]:
# get the top n films

n = 10  # not referenced anywhere else in this cell

# row labels of films, ordered from smallest accumulated score upwards
ranked_labels = final_series.index

# Look up every Id in a single vectorised step. Filling a series of integer
# zeros one string at a time is slow and relies on pandas silently changing
# the dtype on assignment. Going through .values gives a fresh, unnamed
# series numbered 0..n-1 in ranked order.
empty_series = pd.Series(films['Id'].loc[ranked_labels].values)

# show the full record of the five best-ranked films
for label in ranked_labels[:5]:
    print(films.iloc[label, :])

empty_series.head()

index                                                         20262
Id                                                        tt0083564
original_title                                                Annie
description       A young orphan girl's adventures in finding a ...
Name: 566, dtype: object
index                                                         21933
Id                                                        tt0089123
original_title                                              Falfúró
description       Budapest, in the 1980's. Géza and his family l...
Name: 2201, dtype: object
index                                                         20311
Id                                                        tt0083693
original_title                                  Brimstone & Treacle
description       A strange young man has a sinister effect on t...
Name: 611, dtype: object
index                                                         19165
Id                                      

0    tt0083564
1    tt0089123
2    tt0083693
3    tt0079891
4    tt0076694
dtype: object

In [25]:
# drop films that are already in Jo's list

# One vectorised membership test does the job of rebuilding the whole series
# once per rated film.
already_rated = empty_series.isin(Jo['Id'])
empty_series = empty_series[~already_rated]

In [26]:
# write the ranked film Ids to disk
empty_series.to_csv('nlp_films.csv')