# Content Filtering based on tags
- This notebooks takes in a movie lense dataset and creates a tag or large text corpus which is a combination of movie title,genre,cast,crews,summary of the plot for the movie. 
- The tag is then processed through natural language pre processing techniques to identify 5,000 most common occuring words
- We will create movie vector by identifying number of times each of word occurs in the movie tag. 
- We will use this matrix to compute the similarity between the movies using cosine similarity
- Finally we will test the recommendations


In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
# Input file path
input_cleaned_file_path = '/Users/gauridhumal/Development Projects/UOL-PROJECTs/CRS/crs_ds/data/processed/movieLense'

In [3]:
# Utility function to read the files
def read_file(file_name):
    return pd.read_csv(open(f"{input_cleaned_file_path}/{file_name}", 'r'))

## Read Movie File

In [4]:
df_movie = read_file('movies_combined.csv')

In [5]:
df_movie.head(2)

Unnamed: 0,ml_id,title,release_year,imdb_id,unknown,Action,Adventure,Animation,Children's,Comedy,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,plot_summary,crew,genre_tags
0,1,Toy Story,1995,tt0114709,0,0,0,1,1,1,...,0,0,0,0,0,0,0,Woody is a pull-string cowboy doll and leader ...,"[{""category"":""writer"",""crew_name"":""Joel Cohen""...","Animation, childrens, comedy"
1,2,GoldenEye,1995,tt0097446,0,1,1,0,0,0,...,0,0,0,0,1,0,0,,"[{""category"":""composer"",""crew_name"":""Michael B...","action, adventure, thriller"


In [6]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1663 entries, 0 to 1662
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ml_id         1663 non-null   int64 
 1   title         1663 non-null   object
 2   release_year  1663 non-null   int64 
 3   imdb_id       1663 non-null   object
 4   unknown       1663 non-null   int64 
 5   Action        1663 non-null   int64 
 6   Adventure     1663 non-null   int64 
 7   Animation     1663 non-null   int64 
 8   Children's    1663 non-null   int64 
 9   Comedy        1663 non-null   int64 
 10  Crime         1663 non-null   int64 
 11  Documentary   1663 non-null   int64 
 12  Drama         1663 non-null   int64 
 13  Fantasy       1663 non-null   int64 
 14  Film-Noir     1663 non-null   int64 
 15  Horror        1663 non-null   int64 
 16  Musical       1663 non-null   int64 
 17  Mystery       1663 non-null   int64 
 18  Romance       1663 non-null   int64 
 19  Sci-Fi

In [7]:
df_movie.columns

Index(['ml_id', 'title', 'release_year', 'imdb_id', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'plot_summary', 'crew', 'genre_tags'],
      dtype='object')

## Create Movie Tag or text corpus

In [8]:
def create_combined_genre(x):
    '''
    This function creates a combined Genre tag for the movie.
    '''
    genre_tags=[]
    if x["Action"] == 1:
        genre_tags.append("action")
    if x["Adventure"] == 1:
        genre_tags.append("adventure")
    if x["Animation"] == 1:
        genre_tags.append("animation")
    if x["Children's"] == 1:
        genre_tags.append("childrens")
    if x["Crime"] == 1:
        genre_tags.append("crime")
    if x["Comedy"] == 1:
        genre_tags.append("comedy")
    if x["Documentary"] == 1:
        genre_tags.append("documentary")
    if x["Drama"] == 1:
        genre_tags.append("drama")
    if x["Fantasy"] == 1:
        genre_tags.append("fantasy")
    if x["Film-Noir"] == 1:
        genre_tags.append("film-Noir")
    if x["Horror"] == 1:
        genre_tags.append("horror")
    if x["Musical"] == 1:
        genre_tags.append("musical")
    if x["Mystery"] == 1:
        genre_tags.append("mystery")
    if x["Romance"] == 1:
        genre_tags.append("romance")
    if x["Sci-Fi"] == 1:
        genre_tags.append("sci-fi")
    if x["Thriller"] == 1:
        genre_tags.append("thriller")
    if x["War"] == 1:
        genre_tags.append("war")
    if x["Western"] == 1:
        genre_tags.append("western")
    return genre_tags

In [9]:
# Create a combined genre tag
df_movie["genre_comb_tag"]=df_movie.apply(create_combined_genre,axis=1)

In [10]:
df_movie.head(2)

Unnamed: 0,ml_id,title,release_year,imdb_id,unknown,Action,Adventure,Animation,Children's,Comedy,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,plot_summary,crew,genre_tags,genre_comb_tag
0,1,Toy Story,1995,tt0114709,0,0,0,1,1,1,...,0,0,0,0,0,0,Woody is a pull-string cowboy doll and leader ...,"[{""category"":""writer"",""crew_name"":""Joel Cohen""...","Animation, childrens, comedy","[animation, childrens, comedy]"
1,2,GoldenEye,1995,tt0097446,0,1,1,0,0,0,...,0,0,0,1,0,0,,"[{""category"":""composer"",""crew_name"":""Michael B...","action, adventure, thriller","[action, adventure, thriller]"


In [11]:
# We want to create a dataframe that will have movie id, title and tags columns. Tags will contain a combined column of plot summary, crew,generes.
# This will allow us to search using any keyword present in the tags. For this we will have to do pre- processing to create such data.

In [12]:
# first check if any of the columns are null
df_movie.isnull().sum()

ml_id               0
title               0
release_year        0
imdb_id             0
unknown             0
Action              0
Adventure           0
Animation           0
Children's          0
Comedy              0
Crime               0
Documentary         0
Drama               0
Fantasy             0
Film-Noir           0
Horror              0
Musical             0
Mystery             0
Romance             0
Sci-Fi              0
Thriller            0
War                 0
Western             0
plot_summary      809
crew               15
genre_tags          1
genre_comb_tag      0
dtype: int64

In [13]:
# Drop records which have null value
df_movie.dropna(inplace=True)

In [14]:
df_movie.shape

(854, 27)

In [15]:
# we will extract crews and cast in seperate columns for easy processing
df_movie.loc[0]["crew"]

'[{"category":"writer","crew_name":"Joel Cohen"},{"category":"actor","crew_name":"Tom Hanks"},{"category":"actor","crew_name":"Tim Allen"},{"category":"actor","crew_name":"Don Rickles"},{"category":"actor","crew_name":"Jim Varney"},{"category":"director","crew_name":"John Lasseter"},{"category":"writer","crew_name":"Pete Docter"},{"category":"writer","crew_name":"Andrew Stanton"},{"category":"writer","crew_name":"Joe Ranft"},{"category":"writer","crew_name":"Joss Whedon"}]'

In [16]:
# Abstract syntax library to convert string dictionary to dictionary
import ast

In [17]:
def extract_crew(crew_obj):
    '''Function to extract cast and crew
    '''
    workers=[]
    directors=[]
    producers=[]
    actors=[]
    for item in ast.literal_eval(crew_obj):
        if item["category"]in["actor","actress"]:
            actors.append(item["crew_name"])
        else:
            if item["category"]in["director"]:
                directors.append(item["crew_name"])
            else:
                if item["category"]in["producer"]:
                    producers.append(item["crew_name"])
                else:
                    workers.append(item["crew_name"])
    return actors,directors,producers, workers


In [18]:
# Extract actors,directors,producers, workers
for index,row in df_movie.iterrows():
    actors,directors,producers, workers = extract_crew(row["crew"])
    df_movie.at[index,"actors"] = str(actors)
    df_movie.at[index,"directors"] = str(directors)
    df_movie.at[index,"producers"] = str(producers)
    df_movie.at[index,"workers"] = str(workers)

In [19]:
df_movie.head(3)

Unnamed: 0,ml_id,title,release_year,imdb_id,unknown,Action,Adventure,Animation,Children's,Comedy,...,War,Western,plot_summary,crew,genre_tags,genre_comb_tag,actors,directors,producers,workers
0,1,Toy Story,1995,tt0114709,0,0,0,1,1,1,...,0,0,Woody is a pull-string cowboy doll and leader ...,"[{""category"":""writer"",""crew_name"":""Joel Cohen""...","Animation, childrens, comedy","[animation, childrens, comedy]","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter'],[],"['Joel Cohen', 'Pete Docter', 'Andrew Stanton'..."
2,3,Four Rooms,1995,tt0113101,0,0,0,0,0,0,...,0,0,"The film is set on New Year's Eve, and starts ...","[{""category"":""producer"",""crew_name"":""Lawrence ...",thriller,[thriller],"['Tim Roth', 'Antonio Banderas', 'Sammi Davis'...","['Allison Anders', 'Alexandre Rockwell', 'Robe...",['Lawrence Bender'],[]
3,4,Get Shorty,1995,tt0113161,0,1,0,0,0,1,...,0,0,"Chili Palmer , a loan shark based in Miami, cl...","[{""category"":""composer"",""crew_name"":""John Luri...","action, comedy, drama","[action, comedy, drama]","['Gene Hackman', 'Rene Russo', 'Danny DeVito',...",['Barry Sonnenfeld'],"['Michael Shamberg', 'Stacey Sher']","['John Lurie', 'Elmore Leonard', 'Scott Frank']"


In [20]:
# Covert plot summary to a list
df_movie["plot_summary"]= df_movie["plot_summary"].apply(lambda x: x.split())

In [21]:
df_movie.head(1)

Unnamed: 0,ml_id,title,release_year,imdb_id,unknown,Action,Adventure,Animation,Children's,Comedy,...,War,Western,plot_summary,crew,genre_tags,genre_comb_tag,actors,directors,producers,workers
0,1,Toy Story,1995,tt0114709,0,0,0,1,1,1,...,0,0,"[Woody, is, a, pull-string, cowboy, doll, and,...","[{""category"":""writer"",""crew_name"":""Joel Cohen""...","Animation, childrens, comedy","[animation, childrens, comedy]","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter'],[],"['Joel Cohen', 'Pete Docter', 'Andrew Stanton'..."


In [22]:
# Drop columns which are not required for processing
df_movie.columns

Index(['ml_id', 'title', 'release_year', 'imdb_id', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'plot_summary', 'crew', 'genre_tags', 'genre_comb_tag', 'actors',
       'directors', 'producers', 'workers'],
      dtype='object')

In [23]:

colmns_to_drop=['unknown', 'Action','Adventure', 'Animation', "Children's", 'Comedy', 'Crime','Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western','crew',]

In [24]:
df_movie.drop(colmns_to_drop,axis=1,inplace=True)

In [25]:
df_movie.head(2)

Unnamed: 0,ml_id,title,release_year,imdb_id,plot_summary,genre_tags,genre_comb_tag,actors,directors,producers,workers
0,1,Toy Story,1995,tt0114709,"[Woody, is, a, pull-string, cowboy, doll, and,...","Animation, childrens, comedy","[animation, childrens, comedy]","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter'],[],"['Joel Cohen', 'Pete Docter', 'Andrew Stanton'..."
2,3,Four Rooms,1995,tt0113101,"[The, film, is, set, on, New, Year's, Eve,, an...",thriller,[thriller],"['Tim Roth', 'Antonio Banderas', 'Sammi Davis'...","['Allison Anders', 'Alexandre Rockwell', 'Robe...",['Lawrence Bender'],[]


In [26]:
# We will now join the first name and last name for actors, workers,directors and producers to create one single tag

In [27]:
literal_eval(df_movie["workers"][4])

['Christopher Young', 'Ann Biderman', 'David Madsen']

In [28]:
import re

def remove_special_characters(text):
    '''Function to clean up any given text by removing non alphanumeric characters with spaces'''
    # Define a pattern to keep only alphanumeric characters
    pattern = re.compile(r'[^a-zA-Z0-9\s]')

    # Use the pattern to replace non-alphanumeric characters with an empty string
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

In [29]:
def remove_space(x):
    '''Function to remove space between words'''
    y=literal_eval(x)
    # print (y)
    s=[]
    for i in y:
        # print (i)
        s.append(i.replace(" ","").lower())
    return s


In [30]:
# This will remove space between first name and last name so that a person can be identified
df_movie["workers"]=df_movie["workers"].apply(remove_space)

In [31]:
# This will remove space between first name and last name so that a person can be identified
df_movie["actors"]=df_movie["actors"].apply(remove_space)

In [32]:
# This will remove space between first name and last name so that a person can be identified
df_movie["directors"]=df_movie["directors"].apply(remove_space)

In [33]:
# This will remove space between first name and last name so that a person can be identified
df_movie["producers"]=df_movie["producers"].apply(remove_space)

In [34]:
df_movie['cleaned_title'] = df_movie['title'].apply(lambda x:x.lower()).apply(remove_special_characters).str.replace(" ","")

In [35]:
df_movie.head(2)

Unnamed: 0,ml_id,title,release_year,imdb_id,plot_summary,genre_tags,genre_comb_tag,actors,directors,producers,workers,cleaned_title
0,1,Toy Story,1995,tt0114709,"[Woody, is, a, pull-string, cowboy, doll, and,...","Animation, childrens, comedy","[animation, childrens, comedy]","[tomhanks, timallen, donrickles, jimvarney]",[johnlasseter],[],"[joelcohen, petedocter, andrewstanton, joeranf...",toystory
2,3,Four Rooms,1995,tt0113101,"[The, film, is, set, on, New, Year's, Eve,, an...",thriller,[thriller],"[timroth, antoniobanderas, sammidavis, amandad...","[allisonanders, alexandrerockwell, robertrodri...",[lawrencebender],[],fourrooms


In [36]:
# create tags column by combining all columns on which search should be allowed
df_movie['tags'] = df_movie["plot_summary"]+df_movie["genre_comb_tag"]+df_movie["actors"]+df_movie["workers"]+df_movie["directors"]+df_movie["producers"]

In [37]:
df_movie.head(2)

Unnamed: 0,ml_id,title,release_year,imdb_id,plot_summary,genre_tags,genre_comb_tag,actors,directors,producers,workers,cleaned_title,tags
0,1,Toy Story,1995,tt0114709,"[Woody, is, a, pull-string, cowboy, doll, and,...","Animation, childrens, comedy","[animation, childrens, comedy]","[tomhanks, timallen, donrickles, jimvarney]",[johnlasseter],[],"[joelcohen, petedocter, andrewstanton, joeranf...",toystory,"[Woody, is, a, pull-string, cowboy, doll, and,..."
2,3,Four Rooms,1995,tt0113101,"[The, film, is, set, on, New, Year's, Eve,, an...",thriller,[thriller],"[timroth, antoniobanderas, sammidavis, amandad...","[allisonanders, alexandrerockwell, robertrodri...",[lawrencebender],[],fourrooms,"[The, film, is, set, on, New, Year's, Eve,, an..."


In [38]:
# Storing required columns in a new DF for NL processing
new_df_movie = df_movie[["ml_id","title","imdb_id","tags"]]

In [39]:
new_df_movie.iloc[0]

ml_id                                                      1
title                                             Toy Story 
imdb_id                                            tt0114709
tags       [Woody, is, a, pull-string, cowboy, doll, and,...
Name: 0, dtype: object

## Natural Language Preprocessing
We will use some of the following techniques on the tag
- Tokenization
    - Breaking down text into smaller units, such as words or sentences (tokens).
    - Facilitates analysis by converting text into a format that can be processed more easily.
- Stopword Removal
    - Removing common words (e.g., "the," "is") that do not contribute much to the meaning of a text.
    - Reduces noise in the data and focuses on more meaningful words.
- Stemming and Lemmatization
    - Reducing words to their root or base form (e.g., "running" to "run").
    - Normalizes words to their base form, reducing variations

In [40]:
# reset index
new_df_movie.reset_index(drop=True,inplace=True)

In [41]:
# convert list back to string
new_df_movie['tags']= new_df_movie['tags'].apply(lambda x: " ".join(x))

In [42]:
# convert to lower case
new_df_movie['tags'] = new_df_movie['tags'].apply(lambda x: x.lower())

In [43]:
new_df_movie.iloc[0]

ml_id                                                      1
title                                             Toy Story 
imdb_id                                            tt0114709
tags       woody is a pull-string cowboy doll and leader ...
Name: 0, dtype: object

### Create vectors using Bag of Words technique
- To find similar movies, the tags of various movies need to be compared to find similarities i.e calculate the similarity score.
- We will convert this tags text to  vectors and then calulate the distance between the two vectors using cosine distance.
- To recommend similar movies for a given movie, we will find the vector distances between the movie and other remaining movies and recommend closest top 5 movies.
- There are many techniques create word vectors like - Bag of words, ifidf, word2vec
- Here we will use "Bag of word" technique
    - concatenate all tags of all movies - to create a large text corpus.
    - drop all stop words
    - select most common 5000 (can be any arbitrary number) words in this large corpus
    - compare these 5000 most common words with each movie tag and count how many times each word appears in this movie tag and store the information
    - above step will be repeated for all the movies
    - this will create an array of 854 movies with 5000 most popular word in the text - shape of the array [854,5000]

    | movie| W1 | W2 | W3 | ... | W5000 |
    | -----| ---| ---| ---| --- | ------|
    | m1   | 4  | 0  | 9  | ... |  8    |
    | m2   | 3  | 8  | 9  | ... |  8    |
    | m3   | 5  | 0  | 12 | ... |  8    |
    | ...  | ...| ...| ...| ... |  ...  |
    | m854 | 6  | 0  | 12 | ... |  8    |

   - Each row is called text vector for the movie - movie converted to 5000 vector space

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [45]:
vectors = cv.fit_transform(new_df_movie['tags']).toarray()

In [46]:
vectors.shape

(854, 5000)

In [47]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
# Most popular words found in the large corpus are
list(cv.get_feature_names_out())

['000',
 '10',
 '100',
 '1000',
 '11',
 '12',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1950s',
 '1960s',
 '1963',
 '1969',
 '1994',
 '20',
 '200',
 '20th',
 '22',
 '24',
 '25',
 '30',
 '40',
 '50',
 '500',
 '747',
 'aaron',
 'abandon',
 'abandoned',
 'abandons',
 'abbey',
 'abbott',
 'abby',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abortion',
 'abruptly',
 'absence',
 'abuse',
 'abused',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accident',
 'accidentally',
 'accompanied',
 'accompanies',
 'accompany',
 'account',
 'accused',
 'accuses',
 'ace',
 'act',
 'acting',
 'action',
 'actions',
 'activates',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adams',
 'addict',
 'addiction',
 'adding',
 'addison',
 'addition',
 'address',
 'addresses',
 'adele',
 'administration',
 'admiral',
 'admit',
 'admits',
 'adopt',
 'adopted',
 'adoption',
 'adult',
 'adults',
 'advance',
 'advanc

### Stemming

In [49]:
'''
In this output we see there are many numbers, there are duplicates like
 'adopt',
 'adopted',
 'adoption',
 'adult',
 'adults',
so we will need to go back to root word in the tags so that different forms of same words are not selected.
This is called stemming
'''

"\nIn this output we see there are many numbers, there are duplicates like\n 'adopt',\n 'adopted',\n 'adoption',\n 'adult',\n 'adults',\nso we will need to go back to root word in the tags so that different forms of same words are not selected.\nThis is called stemming\n"

In [50]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [51]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))

    return  " ".join(y) # covert back to string

In [52]:
stem(new_df_movie.loc[0]['tags'])

"woodi is a pull-str cowboy doll and leader of a group of toy that belong to a boy name andi davis, which act lifeless when human are present. with hi famili move home one week befor hi birthday, the toy stage a reconnaiss mission to discov andy' new presents. andi receiv a space ranger buzz lightyear action figure, whose impress featur see him replac woodi as andy' favorit toy. woodi is resentful, especi as buzz also get attent from the other toys. howev buzz believ himself to be a real space ranger on a mission to return to hi home planet, as woodi fail to convinc him he is a toy. andi prepar for a famili outing at the space theme pizza planet restaur with buzz. woodi attempt to be pick by misplac buzz. he intend to trap buzz in a gap behind andy' desk, but the plan goe disastr wrong when he accident knock buzz out the window, result in him be accus of murder buzz out of jealousy. with buzz missing, andi take woodi to pizza planet, but buzz climb into the car and confront woodi when 

In [53]:
new_df_movie['tags'] = new_df_movie['tags'].apply(stem)

In [54]:
# Recalculate vectors
vectors = cv.fit_transform(new_df_movie['tags']).toarray()

In [55]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [6, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [56]:
vectors.shape

(854, 5000)

In [57]:
# most popular words found in the large corpus are
list(cv.get_feature_names_out())

['000',
 '10',
 '100',
 '1000',
 '11',
 '12',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '1950',
 '1960',
 '1963',
 '1969',
 '1970',
 '1980',
 '1984',
 '1988',
 '1994',
 '20',
 '200',
 '20th',
 '21',
 '22',
 '24',
 '25',
 '30',
 '40',
 '400',
 '50',
 '500',
 '60',
 '747',
 'aaron',
 'abandon',
 'abbey',
 'abbi',
 'abbott',
 'abduct',
 'abe',
 'abil',
 'abl',
 'aboard',
 'abort',
 'abortion',
 'abov',
 'abra',
 'abruptli',
 'absenc',
 'absence',
 'abus',
 'abuse',
 'academ',
 'academi',
 'accept',
 'accepts',
 'access',
 'accid',
 'accident',
 'accompani',
 'accomplic',
 'accomplish',
 'accord',
 'account',
 'accus',
 'ace',
 'achiev',
 'acid',
 'ackerman',
 'acknowledg',
 'acquaint',
 'acquir',
 'acquit',
 'act',
 'action',
 'actions',
 'activ',
 'activist',
 'activities',
 'actor',
 'actors',
 'actress',
 'actual',
 'ad',
 'adam',
 'adams',
 'adapt',
 'add',
 'addict',
 'addison',
 'addit',
 'address',
 'adjust',
 'administ',
 'administr',
 'admir',
 'admit',
 'adopt',
 'adrian',
 '

In [58]:
# Now the words dont repeat so much. We will calculate the cosine distance between two vectors.

## Calculate Cosine Distance

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
# calculate distance of each movie with respect to other movies [845x854]
similarity = cosine_similarity(vectors)

In [61]:
similarity.shape

(854, 854)

In [62]:
similarity[5]
# this matrix has diagonal values as 1

array([0.04016313, 0.05832236, 0.04378524, 0.04298136, 0.04592829,
       1.        , 0.05835849, 0.06952796, 0.03387509, 0.08778869,
       0.03860067, 0.08153423, 0.04140074, 0.09071644, 0.06492642,
       0.03365156, 0.05358538, 0.02715946, 0.03378581, 0.05931487,
       0.05955169, 0.36808662, 0.06238368, 0.06265743, 0.03878099,
       0.03919902, 0.03208816, 0.04541511, 0.02020074, 0.05954512,
       0.04373714, 0.03334069, 0.05094014, 0.03865988, 0.05798641,
       0.08544454, 0.04240536, 0.03369329, 0.05689737, 0.02792694,
       0.05932811, 0.0877167 , 0.0379023 , 0.0510169 , 0.05974137,
       0.03335178, 0.07899429, 0.02000764, 0.07012434, 0.03240165,
       0.05253839, 0.03461793, 0.07422857, 0.04849002, 0.03755383,
       0.04352769, 0.03608415, 0.06532306, 0.04929845, 0.05515665,
       0.04672033, 0.03355161, 0.03794763, 0.03174932, 0.03183965,
       0.05053956, 0.03886897, 0.05208494, 0.0091158 , 0.03597341,
       0.0454581 , 0.06262696, 0.04844908, 0.06660486, 0.03889

In [63]:
# now that we have distance of each movie with respect to other movie, we can create recommendations
# when we will sort the distances, we will loose the position. Hence lets use enumerate function that adds the index position along with the values

In [64]:
sorted(list(enumerate(similarity[6])),reverse=True,key=lambda x:x[1])[1:6]

[(173, 0.5267746422829293),
 (842, 0.5165186529211098),
 (745, 0.5090689120963562),
 (395, 0.5033575589321491),
 (798, 0.4940483476743795)]

In [65]:
new_df_movie['cleaned_title'] = new_df_movie['title'].apply(lambda x:x.lower()).apply(remove_special_characters).str.replace(" ","")

In [66]:
def recommend_based_on_title(search_string):
    '''Function to recommend movies based on title '''
    # Clean the input movie title
    print(search_string)

    cleaned_search_string = remove_special_characters(search_string.lower().strip()).replace(" ","")
    # Search the string in the new dataframe in the tag
    searched_movies = new_df_movie[new_df_movie['tags'].str.contains(cleaned_search_string)]
    # print(find_movie)
    if searched_movies.empty:
        print("String not found in Tag - " + cleaned_search_string)
        searched_movies = new_df_movie[new_df_movie['cleaned_title'].str.contains(cleaned_search_string)]
        if searched_movies.empty:
            movie_list=[]
        else:
        # Get the movie index
            movie_index = searched_movies.index[0]
            print(movie_index)
        # Get the cosine distance of this movie with respect to other movies from similarity matrix computed above
            distances = similarity[movie_index]
        # Sort the distances in ascending order and select first 5 movies closet to the movies
            movie_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
            return movie_list,searched_movies
    else:
        # Get the movie index
        movie_index = searched_movies.index[0]
        print(movie_index)
        # Get the cosine distance of this movie with respect to other movies from similarity matrix computed above
        distances = similarity[movie_index]
        # Sort the distances in ascending order and select first 5 movies closet to the movies
        movie_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
        return movie_list,searched_movies

In [67]:
new_df_movie.head(2)

Unnamed: 0,ml_id,title,imdb_id,tags,cleaned_title
0,1,Toy Story,tt0114709,woodi is a pull-str cowboy doll and leader of ...,toystory
1,3,Four Rooms,tt0113101,"the film is set on new year' eve, and start wi...",fourrooms


In [68]:
new_df_movie[new_df_movie['cleaned_title'].str.contains("toystory")]

Unnamed: 0,ml_id,title,imdb_id,tags,cleaned_title
0,1,Toy Story,tt0114709,woodi is a pull-str cowboy doll and leader of ...,toystory


In [69]:
new_df_movie.shape

(854, 5)

In [70]:
new_df_movie.iloc[51]

ml_id                                                           82
title                                               Jurassic Park 
imdb_id                                                  tt0107290
tags             billionair john hammond, ceo of ingen, ha crea...
cleaned_title                                         jurassicpark
Name: 51, dtype: object

## Testing of content similarity

In [71]:

movie_list,find_movie = recommend_based_on_title('toy story')
if movie_list == []:
    print("No movies found...")
else:
    print("Movies found with the string")
    for i in find_movie['title']:
        print(i)
    print("Recommendations based on first movie in the list")
    for i in movie_list:
        print(new_df_movie.iloc[i[0]].title + new_df_movie.iloc[i[0]].imdb_id)

toy story
String not found in Tag - toystory
0
Movies found with the string
Toy Story 
Recommendations based on first movie in the list
Shaggy Dog, The tt0053271
Rebel Without a Cause tt0048545
Jingle All the Way tt0116705
Father of the Bride Part II tt0113041
Crooklyn tt0109504


In [72]:
df_movie[df_movie["imdb_id"].isin(["tt0053271","tt0048545","tt0116705","tt0113041","tt0109504"])]

Unnamed: 0,ml_id,title,release_year,imdb_id,plot_summary,genre_tags,genre_comb_tag,actors,directors,producers,workers,cleaned_title,tags
500,506,Rebel Without a Cause,1955,tt0048545,"[Shortly, after, moving, to, Los, Angeles, wit...",drama,[drama],"[jamesdean, nataliewood, salmineo, jimbackus]",[nicholasray],[davidweisbart],"[ernesthaller, stewartstern, irvingshulman, le...",rebelwithoutacause,"[Shortly, after, moving, to, Los, Angeles, wit..."
748,756,Father of the Bride Part II,1995,tt0113041,"[George, Banks, must, accept, the, reality, of...",comedy,[comedy],"[stevemartin, dianekeaton, martinshort, kimber...",[charlesshyer],[],"[elliotdavis, alberthackett, francesgoodrich, ...",fatherofthebridepartii,"[George, Banks, must, accept, the, reality, of..."
785,793,Crooklyn,1994,tt0109504,"[The, movie, opens, with, scenes, of, a, racia...",comedy,[comedy],"[alfrewoodard, delroylindo, davidpatrickkelly,...",[spikelee],[],"[barryalexanderbrown, joielee, cinquã©lee, ter...",crooklyn,"[The, movie, opens, with, scenes, of, a, racia..."
835,843,"Shaggy Dog, The",1959,tt0053271,"[Wilby, Daniels, is, constantly, misunderstood...","childrens, comedy","[childrens, comedy]","[fredmacmurray, jeanhagen, tommykirk, annettef...",[charlesbarton],[],"[edwardcolman, billwalsh, lilliehayward, felix...",shaggydogthe,"[Wilby, Daniels, is, constantly, misunderstood..."
854,862,Jingle All the Way,1996,tt0116705,"[Howard, Langston, is, a, workaholic, mattress...","adventure, childrens, comedy","[adventure, childrens, comedy]","[arnoldschwarzenegger, sinbad, philhartman, ri...",[brianlevant],"[michaelbarnathan, chriscolumbus, markradcliffe]","[davidnewman, randykornfield]",jinglealltheway,"[Howard, Langston, is, a, workaholic, mattress..."


In [73]:
# Save the model as pickle file to access in the web app

In [74]:
import pickle

In [76]:
pickle.dump(new_df_movie,open('/Users/gauridhumal/Development Projects/UOL-PROJECTs/CRS/crs_ds/models/content_filtering/movies.pkl','wb'))

In [77]:
pickle.dump(similarity,open('/Users/gauridhumal/Development Projects/UOL-PROJECTs/CRS/crs_ds/models/content_filtering/cosine_similarity.pkl','wb'))

In [78]:
new_df_movie

Unnamed: 0,ml_id,title,imdb_id,tags,cleaned_title
0,1,Toy Story,tt0114709,woodi is a pull-str cowboy doll and leader of ...,toystory
1,3,Four Rooms,tt0113101,"the film is set on new year' eve, and start wi...",fourrooms
2,4,Get Shorty,tt0113161,"chili palmer , a loan shark base in miami, cla...",getshorty
3,5,Copycat,tt0112722,after give a guest lectur on crimin psycholog ...,copycat
4,6,Shanghai Triad,tt0115012,shanghai triad take place over the cours of se...,shanghaitriad
...,...,...,...,...,...
849,1664,8 Heads in a Duffel Bag,tt0118541,"tommi spinelli , is a wiseguy hire by a pair o...",8headsinaduffelbag
850,1668,Wedding Bell Blues,tt0118127,jasmin is a young woman with men in and out of...,weddingbellblues
851,1672,Kika,tt0107315,"kika , a young, bubbl aspir actress turn cosme...",kika
852,1675,"Sunchaser, The",tt0117781,blue monro is a 16-year-old juvenil offend who...,sunchaserthe
