In [119]:
import spacy
import pandas as pd
import numpy as np

In [120]:
movie_list = pd.read_csv(r"movies.txt", sep =":", header = None)
movie_list.head()

Unnamed: 0,0,1
0,Movie A,When Hiccup discovers Toothless isn't the only...
1,Movie B,"After the death of Superman, several new peopl..."
2,Movie C,A darkness swirls at the center of a world-ren...
3,Movie D,A humorous take on Sir Arthur Conan Doyle's cl...
4,Movie E,A 16-year-old girl and her extended family are...


In [121]:
movie_list.columns = ["Movie name","Movie description"]
movie_list.head()

Unnamed: 0,Movie name,Movie description
0,Movie A,When Hiccup discovers Toothless isn't the only...
1,Movie B,"After the death of Superman, several new peopl..."
2,Movie C,A darkness swirls at the center of a world-ren...
3,Movie D,A humorous take on Sir Arthur Conan Doyle's cl...
4,Movie E,A 16-year-old girl and her extended family are...


In [122]:
# prepare to make a dataframe of given comparator movie
comparator = ([["Planet Hulk", '''Will he save their world or destroy it? When the Hulk becomes too dangerous for the Earth, the illuminati trick Hulk into a shuttle
and launch him into space to a planet where the Hulk can live
in peace.  Unfortunately, Hulk lands on the planet Sakaar where he
is sold into slavery and trained as a gladiator''']])

In [123]:
#create dataframe of comparator movie
df1 = pd.DataFrame(comparator)
df1.head()

Unnamed: 0,0,1
0,Planet Hulk,Will he save their world or destroy it? When t...


In [124]:
# display new dataframe with corresponding column names
df1.columns = ["Movie name", "Movie description"]
df1.head()

Unnamed: 0,Movie name,Movie description
0,Planet Hulk,Will he save their world or destroy it? When t...


In [125]:
#join the 2 dataframes
available_movies_df = df1.append(movie_list, ignore_index = True )
available_movies_df

  available_movies_df = df1.append(movie_list, ignore_index = True )


Unnamed: 0,Movie name,Movie description
0,Planet Hulk,Will he save their world or destroy it? When t...
1,Movie A,When Hiccup discovers Toothless isn't the only...
2,Movie B,"After the death of Superman, several new peopl..."
3,Movie C,A darkness swirls at the center of a world-ren...
4,Movie D,A humorous take on Sir Arthur Conan Doyle's cl...
5,Movie E,A 16-year-old girl and her extended family are...
6,Movie F,"In the last moments of World War II, a young G..."
7,Movie G,"The world at an end, a dying mother sends her ..."
8,Movie H,A musician helps a young singer and actress fi...
9,Movie I,"Corporate analyst and single mom, Jen, tackles..."


In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [127]:
# pair similarity- represent each movie as a vector pair

vectorizer = TfidfVectorizer(min_df = 3,
                      max_features = None,
                      
                      # remove punctuation marks 
                      strip_accents = "unicode",
                      analyzer = "word",
                      token_pattern = r'\w{1,}',
                      # combine 1-3 different words to form meanings
                      ngram_range = (1,3),
                      # remove unnecessary characters i.e. the
                      stop_words = "english"
                            )

In [128]:
vectorizer_matrix = vectorizer.fit_transform(available_movies_df["Movie description"])
vectorizer_matrix

<11x2 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [129]:
# prepare to find similarity value
from sklearn.metrics.pairwise import sigmoid_kernel

# computing sigmoid kernel
sig = sigmoid_kernel(vectorizer_matrix, vectorizer_matrix)

In [130]:
sig[0]

array([0.90514825, 0.90514825, 0.76159416, 0.87047897, 0.76159416,
       0.76159416, 0.87047897, 0.87047897, 0.76159416, 0.76159416,
       0.76159416])

In [131]:
# reverse mapping of indices and movie names
indices =pd.Series(available_movies.index, index = available_movies_df["Movie name"])
indices

Movie name
Planet Hulk     0
Movie A         1
Movie B         2
Movie C         3
Movie D         4
Movie E         5
Movie F         6
Movie G         7
Movie H         8
Movie I         9
Movie J        10
dtype: int64

In [132]:
def predicted_movie (Movie_name, sig = sig):
    # get index of original movie name
    idx = indices[Movie_name]
    # obtain pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))
    # sort movies
    sig_scores = sorted(sig_scores, key = lambda x: x[1], reverse = True)
    # scores of 10 of the most similar movies
    sig_scores = sig_scores[1:11]
    # movie indices
    movie_indices = [i [0] for i in sig_scores]
    # top 10 most similar movies
    return available_movies_df["Movie name"].iloc[movie_indices]

In [133]:
# test prediction ability based on description given by the movie Planet Hulk
predicted_movie("Planet Hulk")

1     Movie A 
3     Movie C 
6     Movie F 
7     Movie G 
2     Movie B 
4     Movie D 
5     Movie E 
8     Movie H 
9     Movie I 
10    Movie J 
Name: Movie name, dtype: object

Code adapted from Krish Naik (Tutorial 5 - Content based recommendation system) made to suit question (YouTube)