In [1]:
#import all necessary modules and download necessary packages
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def load_data(rows = 150, file = 'mpst_full_data.csv'):
    '''
    A function to load the data. Takes an input for the amount of rows it returns, as well as the file you wish to access.
    '''
    counter = 0
    loaded_data = []
    
    #The csv file is quite large, so to save memory we only load in one row at a time.
    with open(file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            loaded_data.append(row)
            counter +=1
            if counter == rows:
                break
    
    movies = pd.DataFrame.from_dict(loaded_data)
    del loaded_data
    return movies

movies = load_data()
movies.head(5)

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [3]:
#Here I combine the plot_synopsis and tags columns to allow the model to use both simultaneously,
#which will increase predictive power. I then remove unnecessary columns to save space.

movies['tags'] = movies.plot_synopsis + movies.tags.apply(lambda x: ' ' + x)
movies.drop(columns=['split', 'synopsis_source', 'plot_synopsis'], inplace=True)

def remove_punctuation(text):
    '''
    A function to remove all forms of punctuation from a text.
    '''
    
    translation_table = str.maketrans("", "", "!.,?/;:'=\-()")
    clean_text = text.translate(translation_table)
    clean_text = clean_text.replace('"', '').lower()
    clean_text = clean_text.replace("\n", '')
    return clean_text

def clean(text):
    '''
    A function to clean a text input of punctuation and stopwords (common english words of no predictive value).
    The function them lemmatizes the remaining words.
    '''
    
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = remove_punctuation(text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words)
    return text

movies['clean_tags'] = movies.tags.apply(clean)

  translation_table = str.maketrans("", "", "!.,?/;:'=\-()")


In [4]:
tv = TfidfVectorizer(max_features=5000, stop_words='english')

def recommend(num_recommendations=5, movies=movies):
    '''
    A function that returns the number of movie recommendations inputted given the inputted dataset.
    Processes and cleans a user movie preference input, and adds that to the movie descriptions pd.Series.
    Vectorizes the description and computes the similarity matrix. 
    Returns the recommended movies and their respective scores.
    '''
    
    prompt = input('Please type a short description of your movie preferences:\n')
    clean_prompt = clean(prompt)
    descriptions = pd.concat([movies.clean_tags, pd.Series([clean_prompt], index=[len(movies.clean_tags)])])
    index = len(descriptions)-1
    
    vector = tv.fit_transform(descriptions).toarray()
    similarity = cosine_similarity(vector)
    
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector: vector[1])
    recommendeds = []
    scores = []
    for i in distance[1:num_recommendations+1]:
        recommendeds.append(movies.iloc[i[0]].title)
        scores.append(i[1])
        
    return recommendeds, scores

def display_recommendations(num_recommendations=5, movies=movies):
    '''
    A function that displays the output of recommend() in more user friendly manner.
    '''
    
    recommends, scores = recommend(num_recommendations, movies)
    assert num_recommendations >= 1
    
    print()
    for i in range(len(recommends)):
        print('Recommendation ', str(i+1)+ ':', recommends[i]+',', ' '*(30-len(recommends[i])), 'Score:', round(scores[i], 3))

In [5]:
display_recommendations()

Please type a short description of your movie preferences:
I like action movies set in space, with a comedic twist.

Recommendation  1: Guardians of the Galaxy,         Score: 0.043
Recommendation  2: Alien,                           Score: 0.032
Recommendation  3: Invasion of the Bee Girls,       Score: 0.031
Recommendation  4: The Hamiltons,                   Score: 0.029
Recommendation  5: Antares,                         Score: 0.026


In [6]:
display_recommendations(3)

Please type a short description of your movie preferences:
My favorite movie is the Amazing Spiderman.

Recommendation  1: The Amazing Spider-Man,          Score: 0.221
Recommendation  2: 14 Going on 30,                  Score: 0.024
Recommendation  3: Targets,                         Score: 0.021


In [7]:
display_recommendations()

Please type a short description of your movie preferences:
Horror movies with a romantic theme.

Recommendation  1: The Haunted,                     Score: 0.053
Recommendation  2: Targets,                         Score: 0.032
Recommendation  3: I vitelloni,                     Score: 0.03
Recommendation  4: Bowling for Columbine,           Score: 0.022
Recommendation  5: I tre volti della paura,         Score: 0.021


# My salary expectation per month is around $2,000