In [37]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import warnings
import pickle
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
seed = 0
np.random.seed(seed)

## load movies

In [2]:
df_movies = pd.read_csv("data/ml-latest-small/movies.csv")
df_movies.columns = ["MovieID", "Title", "Genres"]
df_movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## load tags

In [3]:
df_tags = pd.read_csv("data/ml-latest-small/tags.csv")
df_tags.columns = ["UserID", "MovieID", "Tag", "Timestamp"]
df_tags.head()

Unnamed: 0,UserID,MovieID,Tag,Timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Merge data movies and tags

In [4]:
df = df_movies.merge(df_tags, left_on='MovieID', right_on='MovieID')
df = df[["MovieID", "Title", "Genres", "UserID", "Tag"]] 
df.head()

Unnamed: 0,MovieID,Title,Genres,UserID,Tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game


In [5]:
df.shape

(3683, 5)

In [6]:
df = df.drop_duplicates()
df.shape

(3683, 5)

## Recomender System based content
### Preprocess data

In [7]:
df["date"] = df["Title"].str.extract(r"(?:\((\d{4})\))?\s*$")
df["Title"] = df["Title"].str.replace(r"(?:\((\d{4})\))?\s*$", "")
df["Genres"] = df["Genres"].str.replace("|", " ")
df["bag_of_words"] = df["Genres"] +" "+ df["Tag"]
df["bag_of_words"] = df["bag_of_words"].str.lower()
df.head()

Unnamed: 0,MovieID,Title,Genres,UserID,Tag,date,bag_of_words
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,336,pixar,1995,adventure animation children comedy fantasy pixar
1,1,Toy Story,Adventure Animation Children Comedy Fantasy,474,pixar,1995,adventure animation children comedy fantasy pixar
2,1,Toy Story,Adventure Animation Children Comedy Fantasy,567,fun,1995,adventure animation children comedy fantasy fun
3,2,Jumanji,Adventure Children Fantasy,62,fantasy,1995,adventure children fantasy fantasy
4,2,Jumanji,Adventure Children Fantasy,62,magic board game,1995,adventure children fantasy magic board game


In [9]:
mvbg = df[["Title", "bag_of_words"]]
mvbg = mvbg.set_index("Title")
mvbg.head()
mvbg.shape

(3683, 1)

In [11]:
mvbg.drop_duplicates(inplace= True)
mvbg.shape
mvbg.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
Toy Story,adventure animation children comedy fantasy pixar
Toy Story,adventure animation children comedy fantasy fun
Jumanji,adventure children fantasy fantasy
Jumanji,adventure children fantasy magic board game
Jumanji,adventure children fantasy robin williams


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
count = CountVectorizer()
count_matrix = count.fit_transform(mvbg['bag_of_words'])

# list I will use later to match the indexes
idx = pd.Series(mvbg.index)
idx.sample(10)

1014                       All the King's Men 
50                                 To Die For 
409                                 True Lies 
144        Star Wars: Episode IV - A New Hope 
2413                      There Will Be Blood 
2011    Eternal Sunshine of the Spotless Mind 
2950                Guardians of the Galaxy 2 
3043                            Sausage Party 
1847                           Bruce Almighty 
502                                     Fargo 
Name: Title, dtype: object

In [42]:
# applied cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
with open("data/cosine_sim.pkl", "wb") as sim:
    pickle.dump(cosine_sim, sim)
cosine_sim

array([[1.        , 0.83333333, 0.66666667, ..., 0.4330127 , 0.33333333,
        0.33333333],
       [0.83333333, 1.        , 0.66666667, ..., 0.4330127 , 0.33333333,
        0.33333333],
       [0.66666667, 0.66666667, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.4330127 , 0.4330127 , 0.        , ..., 1.        , 0.8660254 ,
        0.8660254 ],
       [0.33333333, 0.33333333, 0.        , ..., 0.8660254 , 1.        ,
        0.83333333],
       [0.33333333, 0.33333333, 0.        , ..., 0.8660254 , 0.83333333,
        1.        ]])

## Recommendations

In [32]:
from fuzzywuzzy import fuzz
from operator import itemgetter

def get_ratio(str1, str2):
    
    ratio = fuzz.token_sort_ratio(str1, str2)
    
    return ratio


def recommendations(title, indices, cosine_sim):
    
    recommended_movies = list()
    
    # get the index of the movie that matches the title
    idx = [(get_ratio(title, val),idx) for idx, val in indices.items()]
    
    # sorted between ratio value
    idx = sorted(idx, key= itemgetter(0), reverse= True)
    
    # get the idx of max ratio
    idx = idx[0][1]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
        
    rec = pd.DataFrame(score_series)
    rec.columns = ["Score"]
    rec["Title"] = ""   
    
    for i in list(score_series.index):
        rec["Title"][i] = indices[i]
        
    rec = rec[["Title", "Score"]]  
    
    rec = rec.drop_duplicates(subset= "Title")
    
    return rec[1:11]

## Test Recommendation

In [44]:
with open('data/cosine_sim.pkl', 'rb') as handle:
    cosine_sim = pickle.load(handle)

rec = recommendations("Paper, The", idx, cosine_sim)
rec.head(10)

Unnamed: 0,Title,Score
1094,Broadcast News,0.866025
50,To Die For,0.866025
1470,Almost Famous,0.816497
151,"Madness of King George, The",0.666667
475,Welcome to the Dollhouse,0.666667
1208,Radio Days,0.666667
388,"Adventures of Priscilla, Queen of the Desert, ...",0.666667
1409,Parenthood,0.666667
434,Mrs. Doubtfire,0.666667
1420,Auntie Mame,0.666667
