<a href="https://colab.research.google.com/github/MaryDongsn/GNG-5125-clustering/blob/master/movie_name_only_Final_project_content_based_recommendation_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:

from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [69]:
###### Use the following code if you use google colab #####
from google.colab import drive 
drive.mount('/content/gdrive')
ratings=pd.read_csv('/content/gdrive/My Drive/gng5125 project/ratings.csv',sep=',', encoding='latin-1', usecols=['userId','movieId','rating','timestamp'])
movies = pd.read_csv('/content/gdrive/My Drive/gng5125 project/movies.csv', sep=',', encoding='latin-1', usecols=['movieId','title','genres'])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [71]:

# # Reading ratings file
# ratings = pd.read_csv('ratings.csv', sep=',', encoding='latin-1', usecols=['userId','movieId','rating','timestamp'])

# # Reading movies file
# movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['movieId','title','genres'])

df_movies = movies 
df_ratings = ratings

In [72]:
#remove the year of the movie
df_movies.insert(2, "title_only",df_movies['title'].str.rsplit(" (").str[0])
print (df_movies['title']) 
df_movies['title_only'] = df_movies['title_only'].str.lower()
#df_movies['title'] = df_movies['title'].str.rsplit(" (").str[0]
print (df_movies['title_only'])


0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object
0                                toy story
1                                  jumanji
2                         grumpier old men
3                        waiting to exhale
4              father of the bride part ii
                       ...                
9737    black butler: book of the atlantic
9738                 no game no life: zero
9739                                 flint
9740          bungo st

In [73]:
# Define a TF-IDF Vectorizer Object.
tfidf_movies_genres = TfidfVectorizer(token_pattern = '[a-zA-Z0-9\-]+')

#Replace NaN with an empty string
df_movies['genres'] = df_movies['genres'].replace(to_replace="(no genres listed)", value="")

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_movies_genres_matrix = tfidf_movies_genres.fit_transform(df_movies['genres'])

# Compute the cosine similarity matrix

cosine_sim_movies = linear_kernel(tfidf_movies_genres_matrix, tfidf_movies_genres_matrix)
# print(cosine_sim_movies)

In [74]:
def get_recommendations_based_on_genres(movie_title, cosine_sim_movies=cosine_sim_movies):
    """
    Calculates top 2 movies to recommend based on given movie titles genres. 
    :param movie_title: title of movie to be taken for base of recommendation
    :param cosine_sim_movies: cosine similarity between movies 
    :return: Titles of movies recommended to user
    """
    # Get the index of the movie that matches the title
    movie_title = movie_title.lower()
    idx_movie = df_movies.loc[df_movies['title_only'].isin([movie_title])]
    idx_movie = idx_movie.index
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores_movies = list(enumerate(cosine_sim_movies[idx_movie][0]))
    
    # Sort the movies based on the similarity scores
    sim_scores_movies = sorted(sim_scores_movies, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies 
    sim_scores_movies = sim_scores_movies[0:10]

    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores_movies ]
   
    
    # Return the top 10 most similar movies
    return df_movies['title'].iloc[movie_indices]
    
  

In [79]:
get_recommendations_based_on_genres("Toy story 2")

0                                        Toy Story (1995)
1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (AstÃ©rix et les Vikin...
Name: title, dtype: object

In [62]:

from sklearn.neighbors import KNeighborsClassifier  
def get_movie_label(movie_id):
    """
    Get the cluster label to which movie belongs by KNN algorithm.  
    :param movie_id: movie id
    :return: genres label to movie belong
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    x= tfidf_movies_genres_matrix
    y = df_movies.iloc[:,-1]
    classifier.fit(x, y)
    y_pred = classifier.predict(tfidf_movies_genres_matrix[movie_id])
    #movie_name = df_movies['title'].iloc[movie_id]

    #print ("+++++++++",type(y_pred))
    #dic = {'movie_name': movie_name, 'predected_genre': y_pred}
    # apply pandas data frame
    #df_predicted_movie = pd.DataFrame(dic)
    #return (df_predicted_movie)
    return y_pred

In [63]:
 get_movie_label(2355)

array(['Adventure|Animation|Children|Comedy|Fantasy'], dtype=object)

In [66]:

def get_recommendation_content_model(userId):
    """
    Calculates top movies to be recommended to user based on movie user has watched.  
    :param userId: userid of user
    :return: Titles of movies recommended to user
    """
    recommended_movie_list = []
    movie_list = []
    df_rating_filtered = df_ratings[df_ratings["userId"]== userId]
    for key, row in df_rating_filtered.iterrows():
        movie_list.append((df_movies["title_only"][row["movieId"]==df_movies["movieId"]]).values) 
    for index, movie in enumerate(movie_list):
        for key, movie_recommended in get_recommendations_based_on_genres(movie[0]).iteritems():
            recommended_movie_list.append(movie_recommended)

    # removing already watched movie from recommended list    
    for movie_title in recommended_movie_list:
        if movie_title in movie_list:
            recommended_movie_list.remove(movie_title)
    
    return set(recommended_movie_list)
get_recommendation_content_model(1)
#print(type(get_recommendation_content_model(1)))

{'Too Late for Tears (1949)',
 'Harry Potter and the Chamber of Secrets (2002)',
 'Agent Cody Banks (2003)',
 'Asterix and the Vikings (AstÃ©rix et les Vikings) (2006)',
 'Americanization of Emily, The (1964)',
 'Django Unchained (2012)',
 'Hell Comes to Frogtown (1988)',
 'Robin Hood (1973)',
 'Princess Bride, The (1987)',
 'Partly Cloudy (2009)',
 'Sleeper (1973)',
 'Ruby Red (2013)',
 'Little Drummer Boy, The (1968)',
 'Texas Rangers (2001)',
 'Ace Ventura: When Nature Calls (1995)',
 'Die Hard: With a Vengeance (1995)',
 'Galaxy Quest (1999)',
 'Batman (1989)',
 'Land and Freedom (Tierra y libertad) (1995)',
 'Sorrow (2015)',
 "Pee-wee's Big Adventure (1985)",
 'Ladyhawke (1985)',
 'Underneath (1995)',
 'Ghost and Mrs. Muir, The (1947)',
 'English Patient, The (1996)',
 'Meet the Robinsons (2007)',
 'Pearl Harbor (2001)',
 'Back to the Future Part II (1989)',
 'Quest for Camelot (1998)',
 'Butch Cassidy and the Sundance Kid (1969)',
 'Man Who Knew Too Little, The (1997)',
 'Teenage

In [68]:
true_count = 0
false_count = 0
def evaluate_content_based_model():
    """
    Evaluate content based model.  
    """
    for key, colums in df_movies.iterrows():
        movies_recommended_by_model = get_recommendations_based_on_genres(colums["title_only"])
        #print(movies_recommended_by_model)
        #print(type(movies_recommended_by_model))
        
        for index, value in movies_recommended_by_model.items():
          
          predicted_genres  = get_movie_label(index)
          
          #print (type(predicted_genres))
          # full_list = (movies_recommended_by_model,predicted_genres)
          # print (full_list)
          #print(predicted_genres)
          for predicted_genre in predicted_genres:
              global true_count, false_count
              if predicted_genre == colums["genres"]:
                true_count = true_count+1
              else:
                print("movie_name: ",value)
                print("Actual Genres: ", colums["genres"])
                
                # print(colums["genres"])
                print("Pridicted as:  ", predicted_genre)

                
                false_count = false_count +1
evaluate_content_based_model()
total = true_count + false_count
print("Hit:"+ str(true_count/total))
print("Fault:" + str(false_count/total))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Actual Genres:  Comedy|Drama|Fantasy|Mystery|Romance
Pridicted as:   Drama|Fantasy|Mystery
movie_name:  Dragonfly (2002)
Actual Genres:  Comedy|Drama|Fantasy|Mystery|Romance
Pridicted as:   Drama|Fantasy|Mystery|Romance
movie_name:  Scoop (2006)
Actual Genres:  Comedy|Drama|Fantasy|Mystery|Romance
Pridicted as:   Drama|Fantasy|Mystery
movie_name:  Fanny and Alexander (Fanny och Alexander) (1982)
Actual Genres:  Comedy|Drama|Fantasy|Mystery|Romance
Pridicted as:   Drama|Fantasy|Mystery
movie_name:  Lady in the Water (2006)
Actual Genres:  Comedy|Drama|Fantasy|Mystery|Romance
Pridicted as:   Drama|Fantasy|Mystery
movie_name:  The Hound of the Baskervilles (1988)
Actual Genres:  Crime|Drama|Horror|Mystery
Pridicted as:   Comedy|Crime|Drama|Horror|Mystery
movie_name:  Opera (1987)
Actual Genres:  Crime|Drama|Horror|Mystery
Pridicted as:   Crime|Horror|Mystery|Thriller
movie_name:  Gozu (GokudÃ´ kyÃ´fu dai-gekijÃ´: Gozu) (2003