In [17]:
#Liberaries
import pandas as pd
import numpy as np
from surprise import Reader, Dataset
from surprise import SVDpp , Dataset, SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

### Content Based Filtering

In [18]:
#load Movie data
movies_df = pd.read_csv("DataSet\movie Data\movies.csv")
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [19]:
#Clean Genres
def getCleanedGenres(x):
    genres = x["genres"].split("|")
    string =""
    for i in range(len(genres)):
        string= string + " " +genres[i]
    return string

In [20]:
#Clean Genres
movies_df["genres"] = movies_df.apply(getCleanedGenres, axis = 1)
movies_df.sort_values(by = 'movieId', inplace=True)
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action Crime Thriller
6,7,Sabrina (1995),Comedy Romance
7,8,Tom and Huck (1995),Adventure Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action Adventure Thriller


In [21]:
# Prepare TFIDF For 
tfidf = TfidfVectorizer( stop_words = 'english', ngram_range=(1,2))
movies_df["genres"] = movies_df["genres"].fillna("")
tvf_Matrix = tfidf.fit_transform(movies_df["genres"] )
cosine_sim = cosine_similarity(tvf_Matrix, tvf_Matrix)

### Collaborative Filtering

In [22]:
#Load Data
ratings_df = pd.read_csv("DataSet/movie Data/ratings.csv")
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [23]:
# Read data into Surprise dataset
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
train=data.build_full_trainset()

# Train Algorithm for Collaborative filtering
algo = SVD(n_factors=30, 
             n_epochs=10, 
             lr_all=0.005, 
             random_state=40, 
             verbose=True,)
model = algo.fit(train)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


### Hybrid Filtering

In [24]:
# Function for Hybrid Recommendation
def recommendMovie(user , movie):
    
    results=[]   
    
    #get the cosine similarity for the query movie 
    query_index = movies_df[movies_df["movieId"]== movie].index[0]
    contentBasedScores = list(enumerate(cosine_sim[query_index]))
    contentBasedScores=sorted(contentBasedScores,key=lambda x:x[1],reverse=True)
    
    # sorted movie indexes on cosine similarity
    search_ids=[i[0] for i in contentBasedScores]    

    rated_movies=list(ratings_df[ratings_df['userId']==user]['movieId'])
    
    
    #for each movie index except query movie
    for i in range(0,len(search_ids)):
        
        if(query_index != search_ids[i]):
                
                
                #average rating of each movie for content based rating
                
                index = search_ids[i]
                movie_id = movies_df.iloc[index]["movieId"]
                ratings=ratings_df[ratings_df['movieId']== movie_id]['rating'] 
                contentBased_rating= round(np.mean(ratings),4)
                
                
                if(movie_id not in rated_movies):
                    
                        
                    # collaborating rating for each movie
                    colaborativeBased_rating=round(model.predict(user,movie_id).est,4)

                    #hybrid rating of each movie
                    hybrid_rating = round(np.mean(np.array([contentBased_rating,colaborativeBased_rating]), dtype=np.float64),4)


                    results.append((movie_id,hybrid_rating))
    
    # ten top recommendations 
    results=sorted(results,key=lambda x:x[1],reverse=True) 

    query_movie_title = movies_df[movies_df["movieId"]== movie]["title"].iloc[0]
    print(f"Recommendations for user \"{user}\" and movie \"{query_movie_title}\" are follwoing:\n")
    for recommendation in results[0:10]:
        movie_title = movies_df[movies_df['movieId'] == recommendation[0]]["title"].iloc[0]
        print(movie_title)

In [25]:
recommendMovie(1 , 1)

Recommendations for user "1" and movie "Toy Story (1995)" are follwoing:

Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990)
Jonah Who Will Be 25 in the Year 2000 (Jonas qui aura 25 ans en l'an 2000) (1976)
George Carlin: It's Bad for Ya! (2008)
Belle époque (1992)
Sorority House Massacre II (1990)
Trinity and Sartana Are Coming (1972)
Come and See (Idi i smotri) (1985)
Shawshank Redemption, The (1994)
Presto (2008)
12 Chairs (1976)
