In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory holding the MovieLens "ml-latest-small" CSV files. Set the
# MOVIELENS_DIR environment variable to run this notebook on another machine;
# when it is unset the original author's local folder is used.
data_dir = os.environ.get('MOVIELENS_DIR', r'/Users/lohitashwa/Desktop/Semester 6/RS/ml-latest-small')
# Movie catalogue; the displayed frame has movieId, title and genres columns.
movies = pd.read_csv(os.path.join(data_dir, 'movies.csv'))
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# Directory holding the MovieLens "ml-latest-small" CSV files. Set the
# MOVIELENS_DIR environment variable to run this notebook on another machine;
# when it is unset the original author's local folder is used.
data_dir = os.environ.get('MOVIELENS_DIR', r'/Users/lohitashwa/Desktop/Semester 6/RS/ml-latest-small')
# Long-format ratings; the displayed frame has userId, movieId, rating and
# timestamp columns (one row per rating event).
ratings = pd.read_csv(os.path.join(data_dir, 'ratings.csv'))
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Preprocessing of the Data

In [4]:
# Reshape the long ratings table into a wide user x movie matrix: one row per
# userId, one column per movieId, each cell holding that user's rating.
# User/movie pairs with no row in `ratings` show up as NaN (see output below).
t_ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
t_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### Cosine Similarity

In [5]:
def similarity(active_user,data):
    """Cosine similarity between ``active_user`` and every row of ``data``.

    Parameters
    ----------
    active_user : label
        Row label in ``data`` identifying the user to compare against.
    data : pandas.DataFrame
        Rating matrix with one row per user. Missing ratings (NaN) are
        replaced by 0 before the similarity is computed.

    Returns
    -------
    pandas.DataFrame
        A single-row frame indexed by ``active_user`` whose columns are the
        row labels of ``data``. The active user's own column is included.
    """
    filled = data.fillna(0)
    # cosine_similarity expects 2-D inputs, so the user's row becomes (1, n_items).
    target_row = filled.loc[active_user].values.reshape(1, -1)
    scores = cosine_similarity(target_row, filled.values)
    return pd.DataFrame(scores, index=[active_user], columns=filled.index)

## Mean-Centered Prediction Function

In [6]:
def pred(active_user,similarties,t_ratings,ratings,movie_pred_rating,n_neighbours=10,top_n=5):
    """Predict ratings for ``active_user`` with mean-centred, user-based CF.

    For a movie ``m`` the prediction is::

        mean(active_user) + sum_v(sim(v) * centred(v, m)) / sum_v(|sim(v)|)

    where ``v`` runs over the selected neighbours and ``centred(v, m)`` is the
    neighbour's rating minus that neighbour's own mean rating. A neighbour who
    did not rate ``m`` contributes 0 to the numerator but still counts in the
    denominator.

    Parameters
    ----------
    active_user : label
        Row label of the target user in ``t_ratings``.
    similarties : pandas.DataFrame
        Single-row frame of similarities whose columns are neighbour user
        labels. The first ``n_neighbours`` columns are used, so the caller
        should pass them ordered from most to least similar and without the
        active user's own column.
    t_ratings : pandas.DataFrame
        User x movie rating matrix with NaN for missing ratings.
    ratings : pandas.DataFrame
        Long-format ratings with ``userId`` and ``movieId`` columns; used to
        exclude movies the active user has already rated.
    movie_pred_rating : label or None
        Movie to predict. When ``None``, the best unrated movies are returned.
    n_neighbours : int, default 10
        Number of leading similarity columns to use as neighbours.
    top_n : int, default 5
        Number of recommendations returned when ``movie_pred_rating`` is None.

    Returns
    -------
    float or dict
        The predicted rating for ``movie_pred_rating``; or, when it is
        ``None``, a dict ``{movieId: predicted rating}`` of the ``top_n``
        highest predictions among movies the user has not rated, ordered from
        highest to lowest.

    Raises
    ------
    KeyError
        If ``active_user``, a neighbour label, or ``movie_pred_rating`` is not
        present in ``t_ratings``.
    """
    neighbours = similarties.iloc[:, :n_neighbours]
    weights = neighbours.iloc[0].to_numpy(dtype=float)

    # Only the neighbours' rows are needed, so mean-centre just those rows
    # instead of the whole matrix. Unrated cells become 0 after centring.
    neighbour_ratings = t_ratings.loc[neighbours.columns]
    centred = neighbour_ratings.sub(neighbour_ratings.mean(axis=1, skipna=True), axis=0).fillna(0)

    # Loop-invariant pieces of the formula.
    user_mean = t_ratings.loc[active_user].mean(skipna=True)
    weight_total = np.abs(weights).sum()

    if movie_pred_rating is not None:
        numerator = np.sum(centred.loc[:, movie_pred_rating].to_numpy() * weights)
        # With no usable neighbour weight, fall back to the user's own mean
        # instead of propagating NaN from 0/0.
        offset = numerator / weight_total if weight_total else 0.0
        return user_mean + offset

    # All movies at once: weight every neighbour row, then add the rows up.
    numerators = (centred.to_numpy() * weights[:, np.newaxis]).sum(axis=0)
    offsets = numerators / weight_total if weight_total else np.zeros_like(numerators)
    predictions = user_mean + offsets

    # Computed once (not once per movie): movies the active user already rated.
    already_rated = set(ratings.loc[ratings['userId'] == active_user, 'movieId'].tolist())
    candidates = [
        (movie, score)
        for movie, score in zip(centred.columns, predictions)
        if movie not in already_rated
    ]
    # Stable sort: equal predictions keep their column order.
    candidates.sort(key=lambda item: item[1], reverse=True)
    return dict(candidates[:top_n])

# Collaborative Filtering

In [7]:
def recommendation(active_user,t_ratings,movies,ratings,movie_pred_rating=None):
    """Recommend movies to, or predict one rating for, ``active_user``.

    Parameters
    ----------
    active_user : label
        Row label of the target user in ``t_ratings``.
    t_ratings : pandas.DataFrame
        User x movie rating matrix with NaN for missing ratings.
    movies : pandas.DataFrame
        Movie catalogue with ``movieId`` and ``title`` columns.
    ratings : pandas.DataFrame
        Long-format ratings, forwarded to ``pred``.
    movie_pred_rating : label or None, default None
        Movie whose rating should be predicted. When ``None``, a list of
        recommended titles is returned instead.

    Returns
    -------
    str or pandas.Series
        A sentence containing the prediction rounded to one decimal when
        ``movie_pred_rating`` is given; otherwise the ``title`` values of the
        movies returned by ``pred``, indexed by ``movieId``.
    """
    similarties = similarity(active_user, t_ratings)
    # Order the columns from most to least similar user, then remove the
    # active user's own column so they are not their own neighbour.
    ranked_users = similarties.iloc[0].sort_values(ascending=False).index
    similarties = similarties.loc[:, ranked_users].drop(active_user, axis=1)

    prediction = pred(active_user, similarties, t_ratings, ratings, movie_pred_rating)
    if movie_pred_rating is not None:
        return "Active User "+str(active_user)+" can almost rate this movie about "+str(round(prediction,1))

    # `prediction` is a dict keyed by movieId; look the titles up in that order.
    # An explicit list is used because dict views are not a documented indexer.
    titles_by_id = movies.set_index('movieId')['title']
    return titles_by_id.loc[list(prediction)]


In [10]:
# Interactive driver: ask for a user id and either predict that user's rating
# for one movie or list recommended movies.
active_user=int(input("Enter The User Id "))
x=input("If you want to enter the Movie ID enter YES otherwise NO: ")
if active_user not in t_ratings.index:
    # An unknown id would otherwise surface as a raw KeyError traceback.
    print("User Id", active_user, "is not present in the ratings data")
elif x.strip().upper()=="YES":
    # Accept "yes", " Yes " etc. as well as the exact "YES".
    movie_pred_rating=int(input("Enter The movie Id "))
    if movie_pred_rating not in t_ratings.columns:
        # A movie nobody rated has no column in t_ratings, so it cannot be predicted.
        print("Movie Id", movie_pred_rating, "is not present in the ratings data")
    elif pd.isna(t_ratings.at[active_user, movie_pred_rating]):
        # NaN means the active user has not rated this movie yet, so predict it.
        print(recommendation(active_user,t_ratings,movies,ratings,movie_pred_rating))
    else:
        print("Active User already rated this movie ",t_ratings.loc[active_user, movie_pred_rating])
else:
    print("Top 5 movies should be recommended to this active user: \n",recommendation(active_user,t_ratings,movies,ratings))


Enter The User Id 1
If you want to enter the Movie ID enter YES otherwise NO: NO
Top 5 movies should be recommended to this active user: 
 movieId
858       Godfather, The (1972)
1200              Aliens (1986)
541         Blade Runner (1982)
1387                Jaws (1975)
2762    Sixth Sense, The (1999)
Name: title, dtype: object
