In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import sklearn.metrics.pairwise as pw

In [2]:
# Load the ratings data.
# NOTE: baseDir is a machine-specific absolute path; point it at the local
# copy of the ml-latest-small folder before running.
baseDir = 'D:\\Recommdar_system\\Assignment1\\ml-latest-small'
# os.path.join supplies the platform's separator instead of a hard-coded "\\".
ratings = pd.read_csv(os.path.join(baseDir, "ratings.csv"))
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Drop the 'timestamp' column.
# Re-assigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: an in-place drop raises
# KeyError on a second run because the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Basic size check: total number of rating rows and number of distinct users.
n_rows = ratings.shape[0]
n_users = ratings['userId'].unique().size
print("Number of rows: ", n_rows)
print("Number of users: ", n_users)

Number of rows:  100836
Number of users:  610


In [5]:
# Load the movie data and discard the 'genres' column in one chained step.
movies_df = pd.read_csv(baseDir+'\\movies.csv').drop(columns='genres')
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


### Item-based collaborative filtering approach using cosine similarity

Merging the data

In [6]:
# Attach each rating row to its movie title by joining on movieId
# (default inner join: only movieIds present in both frames are kept).
combined_df = pd.merge(movies_df, ratings, on="movieId")
combined_df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [7]:
# Sanity check: (number of rows, number of columns) of the merged frame.
combined_df.shape

(100836, 4)

Creating a user information table in which each row holds the ratings that one user has given to all the movies.

In [8]:
# User x movie rating matrix: one row per userId, one column per movieId.
# pivot_table aggregates with the mean by default; (user, movie) pairs with
# no rating come out as NaN and are replaced with 0.
user_data_table = (
    combined_df
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
user_data_table.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculating Cosine Similarity

In [9]:
# Item-item cosine similarity. Transposing user_data_table makes each row one
# movie's vector of user ratings, so entry [a, b] of the result compares the
# a-th and b-th movie columns of user_data_table.
cosineSim = pw.cosine_similarity(user_data_table.T,user_data_table.T)
cosineSim

array([[1.        , 0.41056206, 0.2969169 , ..., 0.        , 0.        ,
        0.        ],
       [0.41056206, 1.        , 0.28243799, ..., 0.        , 0.        ,
        0.        ],
       [0.2969169 , 0.28243799, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [14]:
# Label the similarity matrix with movieIds on both axes so that a pair of
# movies can be looked up by id via .loc.
movie_ids = user_data_table.columns
cosineSim = pd.DataFrame(data=cosineSim, index=movie_ids, columns=movie_ids)

Lists of rated and unrated movies for each user

In [15]:
# Build, for every user, the movies they rated and the movies they did not:
#   userHistory[userId] = [ratedMovieIds, unratedMovieIds]
# A stored value of 0 means "not rated" (missing ratings were filled with 0).
# One vectorised comparison replaces a scalar .loc lookup per (user, movie)
# cell; the resulting lists and their ordering are the same.
userHistory = {}
all_movie_ids = user_data_table.columns
not_rated_mask = (user_data_table == 0).to_numpy()
for ind, row_mask in zip(user_data_table.index, not_rated_mask):
    userRated = all_movie_ids[~row_mask].tolist()
    userNotRated = all_movie_ids[row_mask].tolist()
    userHistory[ind] = [userRated, userNotRated]

## Calculation function

In [18]:
# Predicted scores per user: R[userId] = {movieId: predicted rating}
R = {}

def predictionScore(userId):
    """Predict a rating for every movie `userId` has not rated; store it in R.

    For each unrated movie p the score is the similarity-weighted average of
    the user's own ratings:

        score(p) = sum_i sim(i, p) * rating(userId, i) / sum_i sim(i, p)

    where i runs over the movies the user has rated. Movies whose
    similarities sum to zero get no prediction.

    Reads the module-level `userHistory`, `cosineSim` (looked up by movieId on
    both axes via .loc) and `user_data_table`. Returns None; the result is
    written to `R[userId]` as a {movieId: score} dict.
    """
    rated, unrated = userHistory[userId]

    # Nothing to predict from, or nothing to predict for.
    if len(rated) == 0 or len(unrated) == 0:
        R[userId] = {}
        return

    # Extract the needed similarities and ratings once, instead of doing a
    # scalar .loc lookup for every (rated, unrated) pair.
    sims = cosineSim.loc[rated, unrated].to_numpy()
    ratings_given = user_data_table.loc[userId, rated].to_numpy()

    # Accumulate one rated movie at a time so each sum is built in the same
    # order as a plain nested loop would build it.
    neu = np.zeros(len(unrated))
    den = np.zeros(len(unrated))
    for sim_row, rating in zip(sims, ratings_given):
        neu = neu + sim_row * rating
        den = den + sim_row

    temp_r = {}
    for p, numerator, denominator in zip(unrated, neu, den):
        if denominator != 0:
            temp_r[p] = numerator / denominator

    R[userId] = temp_r

In [19]:
# Score all unrated movies for user 100; the result is stored in R[100].
predictionScore(100)

In [20]:
# Flatten R into a single list of movie ids: for each user in R (insertion
# order), that user's predicted movies from highest to lowest score.
RecommendedMovieIds = [
    movie_id
    for scores in R.values()
    for movie_id, _score in sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
]

In [23]:
# Show the titles for the first 20 entries of RecommendedMovieIds, in ranking
# order. isin() alone returns rows in movies_df order, which discards the
# ranking, so the matching rows are re-ordered by their position in the list.
top_ids = RecommendedMovieIds[:20]
rank_of = {}
for position, movie_id in enumerate(top_ids):
    rank_of.setdefault(movie_id, position)  # keep the first (best) position
top_movies = movies_df[movies_df['movieId'].isin(top_ids)]
ranking_order = top_movies['movieId'].map(rank_of).to_numpy().argsort(kind='stable')
recommended_df = top_movies.iloc[ranking_order]
recommended_df

Unnamed: 0,movieId,title
624,791,"Last Klezmer: Leopold Kozlowski, His Life and ..."
847,1116,"Single Girl, A (Fille seule, La) (1995)"
864,1137,Hustler White (1996)
866,1144,"Line King: The Al Hirschfeld Story, The (1996)"
1659,2226,"Ring, The (1927)"
1875,2493,"Harmonists, The (1997)"
2838,3795,"Five Senses, The (1999)"
4193,6049,Ethan Frome (1993)
5453,26095,"Carabineers, The (Carabiniers, Les) (1963)"
6945,65350,"General Died at Dawn, The (1936)"
