https://www.albgri.com/python-rs/
for the images of this Document

**GETTING THE DATAS**

In [1]:
import pandas as pd
import numpy as np

In [2]:
column_names = ['user_id','item_id','ratings','timestamp']

In [3]:
df = pd.read_csv('u.data',sep='\t',names=column_names)

In [4]:
df.head()

Unnamed: 0,user_id,item_id,ratings,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [5]:
movie_title = pd.read_csv('Movie_Id_Titles')

In [6]:
movie_title.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
df = pd.merge(df,movie_title,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,ratings,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [8]:
n_user = df['user_id'].nunique()
n_items = df['item_id'].nunique()

print('The Num of unique users: '+str(n_user))
print('The Num of unique items: '+str(n_items))

The Num of unique users: 944
The Num of unique items: 1682


**BREAKING OUR DATA**

In [9]:
from sklearn.model_selection import train_test_split
train_data,test_data = train_test_split(df,test_size=0.25)

# Memory Based Collaborative Filtering

In [10]:
# Creating user item matrix
train_data_matrix = np.zeros((n_user,n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
    
test_data_matrix = np.zeros((n_user, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [11]:
# Creating a user and item similarity matrix
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')

**Predictions**

In [12]:
# Kindly Refer the module
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff)
        / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [13]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [14]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [15]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 382.0507714945377
Item-based CF RMSE: 3.445468117099904


# Model based Collaborative Filtering

dividing the number of ratings present in the matrix by the product of users and movies in the matrix and subtracting that from 1 will give us the sparsity or the percentage of the ratings matrix that is empty.

In [16]:
sparsity = round(1.0-len(df)/float(n_user*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


K determine the Number of singular values and singular vectors to compute. 
Must satisfy 1 <= k <= kmax, where kmax=min(M, N) for solver='propack' and kmax=min(M, N) - 1 otherwise.

In [18]:
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7091708898267672


**Some Rough**

In [19]:
s_diag_matrix

array([[ 68.8597343 ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ],
       [  0.        ,  69.42310627,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,  71.42473533,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,  

In [20]:
s

array([ 68.8597343 ,  69.42310627,  71.42473533,  72.00133961,
        72.30236906,  72.6615808 ,  74.75378357,  76.76397816,
        79.33827424,  79.99255875,  81.33196322,  87.61921733,
        97.38181902,  99.6415658 , 115.23151332, 123.0300267 ,
       124.1186579 , 167.65355443, 186.73966609, 482.59598195])