In [202]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [203]:
col_names= ["user_id", "item_id", "rating", "timestamp"]
ratingsData = pd.read_table("C:\\Users\\naman\\Documents\\Harpreet\\CUNY\\Data_602\\Assignment-3\\Mitxpro\\u.data", names=col_names)
ratingsData.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## calculate sparcity of the data

In [204]:
def calculateSparcity (data) :
    Number_Ratings = float(len(data))
    Number_Movies = float(len(np.unique(data["item_id"])))
    Number_Users = float(len(np.unique(data["user_id"])))
    sparcity = (Number_Ratings/(Number_Movies * Number_Users)) * 100
    return sparcity

sparcityBefore = calculateSparcity(ratingsData)
print ("Sparcity Before " + str(sparcityBefore))


Sparcity Before 6.30466936422


## Subsetting the data, by not including the users that have less than 50 ratings

In [205]:
users = ratingsData["user_id"]
rating_count = {}
for user in users :
    if user in rating_count :
        rating_count[user] += 1
    else :
        rating_count[user] = 1
RATING_CUTOFF = 50
remove_user = []
for user in rating_count :
    if rating_count[user] < RATING_CUTOFF:
        remove_user.append(user)
ratingsData = ratingsData.loc[~ ratingsData["user_id"].isin(remove_user) ]

sparcityAfter = calculateSparcity (ratingsData)
print ("Sparcity After " + str (sparcityAfter))


Sparcity After 9.26584192843


In [206]:
pivoted_ratingsData = ratingsData.pivot(index= "user_id", columns= "item_id", values = "rating").fillna(0)
pivoted_ratingsData.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [207]:
ratingDataMatrix = pivoted_ratingsData.as_matrix()
meanMatrix = np.mean(ratingDataMatrix, axis=1)
normalizedMatrix = ratingDataMatrix - meanMatrix.reshape(-1,1)
normalizedMatrix


array([[ 4.41582391,  2.41582391,  3.41582391, ..., -0.58417609,
        -0.58417609, -0.58417609],
       [ 3.86317668, -0.13682332, -0.13682332, ..., -0.13682332,
        -0.13682332, -0.13682332],
       [-0.08982748, -0.08982748, -0.08982748, ..., -0.08982748,
        -0.08982748, -0.08982748],
       ..., 
       [-0.22010708, -0.22010708, -0.22010708, ..., -0.22010708,
        -0.22010708, -0.22010708],
       [-0.20047591, -0.20047591, -0.20047591, ..., -0.20047591,
        -0.20047591, -0.20047591],
       [-0.34086853,  4.65913147, -0.34086853, ..., -0.34086853,
        -0.34086853, -0.34086853]])

## Calculating SVD

In [208]:
# with k value 100
u100, sigma100, vT100 = svds(normalizedMatrix, k = 100)
sigma100 = np.diag(sigma100)

# with k value 50
u50, sigma50, vT50 = svds(normalizedMatrix, k = 50)
sigma50 = np.diag(sigma50)

# with k value 10
u10, sigma10, vT10 = svds(normalizedMatrix, k = 10)
sigma10 = np.diag(sigma10)

## Rating Predictions  for All Users

In [209]:
# with k value 100
fullData100 = np.dot(np.dot(u100, sigma100), vT100) + meanMatrix.reshape(-1, 1)


# with k value 50
fullData50 = np.dot(np.dot(u50, sigma50), vT50) + meanMatrix.reshape(-1, 1)


# with k value 10
fullData10 = np.dot(np.dot(u10, sigma10), vT10) + meanMatrix.reshape(-1, 1)



## Rating Predictions for specific user

In [210]:
## with K value 100
sigma100 = np.diag(sigma100)
userPrefProduct = np.dot (u100[5, : ] , sigma100) 
userRecommendation = np.dot(userPrefProduct, vT100) 
print ("Recommendation for 6th User and 6th movie " + str(userRecommendation[5,5] + meanMatrix[5]))


## with K value 50
sigma50 = np.diag(sigma50)
userPrefProduct50 = np.dot (u50[5, : ] , sigma50) 
userRecommendation50 = np.dot(userPrefProduct50, vT50) 
print ("Recommendation for 6th User and 6th movie " + str(userRecommendation50[5,5] + meanMatrix[5]))


## with K value 10
sigma10 = np.diag(sigma10)
userPrefProduct10 = np.dot (u10[5, : ] , sigma10) 
userRecommendation10 = np.dot(userPrefProduct10, vT10) 
print ("Recommendation for 6th User and 6th movie " + str(userRecommendation10[5,5] + meanMatrix[5]))


Recommendation for 6th User and 6th movie 0.126142801544
Recommendation for 6th User and 6th movie 3.98828970677
Recommendation for 6th User and 6th movie 0.565261902394


In [211]:
userRecommendation50[5,5]

3.0376650785678594

## Conclusion
In this project, we were able to apply matrix factorization using SVD. We were able to predict the rating for users based on the feature reduction done by SVD. We tried with multiple values of k (10, 50, 100). As a future work we would like to test each of these model on the test data to find the most optimum value of K that provides best prediction.