In [1]:
import pandas as pd
import numpy as np
import ScoreCalculate as sc
import DataReaderplus as dr
import math
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
scoreData = dr.DataReader().get_score_data() # read data from the Score table
score_df = scoreData.pivot(index='user_id', columns = 'technology_id', values = 'total_score') # reshape score table 
score_df = score_df.fillna(0)

score_spare = sparse.csr_matrix(score_df)
similarities = cosine_similarity(score_spare)
#print('pairwise dense output:\n {}\n'.format(similarities))
similarities_df = pd.DataFrame(similarities, columns=score_df.index, index = score_df.index)
#print similarities_df.head()


In [3]:
"""
Given a user and an item to rate, predict the ratings of the user on the item
"""
def rating_predict(user, item):
    # find all users who have rated the item, return a user list
    userls = rated_users(item)
    # find the similarity betwen the user and each user in the user list
    sim_urs = similarities_df.loc[user, userls]
    # find the score of similar users on the given item
    sim_scores = score_df.loc[userls, item]
    # predict rating of the user based on the weighted rating of similar users
    score_pred = np.sum(sim_urs*sim_scores)/np.sum(sim_scores)
    return score_pred

In [4]:
"""
given an item, return all users in a list who have rated the item
"""
def rated_users(item):
    score = score_df[item] # score vector of the item 
    userls = score[score!=0].index.values # users that rated the item
    return userls

In [5]:
score_pred_df = score_df.copy()
for user in score_df.index:
    for item in score_df.columns:
        if score_df.loc[user, item]==0.0:
            score_pred_df.loc[user, item] = rating_predict(user, item)
print score_pred_df.head


<bound method DataFrame.head of technology_id                              2         3         4         5    \
user_id                                                                        
5260234c-9878-4d49-9d26-46b2d4718e13  0.000000  0.006891  0.011276  0.000000   
5260f967-7d74-4f6e-9deb-4b7fd4718e13  0.039572  0.120156  0.066331  0.014834   
526132a9-4280-4cc4-a2d8-42acd4718e13  0.001441  0.007284  0.017069  0.000000   
526640ab-ebd8-4593-9749-4002d4718e13  0.000000  0.035136  0.018956  0.018956   
528de178-be0c-4af4-9324-4fc8d4718e13  0.000000  0.014618  0.023920  0.000000   
528f575a-c6a8-4fdb-9bd7-4662d4718e13  0.022690  0.009991  0.007502  0.011731   
529492c1-d5c8-48d0-b411-4410d4718e13  0.052907  1.000000  1.000000  0.118647   
52949cdf-6860-496a-83e1-4e3bd4718e13  0.078862  1.000000  0.054310  1.000000   
5294de89-09e8-482f-899e-4cfcd4718e13  0.046447  0.077978  0.035770  0.026203   
5296fe22-a518-48c8-96bd-42e0d4718e13  0.005921  0.026837  0.006273  0.000000   
5298c2bc

In [6]:
for user in score_df.index:
    print user
    print score_df.loc[user].nlargest(5)

pred_df = pd.DataFrame({index:score_df.loc[index].nlargest(5).index.tolist() for index in score_df.index}).T
print pred_df

5260234c-9878-4d49-9d26-46b2d4718e13
technology_id
11    1.0
38    1.0
2     0.0
3     0.0
4     0.0
Name: 5260234c-9878-4d49-9d26-46b2d4718e13, dtype: float64
5260f967-7d74-4f6e-9deb-4b7fd4718e13
technology_id
274    3.0
302    3.0
20     2.0
189    2.0
203    2.0
Name: 5260f967-7d74-4f6e-9deb-4b7fd4718e13, dtype: float64
526132a9-4280-4cc4-a2d8-42acd4718e13
technology_id
826    3.0
200    2.0
218    2.0
201    1.0
207    1.0
Name: 526132a9-4280-4cc4-a2d8-42acd4718e13, dtype: float64
526640ab-ebd8-4593-9749-4002d4718e13
technology_id
58    2.0
2     0.0
3     0.0
4     0.0
5     0.0
Name: 526640ab-ebd8-4593-9749-4002d4718e13, dtype: float64
528de178-be0c-4af4-9324-4fc8d4718e13
technology_id
114    1.0
2      0.0
3      0.0
4      0.0
5      0.0
Name: 528de178-be0c-4af4-9324-4fc8d4718e13, dtype: float64
528f575a-c6a8-4fdb-9bd7-4662d4718e13
technology_id
17    1.0
28    1.0
31    1.0
68    1.0
73    1.0
Name: 528f575a-c6a8-4fdb-9bd7-4662d4718e13, dtype: float64
529492c1-d5c8-48d0-b411-4