In [1]:
import pandas as pd
import numpy as np

In [2]:
from pathlib import Path

In [3]:
path = Path("../data/ratings.json")
ratings = pd.read_json(path,lines=True)
ratings = ratings[["user_id","item_id","rating"]]
ratings

Unnamed: 0,user_id,item_id,rating
0,0,41335427,5
1,1,41335427,3
2,2,41335427,5
3,3,41335427,5
4,4,41335427,5
...,...,...,...
5152651,46691,56590230,5
5152652,318648,56590230,5
5152653,295248,56590230,4
5152654,258912,56590230,3


In [4]:
table1 = ratings.groupby("user_id").agg(rating_count = ('rating','count')).reset_index()
table1 = table1[table1["rating_count"]>300]
table2 = ratings.groupby("item_id").agg(rating_count = ('rating','count')).reset_index()
table2 = table2[table2["rating_count"]>10]
table3 = pd.merge(ratings,table1,on="user_id",how = 'inner').drop(['rating_count'], axis=1)
table3 = pd.merge(table3,table2,on="item_id",how = 'inner').drop(['rating_count'], axis=1)
table3

Unnamed: 0,user_id,item_id,rating
0,22,41335427,5
1,28,41335427,2
2,69,41335427,4
3,78,41335427,5
4,92,41335427,5
...,...,...,...
320988,248541,48280406,5
320989,154634,54774790,4
320990,138614,54774790,3
320991,211535,54774790,3


In [5]:
! pip install surprise 




[notice] A new release of pip available: 22.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [6]:
import surprise

In [7]:
reader = surprise.Reader(rating_scale=(1,5))
data = surprise.Dataset.load_from_df(table3, reader)
type(data)

surprise.dataset.DatasetAutoFolds

In [8]:
from surprise.model_selection.split import train_test_split
train, test = train_test_split(data,test_size=0.25)
type(train)

surprise.trainset.Trainset

In [9]:
train.ur

defaultdict(list,
            {0: [(0, 4.0),
              (42, 3.0),
              (956, 3.0),
              (1116, 2.0),
              (1846, 5.0),
              (3746, 5.0),
              (4063, 3.0),
              (2258, 4.0),
              (6429, 5.0),
              (1862, 3.0),
              (5496, 4.0),
              (2550, 2.0),
              (6858, 5.0),
              (5115, 3.0),
              (3516, 5.0),
              (6940, 3.0),
              (7003, 3.0),
              (4723, 4.0),
              (4733, 1.0),
              (139, 5.0),
              (7635, 4.0),
              (1547, 3.0),
              (472, 3.0),
              (1143, 4.0),
              (948, 5.0),
              (2429, 5.0),
              (7844, 5.0),
              (1787, 3.0),
              (2636, 4.0),
              (744, 3.0),
              (409, 3.0),
              (1817, 3.0),
              (4255, 5.0),
              (2323, 4.0),
              (4784, 3.0),
              (6815, 2.0),
              (801

In [10]:
from surprise.prediction_algorithms.knns import KNNBasic
sim_option ={
    "name": "cosine",
    "user_based": True
}

knnBasic = KNNBasic(sim_option=sim_option)
model = knnBasic.fit(train)
prediction = model.test(test)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [11]:
result = pd.DataFrame(prediction, columns=['user_id','item_id','base_event','predict_event','detail'])
result.head()

Unnamed: 0,user_id,item_id,base_event,predict_event,detail
0,1321,23858953,3.0,3.192403,"{'actual_k': 22, 'was_impossible': False}"
1,6652,14345371,3.0,3.48755,"{'actual_k': 40, 'was_impossible': False}"
2,17987,19198861,4.0,4.137922,"{'actual_k': 17, 'was_impossible': False}"
3,27133,2793516,5.0,4.429392,"{'actual_k': 40, 'was_impossible': False}"
4,211535,40720183,5.0,4.367714,"{'actual_k': 18, 'was_impossible': False}"


In [12]:
ratings[ratings['item_id']==46663]

Unnamed: 0,user_id,item_id,rating
466803,53690,46663,5
466804,104932,46663,4
466805,20769,46663,1
466806,43826,46663,5
466807,53001,46663,5
...,...,...,...
3898473,21739,46663,5
3898474,71799,46663,4
3898475,237373,46663,3
3898476,65389,46663,5


In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
rmse = np.sqrt(mean_squared_error(result["base_event"], result["predict_event"]))
mae = mean_absolute_error(result['base_event'], result['predict_event'])
r2 = r2_score(result['base_event'], result['predict_event'])
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R^2: {r2}')

RMSE: 0.9142672905522788
MAE: 0.7193622224471422
R^2: 0.1498634278612856


In [14]:
result.shape

(80249, 5)

In [15]:
result.to_csv("../data/knn_prediction.csv",index=False)

In [17]:
prediction = pd.read_csv('../data/knn_prediction.csv')

In [18]:
prediction

Unnamed: 0,user_id,item_id,base_event,predict_event,detail
0,1321,23858953,3.0,3.192403,"{'actual_k': 22, 'was_impossible': False}"
1,6652,14345371,3.0,3.487550,"{'actual_k': 40, 'was_impossible': False}"
2,17987,19198861,4.0,4.137922,"{'actual_k': 17, 'was_impossible': False}"
3,27133,2793516,5.0,4.429392,"{'actual_k': 40, 'was_impossible': False}"
4,211535,40720183,5.0,4.367714,"{'actual_k': 18, 'was_impossible': False}"
...,...,...,...,...,...
80244,14596,14794535,4.0,3.535344,"{'actual_k': 32, 'was_impossible': False}"
80245,191994,48027180,3.0,3.473504,"{'actual_k': 40, 'was_impossible': False}"
80246,20865,21998925,3.0,4.168763,"{'actual_k': 40, 'was_impossible': False}"
80247,47725,503752,5.0,4.023727,"{'actual_k': 13, 'was_impossible': False}"


In [19]:
def getRecommendation(user):
    if user not in prediction['user_id']:
        return []
    return prediction[prediction['user_id']==user]['item_id'].to_list()

In [24]:
getRecommendation(6652)

[14345371,
 48027180,
 3173189,
 10706553,
 2237401,
 9871439,
 19102940,
 45105689,
 524491,
 21525054,
 15535056,
 6976108,
 16536239,
 15464655,
 26732116,
 17389742,
 2674739,
 14530108,
 14479209,
 3207062,
 43212054,
 15093617,
 16160067,
 11552215,
 42819798,
 11409817,
 2267189,
 4790821,
 17225055,
 1429939,
 26680940,
 2856172,
 856203,
 868252,
 976403,
 47255158,
 2904401,
 1540236,
 13451574,
 41824344,
 750423,
 6171458,
 18707527,
 3271379,
 14093739,
 26112550,
 24170172,
 21521601,
 24752715,
 26634437,
 2134456,
 25379189,
 14300276,
 2328657,
 1993854,
 10644152,
 18330294,
 52973207,
 1486401,
 6803715,
 2833088,
 14449480,
 15668403,
 21495195,
 43959504,
 2218243,
 13440919,
 2502879,
 13395554,
 2216625,
 6434098,
 16319487,
 2682098,
 6683959,
 919911,
 48118999,
 9737203,
 2677305,
 23986788,
 25694131,
 13492114,
 28553427,
 6526004,
 1139031,
 21933087,
 17375239,
 2841515,
 47327681,
 2129359,
 6488966,
 26659491,
 25386818,
 10870318,
 1993810,
 6553801,
 1