In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
df = pd.read_json('Video_Games_5.json', lines=True) 
len(df)

497577

In [3]:
#data overview
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [4]:
len(df) , df.isna().sum()

(497577,
 overall                0
 verified               0
 reviewTime             0
 reviewerID             0
 asin                   0
 reviewerName          76
 reviewText           158
 summary              109
 unixReviewTime         0
 vote              389784
 style             208340
 image             493943
 dtype: int64)

In [5]:
dg = df[['reviewerID', 'asin' , 'overall' , 'reviewText']]
#summary are highly correlated with the overall field
#other columns have to much missed values

In [6]:
dg.isna().sum()

reviewerID      0
asin            0
overall         0
reviewText    158
dtype: int64

In [7]:
#dg[dg.reviewText.isna()]
#lets fill nan value

In [8]:
dg['reviewText'].value_counts()[:5]

reviewText
good         2302
great        1592
Good         1532
Great        1318
Excellent    1238
Name: count, dtype: int64

In [9]:
fill_by_rating = lambda x: 'one' if x==1 else ('two' if x==2 else ('three' if x==3 else ('four' if x==4 else 'five')))
fill_by_rating(1)

'one'

In [10]:
dg['reviewText'] = dg.apply(lambda x: fill_by_rating(x.overall) if pd.isna(x.reviewText) else x.reviewText , axis=1)
dg.reviewText.isna().sum() #no more nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dg['reviewText'] = dg.apply(lambda x: fill_by_rating(x.overall) if pd.isna(x.reviewText) else x.reviewText , axis=1)


0

In [11]:
plots = dg.reviewText.to_numpy()
ids = dg.asin.to_numpy()

In [12]:
#extract features from text description using simple bag of words as an example
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

item_text_modality = TextModality(corpus=plots, ids=ids, 
                                 tokenizer=BaseTokenizer(sep=' ', stop_words='english'),
                                 max_vocab=5000, max_doc_freq=0.5)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import cornac
from cornac.models import VAECF
from cornac.eval_methods import RatioSplit
from cornac.hyperopt import Discrete
from cornac.hyperopt import RandomSearch

In [14]:
data = dg.to_numpy()

In [15]:
def calculate_rating_threshold(df):
    """Returns the first percentile of attribute value rankings"""
    # print(df["rating"].nunique())
    if df["overall"].nunique() == 1:
        rating_threshold = df["overall"][0] / 2
    else:
        rating_threshold = df["overall"].quantile(
            q=0.01, interpolation="linear")  # use it in a training
    #print("A rating threshold is " + str(round(rating_threshold, 3)) + '\n')
    return rating_threshold

In [16]:
#make our dataset with train test val split
rating_threshold = calculate_rating_threshold(dg)
ratio_split = RatioSplit(data, test_size=0.2, val_size=0.1,  exclude_unknowns=True,
                         #item_text=item_text_modality,
                         verbose=True, seed=123, rating_threshold=rating_threshold,
                         )

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 55164
Number of items = 17394
Number of ratings = 336461
Max rating = 5.0
Min rating = 1.0
Global mean = 4.2




---
Test data:
Number of users = 55164
Number of items = 17394
Number of ratings = 98327
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 55164
Number of items = 17394
Number of ratings = 49412
---
Total users = 55164
Total items = 17394


In [17]:
# Instantiate evaluation measures
rec_5 = cornac.metrics.Recall(k=5)
ndcg_5 = cornac.metrics.NDCG(k=5)
auc = cornac.metrics.AUC()
n_trails =2
#initiate models
vaecf = cornac.models.VAECF(
    k=5,
    autoencoder_structure=[20],
    act_fn="tanh",
    likelihood="mult",
    n_epochs=20,
    batch_size=100,
    learning_rate=0.001,
    beta=1.0,
    seed=123,
    use_gpu=True,
    verbose=True,
)

#example params search 
rs_vaecf = RandomSearch(
            model=vaecf,
            space=[
                #Discrete("n_epochs", np.arange(9, 11, 2)),
                Discrete("beta", np.arange(0.9, 1.1, 0.1)),
                #Discrete("k", np.arange(10, 14, 2)),
                #Discrete("act_fn", ['sigmoid', 'tanh','elu', 'relu', 'relu6']),
                #Discrete("batch_size", np.arange(64, 256, 64)),
                #Discrete("learning_rate", np.arange(0.001, 0.002, 0.001))
            ],
            metric=ndcg_5,
            eval_method=ratio_split,
            n_trails=n_trails)

In [18]:
def find_best_params(ratio_split , n_trails =2 ):
    # Put everything together into an experiment and run it
    ndcg_5 = cornac.metrics.NDCG(k=5)

    vaecf = cornac.models.VAECF(
        k=5,
        autoencoder_structure=[20],
        act_fn="tanh",
        likelihood="mult",
        n_epochs=20,
        batch_size=100,
        learning_rate=0.001,
        beta=1.0,
        seed=123,
        use_gpu=True,
        verbose=True,
    )
    rs_vaecf = RandomSearch(
                model=vaecf,
                space=[
                    #Discrete("n_epochs", np.arange(9, 11, 2)),
                    Discrete("beta", np.arange(0.9, 1.1, 0.1)),
                    #Discrete("k", np.arange(10, 14, 2)),
                    #Discrete("act_fn", ['sigmoid', 'tanh','elu', 'relu', 'relu6']),
                    #Discrete("batch_size", np.arange(64, 256, 64)),
                    #Discrete("learning_rate", np.arange(0.001, 0.002, 0.001))
                ],
                metric=ndcg_5,
                eval_method=ratio_split,
                n_trails=n_trails)

    cornac.Experiment(
        eval_method=ratio_split,
        models=[ rs_vaecf ],
        metrics=[rec_5, ndcg_5, auc],
        user_based=True,
    ).run()
    return rs_vaecf.best_params
best_params = find_best_params(ratio_split)


[RandomSearch_VAECF] Training started!
Evaluating: {'beta': 1.1}


100%|███████████████████████████████| 20/20 [04:17<00:00, 12.89s/it, loss=0.527]


Evaluating: {'beta': 1.0}


100%|███████████████████████████████| 20/20 [04:15<00:00, 12.75s/it, loss=0.525]


Best parameter settings: {'beta': 1.1}
NDCG@5 = 0.0205

[RandomSearch_VAECF] Evaluation started!


Ranking: 100%|███████████████████████████| 43480/43480 [01:39<00:00, 436.91it/s]
Ranking: 100%|███████████████████████████| 30184/30184 [01:07<00:00, 448.29it/s]


VALIDATION:
...
                   |    AUC | NDCG@5 | Recall@5 | Time (s)
------------------ + ------ + ------ + -------- + --------
RandomSearch_VAECF | 0.8626 | 0.0205 |   0.0282 |  67.3423

TEST:
...
                   |    AUC | NDCG@5 | Recall@5 | Train (s) | Test (s)
------------------ + ------ + ------ + -------- + --------- + --------
RandomSearch_VAECF | 0.8639 | 0.0238 |   0.0296 |  634.2831 |  99.5369






In [19]:
best_params

{'beta': 1.1}

In [20]:
# train 2 models , we can assign more epochs after finding the best parameters
rec_5 = cornac.metrics.Recall(k=5)
ndcg_5 = cornac.metrics.NDCG(k=5)
auc = cornac.metrics.AUC()
    
svd = cornac.models.SVD(
    k=5, max_iter=30, learning_rate=0.01, lambda_reg=0.02, verbose=True
)

vaecf = cornac.models.VAECF(
        k=5,
        autoencoder_structure=[20],
        act_fn="tanh",
        likelihood="mult",
        n_epochs=20,
        batch_size=100,
        learning_rate=0.001,
        beta=best_params["beta"], #лучшие параметры
        seed=123,
        use_gpu=True,
        verbose=True,
        )
# start models training
cornac.Experiment(
    eval_method=ratio_split,
    models=[ svd , vaecf],
    metrics=[rec_5, ndcg_5, auc],
    user_based=True,
).run()


[SVD] Training started!


100%|██████████████████████████| 30/30 [00:00<00:00, 245.50it/s, loss=123359.63]


Optimization finished!

[SVD] Evaluation started!


Ranking: 100%|███████████████████████████| 43480/43480 [01:24<00:00, 514.22it/s]
Ranking: 100%|███████████████████████████| 30184/30184 [00:57<00:00, 527.55it/s]



[VAECF] Training started!


100%|███████████████████████████████| 20/20 [03:57<00:00, 11.90s/it, loss=0.527]



[VAECF] Evaluation started!


Ranking: 100%|███████████████████████████| 43480/43480 [01:39<00:00, 435.50it/s]
Ranking: 100%|███████████████████████████| 30184/30184 [01:08<00:00, 442.98it/s]


VALIDATION:
...
      |    AUC | NDCG@5 | Recall@5 | Time (s)
----- + ------ + ------ + -------- + --------
SVD   | 0.5549 | 0.0002 |   0.0003 |  57.2242
VAECF | 0.8626 | 0.0205 |   0.0282 |  68.1492

TEST:
...
      |    AUC | NDCG@5 | Recall@5 | Train (s) | Test (s)
----- + ------ + ------ + -------- + --------- + --------
SVD   | 0.5535 | 0.0002 |   0.0002 |    0.1422 |  84.5726
VAECF | 0.8639 | 0.0238 |   0.0296 |  237.9762 |  99.8549






In [21]:
#VAECF | 0.8438 | 0.0165 |   0.0209 | 3528.3831 | 1718.8671 - results without text rewiw information

In [22]:
#sample inference
uid = 'A1HP7NVNPFMA4N'
k=5
remove_seen = True
train_set = ratio_split.train_set
vaecf.recommend(
        user_id=uid,
        k=k,
        remove_seen=remove_seen,
        train_set=ratio_split.train_set)

['B000XJNTNS', 'B002BRZ9G0', 'B0009VXBAQ', 'B000ZKA0J6', 'B000FO4KO8']

In [23]:
vaecf.save("trained_model")

VAECF model is saved to trained_model/VAECF/2024-01-01_19-11-29-881086.pkl


'trained_model/VAECF/2024-01-01_19-11-29-881086.pkl'

In [24]:
ratio_split = RatioSplit(data, test_size=0.2, val_size=0.1,  exclude_unknowns=True,
                         #item_text=item_text_modality,
                         verbose=True, seed=123, rating_threshold=rating_threshold, )
train_set = ratio_split.train_set
#save dataset if we want t use remove_seen 

with open('trained_model/VAECF/train_set.pkl', 'wb') as handle:
    pickle.dump(train_set, handle, protocol=pickle.HIGHEST_PROTOCOL)

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 55164
Number of items = 17394
Number of ratings = 336461
Max rating = 5.0
Min rating = 1.0
Global mean = 4.2
---
Test data:
Number of users = 55164
Number of items = 17394
Number of ratings = 98327
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 55164
Number of items = 17394
Number of ratings = 49412
---
Total users = 55164
Total items = 17394


