# Construct datasets

In [1]:
import csv
import json

import pandas as pd
from surprise import (
    Dataset,
    Reader,
    KNNBasic,
)
from surprise.model_selection import (
    train_test_split,
)
from surprise.accuracy import rmse

In [2]:
def load_reviews(path: str) -> pd.DataFrame:
    """
    Read user-item ratings from a reviews CSV file.

    The first row is treated as a header and skipped. For every other
    non-blank row the first column is used as the item id, the second-to-last
    column as the user id, and the fifth column as the recommendation flag.

    :param path: path of the CSV file (read as UTF-8)
    :return: DataFrame with columns ``itemID``, ``userID`` (raw CSV strings)
        and ``rating`` (2 when the flag equals ``'true'``, otherwise 1)
    """
    item_ids, user_ids, ratings = [], [], []
    # newline='' lets the csv module do its own line-ending handling
    with open(path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # skip the header; tolerates an empty file
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; indexing it would fail
                continue
            item_ids.append(row[0])
            user_ids.append(row[-2])
            ratings.append(2 if row[4] == 'true' else 1)
    return pd.DataFrame(
        {
            'itemID': item_ids,
            'userID': user_ids,
            'rating': ratings,
        }
    )


def load_games(path: str, positive_ratio_threshold: float = 70) -> list:
    """
    Read game information, keeping only sufficiently well-reviewed games.

    The first row is treated as a header and skipped. For every other
    non-blank row, the eighth column is parsed as the positive ratio and the
    row is kept when that ratio is at least ``positive_ratio_threshold``.

    :param path: path of the CSV file (read as UTF-8)
    :param positive_ratio_threshold: minimum positive ratio (inclusive)
    :return: list of dicts with keys ``app_id``, ``title``, ``rating`` and
        ``positive_ratio``; all values are the raw CSV strings
    :raises ValueError: if a positive-ratio cell cannot be parsed as a float
    """
    ret = []
    # newline='' lets the csv module do its own line-ending handling
    with open(path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # skip the header; tolerates an empty file
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; indexing it would fail
                continue
            if float(row[7]) >= positive_ratio_threshold:
                ret.append(
                    {
                        'app_id': row[0],
                        'title': row[1],
                        'rating': row[6],
                        'positive_ratio': row[7],
                    }
                )
    return ret

In [3]:
# Fixed seed so the train/test split (and the metrics derived from it) is
# the same on every Restart & Run All
RANDOM_SEED = 42

reviews_df = load_reviews('data/recommendations.csv')
games = load_games('data/games.csv', positive_ratio_threshold=75)
# Keep only reviews whose game passed the positive-ratio filter
reviews_df = reviews_df[
    reviews_df['itemID'].isin({game['app_id'] for game in games})
]
# Ratings produced by load_reviews are 1 or 2, hence the (1, 2) scale
reader = Reader(rating_scale=(1, 2))
reviews_dataset = Dataset.load_from_df(
    reviews_df[['userID', 'itemID', 'rating']],
    reader
)
train, test = train_test_split(
    reviews_dataset,
    test_size=.2,
    random_state=RANDOM_SEED,
)

In [4]:
# Sanity check: number of games that passed the positive-ratio filter
games_len_message = '[games] len: {}'.format(len(games))
print(games_len_message)

[games] len: 32567


# Build and train the model

In [5]:
# Neighbourhood size passed as `k` to KNNBasic
N_NEIGHBORS = 5

# Options for the similarity measure: cosine similarity, computed between
# items rather than users (user_based=False), i.e. an item-based model
sim_options = dict(name="cosine", user_based=False)

model = KNNBasic(sim_options=sim_options, k=N_NEIGHBORS)
model.fit(train)

# Predict on the held-out split and compute its RMSE
preditions = model.test(test)
model_rmse = rmse(preditions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.3264


In [6]:
# Calculate the top N similar items for each item.
# get_neighbors() works on inner ids, so both the key and the neighbours are
# converted back to raw item ids before being stored.
top_n_neighbors = {
    model.trainset.to_raw_iid(inner_id): [
        model.trainset.to_raw_iid(neighbor)
        for neighbor in model.get_neighbors(inner_id, k=N_NEIGHBORS)
    ]
    for inner_id in model.trainset.all_items()
}

# save top_n_neighbors
with open('data/top_n_neighbors.json', 'w', encoding='utf-8') as f:
    json.dump(top_n_neighbors, f, ensure_ascii=False, indent=4)

In [7]:

# Ordinal weight for each textual review label, worst to best.
# NOTE(review): assumes `rating` holds Steam-style review summary labels;
# confirm the label set against data/games.csv. Unknown labels score 0.
_RATING_WEIGHTS = {
    'Overwhelmingly Negative': 1,
    'Very Negative': 2,
    'Negative': 3,
    'Mostly Negative': 4,
    'Mixed': 5,
    'Mostly Positive': 6,
    'Positive': 7,
    'Very Positive': 8,
    'Overwhelmingly Positive': 9,
}


def _cold_start_score(game: dict) -> float:
    """Popularity score for a game: positive ratio times rating weight."""
    rating = game['rating']
    if isinstance(rating, (int, float)):
        weight = rating
    else:
        weight = _RATING_WEIGHTS.get(rating, 0)
    # positive_ratio is stored as a string by load_games; float() accepts both
    return float(game['positive_ratio']) * weight


def recommend(user_id: str, top_n: int = 10):
    """
    Recommend games for a user.

    Reads the notebook-level ``games``, ``reviews_df`` and ``top_n_neighbors``.

    * If the user has no rows in ``reviews_df``, the games with the highest
      ``positive_ratio * rating weight`` score are returned.
    * Otherwise the neighbours of every item the user reviewed are collected
      in order (the user's items in ``reviews_df`` order, each item's
      neighbours in stored order), skipping items the user already reviewed
      and items not present in ``games``. Duplicates keep their first
      position, so the result is deterministic.

    :param user_id: raw user id as it appears in ``reviews_df['userID']``
    :param top_n: maximum number of app ids to return
    :return: list of at most ``top_n`` app ids
    """
    # Get user rated items
    user_items = reviews_df[reviews_df['userID'] == user_id]['itemID'].tolist()
    if len(user_items) == 0:
        # If user has not rated any item, fall back to the best-scored games
        sorted_games = sorted(games, key=_cold_start_score, reverse=True)
        return [game['app_id'] for game in sorted_games[:top_n]]

    # Sets give O(1) membership tests instead of scanning lists
    game_ids = {game['app_id'] for game in games}
    rated = set(user_items)

    # Get the top N similar items to the items rated by the user.
    # A dict is used as an insertion-ordered set to de-duplicate.
    candidates = {}
    for user_item in user_items:
        for neighbor in top_n_neighbors.get(user_item, []):
            if neighbor not in rated and neighbor in game_ids:
                candidates.setdefault(neighbor, None)
    return list(candidates)[:top_n]


In [8]:
# Smoke test: print the games one user reviewed, then the model's suggestions
test_user_id = '51580'
game_dict = dict((game['app_id'], game) for game in games)
recommend_list = recommend(test_user_id)

print('[Recommended Games]')
user_mask = reviews_df['userID'] == test_user_id
for item in reviews_df.loc[user_mask, 'itemID'].tolist():
    print(game_dict[item])

print('[Model Recommendation List]')
for item in recommend_list:
    print(game_dict[item])

[推荐过的游戏]
{'app_id': '975370', 'title': 'Dwarf Fortress', 'rating': 'Overwhelmingly Positive', 'positive_ratio': '95'}
{'app_id': '1817190', 'title': 'Marvel’s Spider-Man: Miles Morales', 'rating': 'Very Positive', 'positive_ratio': '94'}
{'app_id': '1649080', 'title': 'Two Point Campus', 'rating': 'Very Positive', 'positive_ratio': '88'}
{'app_id': '590380', 'title': 'Into the Breach', 'rating': 'Very Positive', 'positive_ratio': '94'}
{'app_id': '379720', 'title': 'DOOM', 'rating': 'Overwhelmingly Positive', 'positive_ratio': '95'}
[模型推荐列表]
{'app_id': '1173510', 'title': 'XSOverlay', 'rating': 'Very Positive', 'positive_ratio': '91'}
{'app_id': '1766580', 'title': 'Diptych', 'rating': 'Positive', 'positive_ratio': '95'}
{'app_id': '1481400', 'title': 'Dagon: by H. P. Lovecraft', 'rating': 'Overwhelmingly Positive', 'positive_ratio': '96'}
{'app_id': '206190', 'title': 'Gunpoint', 'rating': 'Overwhelmingly Positive', 'positive_ratio': '97'}
{'app_id': '421660', 'title': 'Harmonia', 'ra