# Model 2: Content-based Filtering using TF-IDF and Cosine Similarity
- This model uses TF-IDF to vectorize the text data and Cosine Similarity for recommendations.
- Recommends games whose content (genres, developers, publishers, supported languages) is similar to the games a player already owns.
- input: top_10_percent_games, purchased_games

In [1]:
import ast
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Input files: per-player purchase records and the game catalogue.
# NOTE(review): the catalogue is presumably ordered by popularity, since the
# recommender below uses games.head(...) as its "popular games" fallback -- confirm.
purchases_path = "./clean_datasets/purchased_games_demo.csv"
games_path = "./data_model/top_10_percent_games.csv"

purchased_games = pd.read_csv(purchases_path)
games = pd.read_csv(games_path)

In [3]:
def preprocess_text(text):
    """Normalize one free-text feature value before TF-IDF vectorization.

    The value is lower-cased, every character that is neither a word
    character nor whitespace is removed, and each run of whitespace is then
    collapsed to a single space. Punctuation is stripped *before* whitespace
    is collapsed so that removing a stand-alone punctuation mark cannot
    leave a double space behind.

    Parameters
    ----------
    text : str or any
        Raw cell value. Missing values (None, float NaN, pd.NA) become an
        empty string; any other non-string value is converted with str().

    Returns
    -------
    str
        The normalized text. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    if not isinstance(text, str):
        # Missing cells come back from read_csv as float NaN, which has no .lower().
        if text is None or text is pd.NA or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Text columns that describe a game's content, in the order they are joined.
text_columns = ['genres', 'developers', 'publishers', 'supported_languages']

# Normalize each descriptive column in place.
for column in text_columns:
    games[column] = games[column].apply(preprocess_text)

# Join the normalized columns, separated by single spaces, into one string per
# game; this is the text that gets vectorized.
combined = games[text_columns[0]]
for column in text_columns[1:]:
    combined = combined + ' ' + games[column]
games['combined_features'] = combined

In [4]:
import os

similarity_cache_path = './data_model/cosine_similarity.csv'

# Build the similarity matrix only when no cached copy exists on disk.
if not os.path.exists(similarity_cache_path):

    # Vectorize each game's combined text features with TF-IDF.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    games_tfidf = tfidf_vectorizer.fit_transform(games['combined_features'])

    # Compute the pairwise cosine similarity between all games.
    cosine_sim = cosine_similarity(games_tfidf)

    # Label rows and columns with game ids and save the matrix to CSV for reuse.
    game_ids = games['gameid']
    cosine_sim_df = pd.DataFrame(cosine_sim, index=game_ids, columns=game_ids)
    cosine_sim_df.to_csv(similarity_cache_path, index=True)


In [92]:
# Load the similarity matrix from the CSV cache, using the 'gameid' column as
# the row index.
# NOTE(review): get_recommended_games converts game ids to str before looking
# up columns, presumably because the CSV header is read back as string
# labels -- confirm.
cosine_sim_df = pd.read_csv(
    './data_model/cosine_similarity.csv',
    index_col='gameid',
)

In [93]:
# Preview the loaded similarity matrix (rows and columns are both game ids);
# the rendering below is truncated, with "..." marking the elided rows/columns.
cosine_sim_df

Unnamed: 0_level_0,730,578080,550,218620,304930,1172470,230410,271590,407530,4000,...,715210,792030,497580,568930,669630,1621070,397400,337450,842420,448710
gameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730,1.000000,0.327048,0.960925,0.172464,0.034193,0.329850,0.243120,0.235455,0.260028,0.747302,...,0.006253,0.001757,0.176088,0.090653,0.037009,0.007814,0.041671,0.003480,0.046447,0.008325
578080,0.327048,1.000000,0.300592,0.116035,0.046123,0.355564,0.276273,0.212815,0.226508,0.322390,...,0.007279,0.002045,0.206922,0.061726,0.053164,0.091655,0.048512,0.004051,0.063964,0.018355
550,0.960925,0.300592,1.000000,0.179477,0.006317,0.294402,0.215208,0.245030,0.270601,0.777691,...,0.006507,0.001828,0.183249,0.094340,0.038514,0.008132,0.002242,0.003622,0.002498,0.008664
218620,0.172464,0.116035,0.179477,1.000000,0.006334,0.218075,0.123898,0.181504,0.144841,0.133110,...,0.006525,0.001833,0.131830,0.080560,0.038620,0.008154,0.002248,0.003632,0.002505,0.008688
304930,0.034193,0.046123,0.006317,0.006334,1.000000,0.056687,0.037846,0.033823,0.024676,0.013913,...,0.014543,0.011723,0.021468,0.008685,0.020159,0.057885,0.048240,0.023222,0.051443,0.025741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1621070,0.007814,0.091655,0.008132,0.008154,0.057885,0.021174,0.008648,0.043540,0.078359,0.013042,...,0.015034,0.010989,0.027636,0.007625,0.020240,1.000000,0.013473,0.021768,0.012024,0.028227
397400,0.041671,0.048512,0.002242,0.002248,0.048240,0.059622,0.046123,0.002565,0.015563,0.019549,...,0.014807,0.016473,0.018571,0.005714,0.009181,0.013473,1.000000,0.058401,0.062051,0.019714
337450,0.003480,0.004051,0.003622,0.003632,0.023222,0.004979,0.003852,0.004144,0.053854,0.074505,...,0.023922,0.026614,0.030004,0.009231,0.014832,0.021768,0.058401,1.000000,0.113403,0.031850
842420,0.046447,0.063964,0.002498,0.002505,0.051443,0.078614,0.051409,0.012979,0.052075,0.067942,...,0.006606,0.007349,0.016574,0.013601,0.021855,0.012024,0.062051,0.113403,1.000000,0.018783


In [7]:
def _parse_library(raw):
    """Convert one raw `library` cell into a Python object.

    Strings are parsed with ast.literal_eval, which accepts only Python
    literals, so text read from the CSV can never execute code (the previous
    eval() call could). Values that are already parsed are returned
    unchanged, which makes this cell safe to re-run. Missing values (None or
    float NaN) become an empty list.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    if raw is None or (isinstance(raw, float) and raw != raw):
        return []
    return raw


purchased_games['library'] = purchased_games['library'].apply(_parse_library)

In [28]:
# Ids of the first ten games in the catalogue.
games['gameid'].head(10)

0        730
1     578080
2        550
3     218620
4     304930
5    1172470
6     230410
7     271590
8     407530
9       4000
Name: gameid, dtype: int64

In [94]:
def get_recommended_games(playerid, cosine_sim_df, games_df, top_n=10, purchases_df=None):
    """Recommend games whose content is similar to the games a player owns.

    Each candidate game is scored by summing its cosine similarity to every
    owned game that is present in the similarity matrix. Owned games are
    removed from the ranking and the `top_n` best-scoring games are returned.
    When fewer than `top_n` candidates can be scored, the list is topped up
    with the first rows of `games_df` that are neither owned nor already
    recommended.

    Parameters
    ----------
    playerid : int
        Id of the player to recommend for.
    cosine_sim_df : pandas.DataFrame
        Square similarity matrix. Row labels are matched against the ids in
        the player's library as-is; column labels are looked up as str(id).
    games_df : pandas.DataFrame
        Game catalogue with a 'gameid' column; its row order is used for the
        fallback suggestions.
    top_n : int, default 10
        Number of games to return.
    purchases_df : pandas.DataFrame, optional
        Purchase table with 'playerid' and 'library' columns, where 'library'
        holds a list of game ids. Defaults to the notebook-level
        `purchased_games` frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'gameid' and 'similarity_score', best score first. Fallback
        rows carry NaN as their score.
    pandas.Series
        Only when the player's library is empty: the first `top_n` game ids
        of `games_df` (return shape kept for existing callers).

    Raises
    ------
    IndexError
        If `playerid` does not appear in the purchase table.
    """
    if purchases_df is None:
        purchases_df = purchased_games

    libraries = purchases_df.loc[purchases_df['playerid'] == playerid, 'library']
    if libraries.empty:
        raise IndexError(f"Player {playerid} was not found in the purchase data.")
    owned_games = libraries.iloc[0]

    # Empty library: fall back to the first games in the catalogue.
    if not owned_games:
        print(f"Player {playerid} has not purchased any games.")
        return games_df.head(top_n)['gameid']

    # Columns of the owned games that exist in the similarity matrix.
    owned_columns = [
        str(game_id) for game_id in owned_games if game_id in cosine_sim_df.index
    ]

    gameids, scores = [], []
    if owned_columns:
        # One vectorized row-sum instead of growing a Series with concat per game.
        similar_scores = (
            cosine_sim_df[owned_columns]
            .sum(axis=1)
            .groupby(level=0)  # a game id repeated in the index gets a single total
            .sum()
            .sort_values(ascending=False)
        )
        # Drop games the player already owns and keep the best top_n.
        recommended = similar_scores[~similar_scores.index.isin(owned_games)].head(top_n)
        gameids = recommended.index.tolist()
        scores = recommended.tolist()

    # Not enough scored candidates (owned games missing from the matrix):
    # top up from the catalogue, skipping owned and already recommended games.
    shortfall = top_n - len(gameids)
    if shortfall > 0:
        excluded = set(gameids).union(owned_games)
        candidates = games_df.loc[~games_df['gameid'].isin(excluded), 'gameid']
        filler = candidates.head(shortfall).tolist()
        gameids.extend(filler)
        scores.extend([float('nan')] * len(filler))

    return pd.DataFrame({'gameid': gameids, 'similarity_score': scores})



Note: the running time of the next cell depends on the number of games in the player's library.

In [95]:
# Example: print the ten recommendations for a single player.
playerid = 76561198060698936
recommendations = get_recommended_games(playerid, cosine_sim_df, games, top_n=10)
print(recommendations)

    gameid  similarity_score
0   705120        429.857788
1   225280        391.228232
2  1000010        371.744288
3   365590        371.167432
4  1011670        359.656264
5  2221490        355.841567
6  2161700        354.898236
7   375010        353.309892
8  1677280        352.593682
9  1062830        352.236347
