# Model 2: Content-based Filtering using TF-IDF and Cosine Similarity
- This model uses TF-IDF to vectorize the text data and Cosine Similarity for recommendations.
- Recommends games whose content (title, genres, developers, publishers, supported languages) is similar to the games a player already owns.
- Input: `top_10_percent_games` (game catalogue) and `purchased_games` (player libraries).

In [11]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Player purchase data; the recommender reads its `playerid` and `library` columns.
purchased_games = pd.read_csv("./clean_datasets/purchased_games_demo.csv")
# Game catalogue; its text columns (title, genres, developers, publishers,
# supported_languages) are cleaned below and used as the content features.
games = pd.read_csv("./clean_datasets/top_10_percent_games.csv")

In [13]:
def preprocess_text(text):
    """Normalise a free-text field before it is fed to the TF-IDF vectoriser.

    The value is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed to
    a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw cell value. Missing values (None / NaN, which is what
            pandas produces for empty CSV cells) are accepted; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower();
        # treat them as "no text" instead of crashing the whole .apply().
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    # Strip punctuation first: removing it can leave adjacent spaces
    # ("spanish - spain" -> "spanish  spain"), which the next step collapses.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean each descriptive text column of the catalogue in place.
text_columns = ['title', 'genres', 'developers', 'publishers', 'supported_languages']
for column_name in text_columns:
    games[column_name] = games[column_name].apply(preprocess_text)

# Join the cleaned columns, in the order listed above, into one
# space-separated document per game; this is the text that gets vectorised.
combined_text = games[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + games[column_name]
games['combined_features'] = combined_text

In [14]:
games

Unnamed: 0,gameid,title,developers,publishers,genres,supported_languages,release_date,purchases_count,combined_features
0,730,counterstrike 2,valve,valve,action free to play,czech danish dutch english finnish french germ...,2012-08-21,43967.0,counterstrike 2 action free to play valve valv...
1,578080,pubg battlegrounds,pubg corporation,krafton inc,action adventure massively multiplayer free to...,english korean simplified chinese french germa...,2017-12-21,28356.0,pubg battlegrounds action adventure massively ...
2,550,left 4 dead 2,valve,valve,action,danish dutch english finnish french german ita...,2009-11-16,24727.0,left 4 dead 2 action valve valve danish dutch ...
3,218620,payday 2,overkill a starbreeze studio,starbreeze publishing ab,action rpg,english german french italian spanish spain d...,2013-08-13,23785.0,payday 2 action rpg overkill a starbreeze stu...
4,304930,unturned,smartly dressed games,smartly dressed games,action adventure casual indie free to play,english,2017-07-07,21620.0,unturned action adventure casual indie free to...
...,...,...,...,...,...,...,...,...,...
9197,1621070,deadpoly,tfl games,tfl games,action adventure casual massively multiplayer ...,english,2022-01-12,224.0,deadpoly action adventure casual massively mul...
9198,397400,huebots,huebotics,huebotics,casual free to play indie strategy,english,2015-08-26,224.0,huebots casual free to play indie strategy hue...
9199,337450,dream tale,green lava studios,strategy first,casual indie,english,2014-12-16,224.0,dream tale casual indie green lava studios str...
9200,842420,a walk in the woods,something dark studios,something dark studios,adventure free to play indie,english,2018-05-01,224.0,a walk in the woods adventure free to play ind...


In [15]:
## Skip this step if you already have the cosine similarity matrix saved on disk

# Build TF-IDF vectors from each game's combined text features
# (English stop words are dropped by the vectoriser).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
games_tfidf = tfidf_vectorizer.fit_transform(games['combined_features'])

# Compute the pairwise cosine similarity between all games.
cosine_sim = cosine_similarity(games_tfidf)

# Label rows and columns of the similarity matrix with gameid so it can be
# looked up by game; the CSV export for reuse happens in a later cell.
cosine_sim_df = pd.DataFrame(cosine_sim, index=games['gameid'], columns=games['gameid'])


In [16]:
cosine_sim_df

gameid,730,578080,550,218620,304930,1172470,230410,271590,407530,4000,...,715210,792030,497580,568930,669630,1621070,397400,337450,842420,448710
gameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
730,1.000000,0.231011,0.834183,0.153619,0.030556,0.256807,0.212178,0.187115,0.225124,0.600407,...,0.005583,0.001493,0.121218,0.079193,0.031873,0.006772,0.035474,0.002652,0.042018,0.005316
578080,0.231011,1.000000,0.205573,0.081433,0.032473,0.218089,0.189967,0.133235,0.154351,0.204059,...,0.005121,0.001370,0.112209,0.042473,0.036052,0.062427,0.032538,0.002432,0.045591,0.009223
550,0.834183,0.205573,1.000000,0.154717,0.005476,0.221942,0.181867,0.188453,0.226733,0.604699,...,0.005623,0.001504,0.122084,0.079759,0.032101,0.006821,0.001854,0.002671,0.002195,0.005354
218620,0.153619,0.081433,0.154717,1.000000,0.005635,0.168704,0.107448,0.143248,0.124363,0.106206,...,0.005786,0.001547,0.090126,0.069892,0.033033,0.007019,0.001907,0.002748,0.002259,0.005509
304930,0.030556,0.032473,0.005476,0.005635,1.000000,0.043995,0.032929,0.026818,0.021317,0.011172,...,0.012986,0.009957,0.014752,0.007569,0.017327,0.050113,0.041084,0.017684,0.046529,0.016424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1621070,0.006772,0.062427,0.006821,0.007019,0.050113,0.015919,0.007298,0.033405,0.065407,0.010134,...,0.012989,0.009032,0.018376,0.006425,0.016825,1.000000,0.011133,0.016041,0.010529,0.017424
397400,0.035474,0.032538,0.001854,0.001907,0.041084,0.044082,0.038229,0.001942,0.012833,0.014959,...,0.012603,0.013333,0.012178,0.004754,0.007531,0.011133,1.000000,0.042381,0.053482,0.011999
337450,0.002652,0.002432,0.002671,0.002748,0.017684,0.003295,0.002858,0.002798,0.039606,0.050846,...,0.018159,0.019210,0.017546,0.006850,0.010851,0.016041,0.042381,1.000000,0.087457,0.017289
842420,0.042018,0.045591,0.002195,0.002259,0.046529,0.061768,0.045281,0.010412,0.045614,0.055253,...,0.005974,0.006320,0.011517,0.012005,0.019017,0.010529,0.053482,0.087457,1.000000,0.012126


In [None]:
# Persist the similarity matrix (gameid index included) so the TF-IDF /
# cosine-similarity step can be skipped on later runs.
cosine_sim_df.to_csv('./data_model/cosine_similarity.csv', index=True)

In [18]:
# Reload the precomputed similarity matrix; rows are indexed by gameid.
cosine_sim_df = pd.read_csv('./data_model/cosine_similarity.csv', index_col='gameid')
# read_csv returns header labels as strings while the gameid index is parsed
# as integers. Restore integer column labels so that integer lookups such as
# cosine_sim_df[game_id] behave the same as on the freshly computed matrix.
cosine_sim_df.columns = cosine_sim_df.columns.astype('int64')

In [18]:
def get_recommended_games(playerid, cosine_sim_df, games_df, top_n=10,
                          max_owned_games=1000, purchases_df=None):
    """Recommend games whose content is similar to what a player already owns.

    A candidate's score is the sum of its cosine similarities to the player's
    owned games. Owned games are excluded and the ``top_n`` best-scoring
    candidates are returned. If fewer than ``top_n`` candidates can be scored
    (or the player has no recorded purchases), the result is topped up with
    the first rows of ``games_df`` that are neither owned nor already listed.

    Args:
        playerid: Value matched against the ``playerid`` column of the
            purchases table.
        cosine_sim_df: Game-by-game similarity matrix indexed by gameid.
            Column labels may be the ids themselves or their string form
            (which is what ``pd.read_csv`` yields for a reloaded matrix).
        games_df: Game catalogue containing ``gameid`` and the descriptive
            columns returned below. Fallback rows are taken from its top, so
            it is presumably sorted by popularity -- verify against the
            dataset.
        top_n: Maximum number of recommendations to return.
        max_owned_games: Only the first this-many owned games contribute to
            the score.
        purchases_df: Table with ``playerid`` and ``library`` columns, where
            ``library`` holds a list literal of gameids. Defaults to the
            module-level ``purchased_games``.

    Returns:
        A DataFrame with columns ``gameid``, ``similarity_score``, ``title``,
        ``genres``, ``developers``, ``publishers`` and
        ``supported_languages``. ``similarity_score`` is NaN for fallback
        rows that were not scored.
    """
    import ast  # function-scope import: only needed to parse the library column

    output_columns = ['gameid', 'similarity_score', 'title', 'genres',
                      'developers', 'publishers', 'supported_languages']

    def _popular_games(excluded_ids, count):
        """Return up to `count` unscored catalogue rows, top first, skipping `excluded_ids`."""
        candidates = games_df.loc[~games_df['gameid'].isin(excluded_ids), ['gameid']].head(count)
        return candidates.assign(similarity_score=float('nan')).reset_index(drop=True)

    def _with_game_info(recommendations):
        """Attach the descriptive catalogue columns to a gameid/score table."""
        merged = recommendations.merge(games_df, on='gameid', how='left')
        return merged[output_columns]

    if purchases_df is None:
        purchases_df = purchased_games

    libraries = purchases_df.loc[purchases_df['playerid'] == playerid, 'library']
    raw_library = libraries.iloc[0] if len(libraries) else None
    if isinstance(raw_library, str) and raw_library.strip():
        # NOTE(security): this used to be eval(), which would execute any
        # code stored in the CSV. literal_eval only accepts Python literals.
        owned_games = list(ast.literal_eval(raw_library))
    elif isinstance(raw_library, (list, tuple, set)):
        owned_games = list(raw_library)
    else:
        # Unknown player or missing (NaN) library: nothing to base scores on.
        owned_games = []

    # Without any purchases, fall back to the top of the catalogue, returned
    # in the same tabular shape as regular recommendations.
    if not owned_games:
        print(f"Player {playerid} has not purchased any games.")
        return _with_game_info(_popular_games([], top_n))

    # Collect the matrix column for every owned game present in the matrix.
    # Integer ids are tried first, then their string form, so a matrix
    # reloaded from CSV (string headers) works as well.
    known_ids = cosine_sim_df.index
    column_labels = cosine_sim_df.columns
    score_columns = []
    for game_id in owned_games[:max_owned_games]:
        if game_id not in known_ids:
            continue
        if game_id in column_labels:
            score_columns.append(game_id)
        elif str(game_id) in column_labels:
            score_columns.append(str(game_id))

    if score_columns:
        # One column selection plus a row-wise sum replaces the repeated
        # pd.concat in a loop, which re-copied the accumulated scores each time.
        scores = cosine_sim_df[score_columns].sum(axis=1)
        scores = scores.groupby(level=0).sum()  # merge duplicated gameids, if any
        scores = scores[~scores.index.isin(owned_games)]
        scores = scores.sort_values(ascending=False, kind='stable').head(top_n)
        recommended_games = scores.rename_axis('gameid').reset_index(name='similarity_score')
    else:
        recommended_games = pd.DataFrame(columns=['gameid', 'similarity_score'])

    # Not enough scored candidates (owned games missing from the matrix):
    # top up from the catalogue, skipping owned and already recommended games.
    missing = top_n - len(recommended_games)
    if missing > 0:
        excluded_ids = list(owned_games) + recommended_games['gameid'].tolist()
        filler = _popular_games(excluded_ids, missing)
        if recommended_games.empty:
            recommended_games = filler
        elif not filler.empty:
            recommended_games = pd.concat([recommended_games, filler], ignore_index=True)

    return _with_game_info(recommended_games)



In [19]:
# Example: show the top 10 content-based recommendations for one player.
playerid = 76561197969960651
get_recommended_games(playerid, cosine_sim_df, games, 10)

Unnamed: 0,gameid,similarity_score,title,genres,developers,publishers,supported_languages
0,1677280,41.1731,company of heroes 3,action strategy,relic entertainment,sega,english french italian german spanish spain c...
1,527230,37.663281,for the king,adventure indie rpg strategy,ironoak games,curve games,english french italian german spanish spain p...
2,1502190,37.206479,from space,action indie,triangle studios,curve games,english french italian german spanish spain p...
3,1272080,37.204022,payday 3,action adventure rpg,starbreeze studios,deep silver,english french italian german spanish spain j...
4,1676840,36.992264,for the king ii,adventure indie rpg strategy,ironoak games,curve games,english french italian german spanish spain j...
5,962130,36.983128,grounded,action adventure,obsidian entertainment,xbox game studios,english french italian german spanish latin a...
6,705120,36.950828,death coming死神来了,adventure casual indie strategy,next studios,next studios,english simplified chinese japanese traditiona...
7,548570,36.928179,rage 2,action,id software avalanche studios,bethesda softworks,english french italian german spanish spain j...
8,2183900,36.84823,warhammer 40000 space marine 2,action adventure rpg,saber interactive,focus entertainment,english french italian german spanish spain r...
9,644830,36.665439,the surge 2,action rpg,deck13,focus entertainment,english french italian german spanish spain s...


In [20]:
# Example: the same query for a second player, to compare the results.
playerid = 76561197960409700
get_recommended_games(playerid, cosine_sim_df, games, 10)

Unnamed: 0,gameid,similarity_score,title,genres,developers,publishers,supported_languages
0,450390,18.872811,the lab,free to play,valve,valve,english french german czech finnish hungarian ...
1,546560,18.365341,halflife alyx,action adventure,valve,valve,english french german spanish spain japanese ...
2,583950,18.138085,artifact,strategy,valve,valve,english french italian german spanish spain j...
3,1046930,16.823044,dota underlords,casual free to play strategy,valve,valve,english french italian german spanish spain b...
4,1902490,16.559976,aperture desk job,action adventure casual free to play,valve,valve,english french italian german spanish spain b...
5,1252330,12.472122,deathloop,action,arkane studios,bethesda softworks,english french italian german spanish spain a...
6,1238810,12.457664,battlefield v,action,dice,electronic arts,english french italian german spanish spain a...
7,1151340,12.184651,fallout 76,rpg,bethesda game studios,bethesda softworks,english french italian german spanish spain j...
8,548570,12.093524,rage 2,action,id software avalanche studios,bethesda softworks,english french italian german spanish spain j...
9,1502190,12.055426,from space,action indie,triangle studios,curve games,english french italian german spanish spain p...
