In [1]:
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
DATA_DIR = Path("../data/steam")


In [3]:
games_df = pd.read_pickle(DATA_DIR / "steam_games.df.pkl").drop(columns=['url', 'reviews_url'])
games_df = games_df.drop_duplicates(subset='app_name')
games_df

Unnamed: 0,publisher,genres,app_name,title,release_date,tags,discount_price,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,2017-12-07,"[Action, Adventure, Casual]",0.83,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,,"[Action, Indie, Casual, Sports]",1.79,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.49,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS""",,
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,2018-01-04,"[Strategy, Indie, Casual]",4.24,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada,,
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,2018-01-04,"[Indie, Simulation, Racing]",1.39,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,,
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",1 user reviews,


In [4]:
games_df = games_df.dropna(subset='tags')
games_df['tags'] = games_df['tags'].apply(lambda x: " ".join(x))
games_df = games_df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df['tags'] = games_df['tags'].apply(lambda x: " ".join(x))


In [5]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(games_df['tags'])

In [6]:
cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_similarity

array([[1.        , 0.05813907, 0.34349958, ..., 0.34225325, 0.17646836,
        0.32295669],
       [0.05813907, 1.        , 0.11450694, ..., 0.01613015, 0.00952271,
        0.01067308],
       [0.34349958, 0.11450694, 1.        , ..., 0.20248375, 0.10440215,
        0.13398046],
       ...,
       [0.34225325, 0.01613015, 0.20248375, ..., 1.        , 0.05669009,
        0.20962022],
       [0.17646836, 0.00952271, 0.10440215, ..., 0.05669009, 1.        ,
        0.03751098],
       [0.32295669, 0.01067308, 0.13398046, ..., 0.20962022, 0.03751098,
        1.        ]])

In [9]:
import numpy as np

def get_recommendations(game_title, cosine_sim, df, num_recommend=10):
    idx = df[df['app_name'] == game_title].index[0]

    similarity_scores = cosine_sim[idx]

    top_similar_indices = np.argsort(similarity_scores)[::-1][1:num_recommend+1]

    return df['app_name'].iloc[top_similar_indices]

In [15]:
get_recommendations('Counter-Strike', cosine_similarity, games_df, num_recommend=10)

31810                       Counter-Strike: Source
31915               Counter-Strike: Condition Zero
1848                                    Insurgency
22445    Umbrella Corps™/Biohazard Umbrella Corps™
1030              Counter-Strike: Global Offensive
16479                                  Black Squad
7118                                      Aim Hero
31274                  Battlefield: Bad Company™ 2
2331                                       Warface
30959                               Orcs Must Die!
Name: app_name, dtype: object