In [1]:
import pandas as pd
import numpy as np
import ast
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [2]:
games = pd.concat([pd.read_csv('../data/steam_games_cleaned_1.csv'), pd.read_csv('../data/steam_games_cleaned_2.csv'), pd.read_csv('../data/steam_games_cleaned_3.csv')], axis=0)
print(games.shape)
games.head()

(56733, 506)


Unnamed: 0,appid,name,genre,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,...,Web Publishing,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,10,Counter-Strike,Action,Valve,Valve,"10,000,000-20,000,000",11666,244,0.974693,189081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837
1,20,Team Fortress Classic,Action,Valve,Valve,"2,000,000-5,000,000",91,18,0.857002,6105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30,Day of Defeat,Action,Valve,Valve,"5,000,000-10,000,000",403,26,0.900055,5423,...,0.0,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0
3,40,Deathmatch Classic,Action,Valve,Valve,"5,000,000-10,000,000",33,6,0.816689,2193,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50,Half-Life: Opposing Force,Action,Gearbox Software,Valve,"5,000,000-10,000,000",322,127,0.951749,13119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
games['has_tag'] = [games.iloc[i,72:].drop(columns = ['Free to Play','Early Access']).values.max() for i in range(len(games))]
games['has_tag'].value_counts()

has_tag
1.0    48397
0.0     8336
Name: count, dtype: int64

In [4]:
# sistem rekomendasi nanti akan menggunakan rasio tag jadi kita perlu untuk mensortir tag yang kosong
games = games.loc[games['has_tag'] > 0]

search_df = games.loc[ : , ['appid','name']]
search_df['name'] = [name.lower() for name in search_df['name']]
search_df.set_index('name', inplace=True)

In [5]:
# untuk indeks kita menggunakan appid karena nanti ada beberapa game dengan nama yang sama
games.set_index('appid', inplace=True)
games.sort_index(inplace=True)
print(games.shape)
games.head()

(48397, 506)


Unnamed: 0_level_0,name,genre,developer,publisher,owners,average_forever,median_forever,pos_rating_pct,total_ratings,price,...,Well-Written,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports,has_tag
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,Counter-Strike,Action,Valve,Valve,"10,000,000-20,000,000",11666,244,0.974693,189081,9.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217837,1.0
20,Team Fortress Classic,Action,Valve,Valve,"2,000,000-5,000,000",91,18,0.857002,6105,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,Day of Defeat,Action,Valve,Valve,"5,000,000-10,000,000",403,26,0.900055,5423,4.99,...,0.0,0.0,0.0,0.0,0.015287,0.313376,0.0,0.0,0.0,1.0
40,Deathmatch Classic,Action,Valve,Valve,"5,000,000-10,000,000",33,6,0.816689,2193,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,Half-Life: Opposing Force,Action,Gearbox Software,Valve,"5,000,000-10,000,000",322,127,0.951749,13119,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
search = 'counter-strike'.lower()
titles = search_df[search_df.index.str.contains(search)]
print(titles)

                                   appid
name                                    
counter-strike                        10
counter-strike: condition zero        80
counter-strike: condition zero       100
counter-strike: source               240
counter-strike: global offensive     730
counter-strike nexon: studio      273110


In [7]:
search_df.loc[search,'appid']

np.int64(10)

In [8]:
%%time
recommender = games.loc[ : , '1980s' : 'e-sports'].drop(columns = ['Free to Play','Kickstarter','Early Access'])
sparse_rec = sparse.csr_matrix(recommender)
dists = pairwise_distances(sparse_rec, metric='cosine')
recommender_df = pd.DataFrame(dists, columns=recommender.index, index=recommender.index)

CPU times: total: 1min 51s
Wall time: 14min 28s


In [9]:
top_recommendations = recommender_df[search_df.loc[search,'appid']].sort_values()[1:21]
top_recs_df = pd.DataFrame(top_recommendations)
top_recs_df['name'] = [games.loc[ind]['name'] for ind in top_recs_df.index]
top_recs_df[['name',top_recommendations.name]].rename(columns={'name':'Game Name', top_recommendations.name : f'Similarity to {search} (lower is better)'})

Unnamed: 0_level_0,Game Name,Similarity to counter-strike (lower is better)
appid,Unnamed: 1_level_1,Unnamed: 2_level_1
80,Counter-Strike: Condition Zero,0.054881
100,Counter-Strike: Condition Zero,0.055738
360,Half-Life Deathmatch: Source,0.111348
240,Counter-Strike: Source,0.126721
20,Team Fortress Classic,0.128983
2350,QUAKE III: Team Arena,0.136147
320,Half-Life 2: Deathmatch,0.147429
7940,Call of Duty 4: Modern Warfare,0.148882
239660,Soldier Front 2,0.149587
458560,Plastic Playground,0.152615


In [None]:
%%time
top100_df = pd.DataFrame(columns = ['games'])
for game_index in list(recommender_df.index):
    top100 = recommender_df[game_index].sort_values()[1:101]
    top100_df.loc[game_index] = str(list(top100.index))
    print(' '*50, end = '\r')
    print(f'{int(round(top100_df.shape[0]/games.shape[0]*100,0))}% complete', end= '\r')
print(' '*50, end = '\r')
top100_df.to_csv('../data/top100_simils.csv')

98% complete                                      

In [None]:
top100_df.loc[search_df.loc[search]]

In [None]:
top100_df.loc[search_df.loc[search]].values[0][0]

In [None]:
results = ast.literal_eval(top100_df.loc[search_df.loc[search]].values[0][0])
search_range = 20
games.loc[results][ : search_range]

In [None]:
search_df.to_csv('../data/search_keys.csv')

In [None]:
search_df.head()

In [None]:
search = 'Counter'.lower()
titles = list(search_df[search_df.index.str.contains(search)].index)
list(games.loc[search_df.loc[titles]['appid'].values]['name'].values)