In [71]:
# Content-based recommender that uses only the genre feature. Run the content-based combined-features notebook before executing this one.
from pandas import read_csv, DataFrame, concat
import pathlib
import matplotlib.pyplot as plt

In [72]:
# Training split of the Steam user/game interactions.
locationUsersFile = pathlib.Path(
    "C:\\Users\\z004cnpx\\Desktop\\steam-user-game-train.csv"
)
dataUsers = read_csv(filepath_or_buffer=locationUsersFile)

In [73]:
# Store the user id as an integer and add an "ID" column, blank for now,
# that is meant to hold a normalised form of the game name.
dataUsers = dataUsers.assign(UserID=dataUsers['UserID'].astype(int), ID="")
dataUsers

Unnamed: 0,UserID,Game,Action,Hours,ID
0,151603712,The Elder Scrolls V Skyrim,play,273.0,
1,151603712,Fallout 4,play,87.0,
2,151603712,Spore,play,14.9,
3,151603712,Fallout New Vegas,play,12.1,
4,151603712,HuniePop,play,8.5,
...,...,...,...,...,...
56377,128470551,Fallen Earth,play,2.4,
56378,128470551,Magic Duels,play,2.2,
56379,128470551,Titan Souls,play,1.5,
56380,128470551,Grand Theft Auto Vice City,play,1.5,


In [74]:
# Build the "ID" slug of every game name: strip everything that is not an
# ASCII letter or digit, then lower-case the remainder.
# Done with vectorised string methods instead of a row-by-row loop; a missing
# game name now yields a missing ID instead of raising TypeError.
import re  # no longer needed here; kept in case later cells rely on `re` being imported
dataUsers['ID'] = (
    dataUsers['Game']
    .str.replace(r'[^A-Za-z0-9]+', '', regex=True)
    .str.lower()
)

In [75]:
## get usedGames!!
locationGamesFile = pathlib.Path(
    "C:\\Users\\z004cnpx\\Desktop\\steam-games-processed.csv"
)
# Rows without a genre cannot be vectorised later, so they are discarded
# straight after loading (the surviving rows keep their original index labels).
usedGames = read_csv(locationGamesFile).dropna(subset=['genre'])

In [76]:
from pandas import Series, concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Keep one row per game name (the last occurrence wins). drop_duplicates
# returns a new frame, so the result has to be assigned back - calling it
# without assignment leaves usedGames unchanged.
# The index is then renumbered 0..n-1 so that index labels equal row
# positions; the similarity matrix built from this frame is positional, and
# the NaN-genre filter applied earlier leaves gaps in the labels.
usedGames = (
    usedGames
    .drop_duplicates(subset=['name'], keep='last')
    .reset_index(drop=True)
)
# get list of games we have info about
listGames = usedGames['name'].unique()

In [77]:
# How many games are recommended to each user.
n_recommendation = 10
# Column layout of the recommendation table: the user id followed by one
# column per rank, labelled "1" .. str(n_recommendation).
col_names = ["UserID"] + [str(rank) for rank in range(1, n_recommendation + 1)]

In [78]:
# CountVectorizer (from sklearn.feature_extraction.text) learns a vocabulary
# from the genre strings and then encodes every game as a row of token counts.
# NOTE(review): the default tokenizer splits on word boundaries, so a
# multi-word genre such as "Free to Play" becomes several tokens - confirm
# that this is intended.
cv = CountVectorizer().fit(usedGames['genre'])
count_matrix = cv.transform(usedGames['genre'])
print("Count Matrix: ",count_matrix.toarray())

Count Matrix:  [[0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


In [79]:
# Pairwise cosine similarity between the genre count vectors of all games;
# row i / column j compare the games at row positions i and j of count_matrix.
cosine_sim = cosine_similarity(X=count_matrix)

In [80]:
# Store each game's row number in an explicit "index" column.
# The frame is renumbered 0..n-1 first: rows were filtered after loading, so
# the old index labels have gaps, while cosine_sim is addressed by row
# position. Row order is untouched, so positions still line up with the
# similarity matrix, and after the reset label == position for every row.
usedGames = usedGames.reset_index(drop=True)
usedGames['index'] = usedGames.index
usedGames.head(3)

Unnamed: 0,name,popular_tags,game_details,genre,ID,index
0,DayZ,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Multi-player,Online Multi-Player,Steam Worksho...","Action,Adventure,Massively Multiplayer",dayz,0
1,EVE Online,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Multi-player,Online Multi-Player,MMO,Co-op,Onl...","Action,Free to Play,Massively Multiplayer,RPG,...",eveonline,1
2,TERA,"Free to Play,MMORPG,Massively Multiplayer,RPG,...","Multi-player,MMO,Co-op,Steam Trading Cards,Par...","Action,Adventure,Free to Play,Massively Multip...",tera,2


In [81]:
## recommendation for a game
game_user_like = "Dota 2"


def get_index_from(game):
    """Return the "index" column value of the first usedGames row named `game`.

    Raises IndexError when no row carries that name.
    """
    matching_index_values = usedGames.loc[usedGames.name == game, 'index']
    return matching_index_values.values[0]


game_index = get_index_from(game_user_like)
game_index

118

In [82]:
# recommend similar games without considering user
similar_games = list(enumerate(cosine_sim[game_index]))
sorted_similar_games = sorted(similar_games, key = lambda x:x[1], reverse = True)


def get_title_from_index(index):
    """Return the name of the game at row position `index` of usedGames.

    The numbers produced by enumerate(cosine_sim[...]) are row positions of
    the similarity matrix, so the lookup has to be positional (.iloc);
    matching against index labels picks the wrong game - or none at all -
    whenever the labels are not a gap-free 0..n-1 range.
    """
    return usedGames["name"].iloc[index]


# Print the 16 best-scoring games; the query game itself normally ranks first.
for position, _score in sorted_similar_games[:16]:
    print(get_title_from_index(position))

Dota 2
DmC: Devil May Cry
Anoxemia
SMITE®
Team Fortress 2
Warframe
Counter-Strike: Global Offensive
If My Heart Had Wings
The Baconing
oO
140
Space Rangers HD: A War Apart
FootLOL: Epic Fail League
Weird Worlds: Return to Infinite Space
Famaze
Secrets of Rætikon


In [83]:
def get_recommendations(title, cosine_sim):
    """Return the names of the games most similar to `title`.

    Parameters
    ----------
    title : game name, compared against usedGames['name'].
    cosine_sim : square similarity matrix whose row/column i refers to the
        game at row position i of usedGames.

    Returns
    -------
    list of up to n_recommendation game names, most similar first. The list
    is empty when `title` is unknown or its row lies outside `cosine_sim`.
    """
    # Find the game by row position rather than through stored index labels:
    # cosine_sim is positional, and labels stop matching positions as soon as
    # rows have been filtered out of usedGames.
    name_matches = (usedGames['name'] == title).to_numpy()
    if not name_matches.any():
        return []
    idx = int(name_matches.argmax())  # position of the first matching row
    if idx >= len(cosine_sim):
        return []

    # Leave the query game out explicitly. Dropping "the first sorted entry"
    # is not reliable: other games with an identical genre set tie at the top
    # score and may sort ahead of the query game.
    sim_scores = [(position, score)
                  for position, score in enumerate(cosine_sim[idx])
                  if position != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    game_indices = [position for position, _score in sim_scores[:n_recommendation]]
    return usedGames['name'].iloc[game_indices].tolist()


In [84]:
def make_recommendation_for_user(user_id, suggestion_list, game_user_have):
    """Build the one-row recommendation table for a single user.

    Parameters
    ----------
    user_id : value written to the "UserID" column.
    suggestion_list : list of candidate game names; anything that is not a
        non-empty list produces a row of empty recommendations.
    game_user_have : names of games the user already owns; these are never
        recommended.

    Returns
    -------
    DataFrame with columns `col_names` and exactly one row: the user id
    followed by n_recommendation game names, padded with "" when fewer
    candidates remain.
    """
    if not isinstance(suggestion_list, list) or len(suggestion_list) == 0:
        # return empty one
        return DataFrame(data=[[user_id] + [""] * n_recommendation], columns=col_names)

    # Candidates known to usedGames, minus what the user already owns,
    # ordered by genre string (descending).
    recommendation = usedGames.loc[usedGames['name'].isin(suggestion_list)]
    recommendation = recommendation.loc[~recommendation['name'].isin(game_user_have)]
    recommendation = recommendation.sort_values(by="genre", ascending=False)
    # usedGames may list a game more than once; never recommend the same
    # name twice to one user.
    recommendation = recommendation.drop_duplicates(subset="name", keep="first")

    # Truncate to n_recommendation names and pad with "" up to that length.
    names = recommendation["name"].tolist()[:n_recommendation]
    names += [""] * (n_recommendation - len(names))
    return DataFrame(data=[[user_id] + names], columns=col_names)


In [85]:
output_file = pathlib.Path("C:\\Users\\z004cnpx\\Desktop\\content_based_recommender_genre.csv")

# Build one recommendation row per user and write the table to output_file.
# - Rows are grouped by UserID (sort=False keeps first-appearance order), so
#   no row is emitted for a placeholder "previous user" before the first real
#   one, and users whose rows are not contiguous still get a single row.
# - Per-user frames are collected in a list and concatenated once, instead of
#   growing the result frame inside the loop.
# - Similar-game lists are cached per title because many users own the same
#   games.
recommendationCache = dict()
userRecommendationFrames = list()

for userId, userRows in dataUsers.groupby("UserID", sort=False):
    listGamesUserHas = userRows["Game"].tolist()
    listSuggestion = list()
    for game in listGamesUserHas:
        if game not in recommendationCache:
            recommendationCache[game] = get_recommendations(game, cosine_sim)
        listSuggestion.extend(recommendationCache[game])
    userRecommendationFrames.append(
        make_recommendation_for_user(userId, listSuggestion, listGamesUserHas))

if userRecommendationFrames:
    recommendationByUserData = concat(userRecommendationFrames, ignore_index=True)
else:
    # no users at all: keep an empty table with the expected columns
    recommendationByUserData = DataFrame(columns=col_names)
recommendationByUserData.to_csv(output_file, index=False)

In [86]:
# Load the held-out test split: the file's own header line is skipped and the
# first two columns are read under the names "UserID" and "Game".
locationUserFile = pathlib.Path(
    "C:\\Users\\z004cnpx\\Desktop\\steam-user-game-test.csv"
)
dataUsersTest = read_csv(
    locationUserFile,
    names=['UserID', 'Game'],
    usecols=[0, 1],
    header=None,
    skiprows=1,
)
dataUsersTest

Unnamed: 0,UserID,Game
0,49462664,Hero Siege
1,113224251,DiRT Showdown
2,106057222,Dota 2
3,103260848,Crysis
4,163073929,Dota 2
...,...,...
14090,158974308,Motorbike
14091,106816274,Counter-Strike Source
14092,47457723,BioShock
14093,48798067,Star Wars Jedi Knight Jedi Academy


In [87]:
# Map every test-set UserID to the sub-frame holding that user's test rows.
dataUsersTestDict = {
    test_user_id: test_rows
    for test_user_id, test_rows in dataUsersTest.groupby('UserID')
}
    

In [88]:
##  ACCURACY CALCULATION after genre
# A user counts as a hit when at least one recommended game also appears in
# that user's test rows. The rate is hits / number of recommendation rows.
common_counter = 0

# Rank columns follow n_recommendation instead of a hard-coded 1..10.
rank_columns = [str(rank) for rank in range(1, n_recommendation + 1)]
# Index the recommendations by UserID once, so each user is a single lookup
# instead of a full boolean scan per rank column. Only the first row per
# UserID is kept if an id occurs more than once.
recommendedByUser = (
    recommendationByUserData
    .drop_duplicates(subset="UserID", keep="first")
    .set_index("UserID")[rank_columns]
)

for userId, userTestRows in dataUsersTestDict.items():
    # find the row for this key (UserID) in recommendationByUserData
    if userId not in recommendedByUser.index:
        continue
    # recommendations are available for this user
    tempGamesDictList = userTestRows.Game.tolist()
    tempGamesRecommendedList = recommendedByUser.loc[userId].tolist()

    common_elements = list(set(tempGamesDictList).intersection(tempGamesRecommendedList))
    if len(common_elements) > 0:
        print(common_elements, "is the common element(s) for this userId:", userId)
        common_counter += 1

# Guard against an empty recommendation table (would divide by zero).
total_recommendation_rows = recommendationByUserData.shape[0]
print("Exact game match rate:",
      common_counter / total_recommendation_rows if total_recommendation_rows else 0.0)

['Age of Wonders III'] is the common element(s) for this userId: 2753525
['Poly Bridge'] is the common element(s) for this userId: 16081636
['Company of Heroes 2'] is the common element(s) for this userId: 17567828
['Team Fortress 2'] is the common element(s) for this userId: 22088241
['Company of Heroes 2'] is the common element(s) for this userId: 22301321
['Victoria II'] is the common element(s) for this userId: 31187179
['Victoria II'] is the common element(s) for this userId: 32126281
['Project CARS'] is the common element(s) for this userId: 32577778
['Homeworld Remastered Collection'] is the common element(s) for this userId: 34177747
['Portal'] is the common element(s) for this userId: 40289887
['Company of Heroes 2'] is the common element(s) for this userId: 43955374
['Company of Heroes 2'] is the common element(s) for this userId: 45774794
['Company of Heroes 2'] is the common element(s) for this userId: 46055854
['Borderlands 2'] is the common element(s) for this userId: 511