# Game Recommendation System for Steam Data Set
## Team : Michal Lyskawinski, Shalin Barot, Shrawan Sapre
### Final Project - CS541 - Artficial Intelligence
#### Date: 11/30/2018

In [1]:
#Importin packages
import csv
import pandas as pd
import numpy as np
import matplotlib as plt
from tqdm import tqdm
import pickle
import random 

## 1. Data Preparation

In [2]:
#Load the dataset
headers = ['user_id', 'game', 'behavior', 'play_time', '0']
steam_data = pd.read_csv('steam-200k.csv', sep=',', names=headers)

steam_data = steam_data.drop(['0'],axis=1)

steam_data = steam_data.sort_values(by=['behavior'])
# steam_data.set_index(range(0,steam_data.shape[0],1))
steam_data.head()

Unnamed: 0,user_id,game,behavior,play_time
199999,128470551,RUSH,play,1.4
70753,43955374,Orcs Must Die! 2,play,17.1
70751,43955374,XCOM Enemy Unknown,play,17.3
154701,32126281,Medieval II Total War,play,14.4
70749,43955374,Anno 2070,play,17.5


In [217]:
steam_data.reset_index(drop=True, inplace=True)
steam_data.head()

Unnamed: 0,user_id,game,behavior,play_time
0,128470551,RUSH,play,1.4
1,43955374,Orcs Must Die! 2,play,17.1
2,43955374,XCOM Enemy Unknown,play,17.3
3,32126281,Medieval II Total War,play,14.4
4,43955374,Anno 2070,play,17.5


In [3]:
steam_data_play = steam_data.loc[steam_data.behavior=='play']
steam_data_play.tail()
steam_data_purchase = steam_data.loc[steam_data.behavior=='purchase']
steam_data_purchase.head()
games_names = steam_data_play['game'].unique().tolist()
unique_ids = steam_data_play['user_id'].unique().tolist()

In [219]:
user_id_groups = steam_data_play.groupby("user_id")

In [220]:
print(user_id_groups.get_group(43955374))

         user_id                                               game  behavior  \
180736  43955374                                  Lego Harry Potter  purchase   
180737  43955374                                 King Arthur's Gold  purchase   
180738  43955374       Warhammer 40,000 Dawn of War  Winter Assault  purchase   
180739  43955374         Warhammer 40,000 Dawn of War  Dark Crusade  purchase   
180740  43955374                                           Overlord  purchase   
180741  43955374                                         HELLDIVERS  purchase   
180742  43955374                   Tom Clancy's Rainbow Six Vegas 2  purchase   
180743  43955374                                        Endless Sky  purchase   
180744  43955374                                        Hammerwatch  purchase   
180745  43955374                               Villagers and Heroes  purchase   
180746  43955374                           Amnesia The Dark Descent  purchase   
180747  43955374            

In [299]:
beautiful_df = pd.DataFrame(0, index=unique_ids, columns=games_names)

In [300]:
beautiful_df.head(15)

Unnamed: 0,Amnesia The Dark Descent,Unturned,Aliens Colonial Marines,Champions Online,Grand Theft Auto Vice City,Quake Live,Grand Theft Auto San Andreas,Yet Another Zombie Defense,Tomb Raider Chronicles,AdVenture Capitalist,...,Realm of Perpetual Guilds,Agapan,Desktop Dungeons Goatperson DLC,Desktop Dungeons Soundtrack,Diehard Dungeon,Dr.Green,Dungeon Crawlers HD,EverQuest II Rise of Kunark,EverQuest II The Shadow Odyssey,Butsbal
10450544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260017289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
168163793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36557643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
165608075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
142793906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116564064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108264287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113300324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
155919035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [301]:
try:
    with open('beautiful_df.pkl','rb') as f:
        beautiful_df = pickle.load(f)
except:
    for i in tqdm(range(0, len(unique_ids),1)):
        user_id = unique_ids[i]
        user_group = user_id_groups.get_group(user_id)
        for game_name in user_group['game']:
            beautiful_df[game_name][user_id] = 1
    with open('beautiful_df.pkl', 'wb') as f:
        pickle.dump(beautiful_df, f)    

In [302]:
average_games_played = np.mean(beautiful_df.sum(axis=1).values)
print(average_games_played)

10.3932865327


#### Splitting into train and test sets

In [303]:
# 80/20 Data Split
try:
    with open('test_users.pkl','rb') as f:
        test_users = pickle.load(f)
except:
    n = 0
    test_users = []
    num_test_users = 2000
    pbar = tqdm(total = num_test_users)
    while(n < num_test_users):
        sample = np.random.choice(len(unique_ids),1, replace=False)
        test_user_id = beautiful_df.index[sample][0]
        test_user_games = np.unique(user_id_groups.get_group(test_user_id)['game'].values)
        if test_user_games.shape[0] > 10:
            if test_user_id not in test_users:
                test_users.append(test_user_id)
                n+=1
                pbar.update(1)
    pbar.close()
    with open('test_users.pkl', 'wb') as f:
        pickle.dump(test_users, f)

In [307]:
assert len(test_users)==np.unique(test_users).shape[0] # Check for duplicates

In [304]:
def alterData(data, test_users, hidden, user_id_groups):
    games_altered = []
    for i in tqdm(range(0, len(test_users),1)):
        test_user_id = test_users[i]
        test_games_purchased = np.unique(user_id_groups.get_group(test_user_id)['game'].values)
        picked_games = np.random.choice(len(test_games_purchased), min(len(test_games_purchased),hidden), replace=False)
        games = []
        for game in picked_games:
            if data[test_games_purchased[game]][test_user_id] == 1:
                data[test_games_purchased[game]][test_user_id] = 0
                games.append(test_games_purchased[game])
            else:
                print(data[test_games_purchased[game]][test_user_id])
                print(test_games_purchased[game],test_user_id, picked_games)
                raise Exception('This game should be purchased. Try recomputing beautiful_df')
        games_altered.append(games)
    return games_altered,data

In [305]:
games_altered, beautiful_df = alterData(beautiful_df, test_users, 2, user_id_groups)

100%|██████████| 2000/2000 [00:03<00:00, 542.36it/s]


In [308]:
matrix = np.array(beautiful_df.values)

## 2. Matrix Complition

At a high level, SVD decomposes a matrix $R$ into the best lower rank approximation of the original matrix $R$. Mathematically, it decomposes R into a two unitary matrices and a diagonal matrix:

$$ R = U \Sigma V^T $$

where R is users's purchase matrix, $U$ is the user "features" matrix, $\Sigma$ is the diagonal matrix of singular values (essentially weights), and $V^{T}$ is the game "features" matrix. $U$ and $V^{T}$ are orthogonal, and represent different things. $U$ represents how much users "like" each feature and $V^{T}$ represents how relevant each feature is to each game.

In [309]:
#Obtaining SVD values of the user-item matrix
try:
    with open('s.pkl','rb') as f:
        s = pickle.load(f)
    with open('u.pkl','rb') as f:
        u = pickle.load(f)
    with open('vt.pkl','rb') as f:
        vt = pickle.load(f)
except:
    u, s, vt = np.linalg.svd(matrix, full_matrices=True)
    with open('s.pkl', 'wb') as f:
        pickle.dump(s, f)
    with open('u.pkl', 'wb') as f:
        pickle.dump(u, f)
    with open('vt.pkl', 'wb') as f:
        pickle.dump(vt, f)

#### Sparsifying thorugh thresholding

In [310]:
def sparsify(s,u,vt):
    thresholdCheck = list(np.around(s,0) == 0)
    thresholdIndex = thresholdCheck.index(True)
    sparsed_s = np.diag(s[:thresholdIndex])
    sparsed_vt = vt[:thresholdIndex,:]
    sparsed_u = u [:,:thresholdIndex]
    return sparsed_s, sparsed_vt, sparsed_u

In [311]:
sparsed_s, sparsed_vt, sparsed_u = sparsify(s,u,vt)

In [312]:
def redecompose(sparsed_s,sparsed_u,sparsed_vt):
    all_user_predicted_purchases = np.dot(np.dot(sparsed_u, sparsed_s), sparsed_vt)
    predictions = pd.DataFrame(all_user_predicted_purchases, columns = beautiful_df.columns, index = beautiful_df.index)
    return predictions

In [313]:
predictions = redecompose(sparsed_s,sparsed_u,sparsed_vt)

## 3. Predictions

In [314]:
def recommend_games(user_id, num_recommendations, predictions, user_id_groups):
    
    # Get and sort the user's predictions
    sorted_user_predictions = predictions.loc[user_id].sort_values(ascending=False)
    
    # Get list of purchased games
    purchased_games = user_id_groups.get_group(user_id)['game'].unique().tolist()

#     print('User {0} has already purchased {1} games.'.format(user_id, len(purchased_games)))
#     print('Recommending the highest {0} predicted games.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    count_games = 0 
    recommendations = []
    for i in range(0,sorted_user_predictions.shape[0],1):
        predicted_game_score = sorted_user_predictions[i]
        predicted_game_name = sorted_user_predictions.index[i]
        if predicted_game_name not in purchased_games:
            count_games+=1
            recommendations.append(predicted_game_name)
        if count_games >= num_recommendations:
            break

    return recommendations

In [315]:
recommendations = recommend_games(43955374, 5, predictions, user_id_groups)
print('We are recommending following game: '+ (", ").join(recommendations))

We are recommending following game: Ticket to Ride - Legendary Asia, Iron Sky Invasion, Oil Rush, Democracy 2, Total Annihilation


## 4. Testing
#### Compute Recall

In [345]:
def getRecall(games_altered, test_users, data, num_recommendations, predictions, user_id_groups):
    recalls = []
    for i in tqdm(range(0, len(test_users),1)):
        matches = 0
        test_user_id = test_users[i]
        recommendations = recommend_games(user_id, num_recommendations, predictions, user_id_groups)
#         print(games_altered[i],recommendations)
#         break
        for game in games_altered[i]:
            if game in recommendations:
                matches+=1
        recall = matches/len(games_altered[i])
        recalls.append(recall)
    return np.mean(recalls)

In [346]:
print(getRecall(games_altered, test_users, beautiful_df, 100, predictions,user_id_groups))

  5%|▌         | 107/2000 [00:00<00:07, 255.95it/s]

QuestRun


 10%|▉         | 190/2000 [00:00<00:06, 266.21it/s]

Crysis Wars
Valkyria Chronicles Edy's Mission Enter the Edy Detachment
Defense Grid 2 A Matter of Endurance


 33%|███▎      | 666/2000 [00:02<00:05, 234.15it/s]

Tom Clancy's Ghost Recon Phantoms - NA Assault Starter Pack


 40%|████      | 800/2000 [00:03<00:04, 247.57it/s]

Takedown Red Sabre
Famaze


 44%|████▍     | 880/2000 [00:03<00:05, 215.46it/s]

Dwarfs!?


 46%|████▋     | 929/2000 [00:03<00:05, 213.52it/s]

Divinity Dragon Commander Beta


 54%|█████▍    | 1085/2000 [00:04<00:03, 234.75it/s]

Mortal Kombat Legacy - Ep. 6 Raiden


 60%|█████▉    | 1192/2000 [00:04<00:03, 209.55it/s]

Star Wars Starfighter


 66%|██████▌   | 1312/2000 [00:05<00:03, 216.24it/s]

CSGO Player Profiles - Markeloff


 73%|███████▎  | 1453/2000 [00:06<00:02, 223.93it/s]

Valkyria Chronicles Edy's Mission Enter the Edy Detachment


 76%|███████▌  | 1518/2000 [00:06<00:03, 159.90it/s]

Platypus II


 79%|███████▉  | 1575/2000 [00:07<00:02, 173.28it/s]

Psychonauts


 81%|████████  | 1611/2000 [00:07<00:02, 149.39it/s]

Bridge Project


 86%|████████▌ | 1712/2000 [00:07<00:01, 216.65it/s]

Post Apocalyptic Mayhem


 89%|████████▉ | 1785/2000 [00:08<00:01, 202.53it/s]

Mortal Kombat Legacy - Ep. 6 Raiden
Next Car Game Wreckfest


 94%|█████████▍| 1888/2000 [00:08<00:00, 208.56it/s]

Psychonauts


100%|██████████| 2000/2000 [00:09<00:00, 221.86it/s]

Arma 3 Helicopters
0.00525



