In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math
import random
import sklearn
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from matplotlib import rcParams
#import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


## CLEANING USER DATASET

In [2]:
# Loading in data
dataframe = pd.read_csv('steam-200k.csv')
dataframe = dataframe.set_axis(['UserID' , 'Game', 'Purchase/Play','Hrs played', '0/1'], axis = 1)

# Cleaning data

# Removing weird symbols that appear often in game titles
names2 = []
for i in dataframe["Game"]:
  names2.append(i.replace('®' , '').replace('™' , '').lower())
dataframe['p_names'] = names2  

# Removing games with 'DLC' in their title, as they are not useful
dlcs = []
for i in dataframe['p_names']:
  if 'dlc' in i:
    dlcs.append(i)
dlcs = np.unique(dlcs)  

# Removing special editions/versions of games 
editions = []
for i in dataframe['p_names']:
  if 'edition' in i:
    editions.append(i)
editions = np.unique(editions) 

# Removing season passes
seasonpass = []
for i in dataframe['p_names']:
  if 'season' in i:
    seasonpass.append(i)
seasonpass = np.unique(seasonpass) 

# Dropping both DLCs and special editions
for i in dlcs:
  dataframe.drop(dataframe[dataframe['p_names']==i].index, inplace=True)
for i in editions:
  dataframe.drop(dataframe[dataframe['p_names']==i].index, inplace=True)
for i in seasonpass:
  dataframe.drop(dataframe[dataframe['p_names']==i].index, inplace=True)


In [3]:
# Creating a collumn Hours_played where we only include the playtime, so that the 'purchase' rows can be dropped later on

dataframe['Hours_Played'] = dataframe['Hrs played'].astype('float32')

dataframe.loc[(dataframe['Purchase/Play'] == 'purchase') & (dataframe['Hrs played'] == 1.0), 'Hours_Played'] = 0

In [4]:
# Drop the 'purchase' rows
clean_df = dataframe.drop_duplicates(['UserID', 'Game'], keep = 'last').drop(['Purchase/Play', 'Hrs played','0/1', 'Game'], axis = 1)
top_10_games = clean_df['p_names'].value_counts()[0:10]

# Drop all game titles that are played less than 25 hours as they are useless for generating recommendations
tot_hrs_p_game = clean_df.groupby('p_names').sum()
tot_hrs_p_game.drop(tot_hrs_p_game[tot_hrs_p_game['Hours_Played']>25].index, inplace=True)
all_useless_games = np.unique(tot_hrs_p_game.index.get_level_values(0))
for i in all_useless_games:
  clean_df.drop(clean_df[clean_df['p_names']==i].index, inplace=True)

## CALCULATING RATINGS BASED ON PLAYTIME PER PERSON

In [5]:
# Creating a collumn with all mean gametimes per game
mean_gametime = clean_df.groupby('p_names').mean()
mean_gametime = dict(mean_gametime[ 'Hours_Played'])
avg_playtimes = []
for i in clean_df['p_names']:
  avg_playtimes.append(mean_gametime[i])
clean_df['avg_playtime'] = avg_playtimes

# Creating a list of the hours per player to calculate ratings
hours_p_player = list(clean_df['Hours_Played'])
avg_hours = list(clean_df['avg_playtime'])


In [6]:
# Calculating ratings based on the game time per player compared to the mean game time of that game
ratings = []
for i in range(0,len(hours_p_player)):
  if hours_p_player[i]/avg_hours[i] > 1:
    ratings.append(5)
  elif hours_p_player[i]/avg_hours[i] > 0.8:
    ratings.append(4)
  elif hours_p_player[i]/avg_hours[i] > 0.5:
    ratings.append(3)
  elif hours_p_player[i]/avg_hours[i] > 0.1:
    ratings.append(2)
  else:
    ratings.append(1)  

clean_df['rating'] = ratings

In [7]:
# Creating usefull variables for later on
n_users = len(np.unique(clean_df['UserID']))
n_games = len(np.unique(clean_df['p_names']))
user_list = np.unique(clean_df['UserID'])
item_list = np.unique(clean_df['p_names'])

## CLEANING STEAM GAMES DATASET

In [8]:
# Loading in the steam games dataset
dataframe1 = pd.read_csv('steam.csv')
dataframe2 = pd.read_csv('steam_requirements_data.csv')
dataframe3 = pd.read_csv('steam_description_data.csv')
dataframe1 = dataframe1.rename(columns={'appid': 'steam_appid'})
dataframe1 = pd.merge(dataframe1,dataframe2, on='steam_appid')
dataframe1 = pd.merge(dataframe1,dataframe3, on='steam_appid')

# Stripping the game names of weird symbols
names = []
for i in dataframe1['name']:
  names.append(i.replace('®' , '').replace('™' , '').replace(':', '').replace('’' , "'").lower())
dataframe1['p_names'] = names

names2 = []
for i in dataframe["Game"]:
  names2.append(i.replace('®' , '').replace('™' , '').lower())
dataframe['p_names'] = names2  

In [9]:
# Dropping features I wont be using
dataframe1 = dataframe1.drop(['english', 'publisher','achievements', 'positive_ratings','required_age', 'price', 'negative_ratings', 'average_playtime', 'median_playtime', 'owners', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'minimum', 'recommended', 'about_the_game', 'short_description'], axis=1)

In [10]:
# Creating the metadata feature where all usefull features are merged into one feature
dataframe1['metadata'] = dataframe1['detailed_description'].str.replace(';',' ').str.replace('.',' ').str.replace("'",' ')
dataframe1['metadata'] = dataframe1['metadata'].str.replace(',', ' ').replace(';',' ')

dataframe1['metadata'][4].lower()

'return to the black mesa research facility as one of the military specialists assigned to eliminate gordon freeman  experience an entirely new episode of single player action  meet fierce alien opponents  and experiment with new weaponry  named  game of the year  by the academy of interactive arts and sciences '

## MATCHING DATASETS

In [11]:
# Matching the player dataset with the steam dataset
all_found_games = []
for i in dataframe1['p_names']:
    if i not in item_list:
        dataframe1 = dataframe1.drop(dataframe1[dataframe1['p_names'] == i].index)
    else:
        all_found_games.append(i)

In [12]:
# Dropping all games that are not in the games dataset
for i in clean_df['p_names']:
    if i not in all_found_games:
        clean_df = clean_df.drop(clean_df[clean_df['p_names']==i].index) 

In [13]:
# Adding the steam ID's of the games to the Userdataset which makes it easier to work with
steam_appid = []
for item in clean_df['p_names']:
    steam_appid.append(dataframe1.loc[dataframe1['p_names'] == item].index)
    
    

In [14]:
list2 = []
for i in steam_appid:
    for j in dataframe1['steam_appid'].loc[i].values:
        list2.append(j)
clean_df['steam_appid'] = list2


## PREPARATION FOR THE MODEL

In [15]:
# Renaming the columns for easier use of the model
clean_df = clean_df.rename(columns={'steam_appid':'contentId', 'UserID':'personId', 'rating':'eventStrength'})

In [16]:
dataframe1 = dataframe1.rename(columns={'steam_appid':'contentId'})

In [17]:
# Renaming the dataset for easier use of the model
interactions_df = clean_df
articles_df = dataframe1

In [18]:
# Only use users that have more than 5 games in their library

users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 11239
# users with at least 5 interactions: 2814


In [19]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 70621
# of interactions from users with at least 5 interactions: 57579


In [20]:
# Optional function to smooth the ratings of the users with a log function.

def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))


# of unique user/item interactions: 57579


In [21]:
# Splitting data in train/test set
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['personId'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 46063
# interactions on Test set: 11516


In [22]:
#Indexing by personId to speed up the searches during evaluation

interactions_full_indexed_df = interactions_full_df.set_index('personId')
interactions_train_indexed_df = interactions_train_df.set_index('personId')
interactions_test_indexed_df = interactions_test_df.set_index('personId')

## MODEL EVALUATOR

In [23]:
# Simple function that checks all games in a users library

def get_items_interacted(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [24]:
### MODEL EVALUATOR ###

EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 50
class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 


## ACTUAL MODEL

In [25]:
#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords

vectorizer = TfidfVectorizer(
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=2500,
                     stop_words='english')

item_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform( articles_df['metadata'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<958x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 102527 stored elements in Compressed Sparse Row format>

In [26]:
### Functions for building userprofiles ###

def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df['contentId'])
    
    user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    interactions_indexed_df = interactions_train_df[interactions_train_df['contentId'] \
                                                   .isin(articles_df['contentId'])].set_index('personId')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [27]:
user_profiles = build_users_profiles()
len(user_profiles)

2814

In [28]:
# Check how a profile looks like
myprofile = user_profiles[128470551]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[128470551].flatten().tolist()), key=lambda x: -x[1])[:10],
             columns=['token', 'relevance'])

(1, 2500)


Unnamed: 0,token,relevance
0,isaac,0.224737
1,strong br,0.169798
2,support,0.15272
3,rogue,0.138788
4,items,0.138418
5,magic,0.129231
6,unique,0.125737
7,secrets,0.124471
8,different,0.119079
9,cards,0.113088


## RECOMMENDER

In [29]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(articles_df)

## RESULTS OF MODEL

In [30]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
2813 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.24409517193469954, 'recall@10': 0.3785168461271275}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
45,5,8,61,0.081967,0.131148,62990992
206,5,11,50,0.1,0.22,30246419
266,3,12,49,0.061224,0.244898,53875128
388,5,11,48,0.104167,0.229167,11403772
225,4,11,48,0.083333,0.229167,22301321
233,3,8,46,0.065217,0.173913,47457723
93,6,12,45,0.133333,0.266667,20772968
734,9,14,42,0.214286,0.333333,49893565
200,2,5,42,0.047619,0.119048,24469287
153,7,12,41,0.170732,0.292683,36546868


In [31]:
content_based_recommender_model.recommend_items(self ,user_id=5250, topn=20, verbose=False)

NameError: name 'self' is not defined