In [1]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.linear_model import LinearRegression

## Load metadata file

In [4]:
# Scan each line that represents a json and convert it to dataframe
path = './data/'
file_name = 'games_metadata.json'
data = []
dfs = []
id_games_wo_tags = []
with open(path + file_name) as f:
    for line in f:
        game = json.loads(line)
        if game['tags'] == []:
            id_games_wo_tags.append(game['app_id'])
        data.append(game)
        game_df = pd.DataFrame.from_dict(game)
        dfs.append(game_df)

FileNotFoundError: [Errno 2] No such file or directory: './data/games_metadata.json'

In [3]:
# Concatenate all games
games_metadata = pd.concat(dfs)
games_metadata

ValueError: No objects to concatenate

In [4]:
# Check if there is nan values
games_metadata.isnull().values.any()

False

In [5]:
# Some games have no tags
games_metadata[games_metadata['app_id'] == 2488510]

Unnamed: 0,app_id,description,tags


In [6]:
print(f"Number of unique tags: {games_metadata['tags'].unique().shape[0]}.")

Number of unique tags: 441.


In [7]:
# Group by tags for each game

## Create tuples of combined tags (we use tuples because lists can't be hashed when droping duplicates)
list_tags = games_metadata.groupby('app_id')['tags'].apply(tuple)
## Merge kdataframes
games_metadata = pd.merge(games_metadata, list_tags, on='app_id')
## Drop column of unique tag (keep only the tuples) 
games_metadata.drop('tags_x', axis=1, inplace=True)
games_metadata.rename(columns={'tags_y': 'tags'}, inplace=True)
## Drop duplicates
games_metadata.drop_duplicates(inplace=True)
## Convert tuples to list
games_metadata['tags'] = games_metadata['tags'].apply(lambda x: list(x))
games_metadata

Unnamed: 0,app_id,description,tags
0,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
20,22364,,[Action]
21,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
41,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
59,249050,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."
...,...,...,...
584132,2455060,"In the rogue action game ""Taboo Trial"", you wi...","[RPG, Indie, Nudity, Action Roguelike, Female ..."
584152,1138640,"Build better, together 🏡 Hometopia is a seriou...","[Early Access, Life Sim, City Builder, Immersi..."
584172,2515460,,"[Strategy, Indie, Simulation]"
584175,1687000,Seiji Maruyama is a middle-aged yakuza recentl...,"[Side Scroller, Beat 'em up, Sandbox, Martial ..."


## Load recommendations

In [8]:
recommendations = pd.read_csv(f'{path}/recommendations.csv', sep=",")
recommendations.head

<bound method NDFrame.head of            app_id  helpful  funny        date  is_recommended  hours  \
0          975370        0      0  2022-12-12            True   36.3   
1          304390        4      0  2017-02-17           False   11.5   
2         1085660        2      0  2019-11-17            True  336.5   
3          703080        0      0  2022-09-23            True   27.4   
4          526870        0      0  2021-01-10            True    7.9   
...           ...      ...    ...         ...             ...    ...   
41154789   633230        0      0  2021-02-15            True   41.0   
41154790   758870        8      0  2019-07-18           False    8.0   
41154791   696170        3     10  2018-03-26           False    2.0   
41154792   696170        0      0  2018-06-11            True    4.0   
41154793  1089980        2      0  2020-09-16            True   14.0   

           user_id  review_id  
0            51580          0  
1             2586          1  
2        

In [9]:
recommendations.shape

(41154794, 8)

In [10]:
'users:', recommendations['user_id'].unique().shape[0], 'games:', recommendations['app_id'].unique().shape[0], 

('users:', 13781059, 'games:', 37610)

#### Drop games without tags (and their corresponding recommendations)

In [11]:
recommendations = recommendations[~recommendations['app_id'].isin(id_games_wo_tags)]
'users:', recommendations['user_id'].unique().shape[0], 'games:', recommendations['app_id'].unique().shape[0], 

('users:', 7495040, 'games:', 37032)

In [12]:
number_recomm = recommendations.groupby('user_id').aggregate('size')
number_recomm = pd.DataFrame({'user_id': number_recomm.index, f'number_recomm_user': number_recomm.values})
number_recomm

Unnamed: 0,user_id,number_recomm_user
0,0,26
1,1,1
2,5,1
3,10,1
4,11,1
...,...,...
7495035,14306052,3
7495036,14306053,1
7495037,14306057,1
7495038,14306060,1


#### Keep only users that recommended more than a certain limit of games

In [13]:
limit_games = 200
number_ratings = number_recomm[(number_recomm['number_recomm_user'] >= limit_games) ]
number_ratings.shape[0]

1347

In [14]:
recommendations = pd.merge(recommendations, number_ratings, on='user_id')
'users:', recommendations['user_id'].unique().shape[0], 'games:', recommendations['app_id'].unique().shape[0], 

('users:', 1347, 'games:', 32396)

In [15]:
# Drop number of recommendations per user
recommendations.drop('number_recomm_user', axis=1, inplace=True)
recommendations.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,1850570,12,2,2022-03-30,True,72.5,5361677,44337
1,1544020,135,14,2022-12-06,False,19.0,5361677,2953151
2,1475810,38,4,2022-12-14,False,18.4,5361677,6596314
3,638230,4,0,2020-11-25,True,4.9,5361677,6880461
4,637650,8,0,2021-02-17,True,84.2,5361677,7130459


In [16]:
recommendations.shape

(524929, 8)

## Processing games 

In [17]:
games = pd.read_csv(path + 'games.csv', sep=',')
games.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


### Process existing features

In [18]:
# Drop title
games = games.drop('title', axis=1)

# Extract year of release
games['date_release'] = list(map(lambda x: int(x[:4]), games['date_release'].values))

# Convert boolean values to int
columns = ['win', 'mac', 'linux', 'steam_deck']
for column in columns:
    games[column] = games[column].astype(int) 

# Transform categorical feature 'rating' to numerical one (one hot encoding) 
lb = LabelBinarizer()
games = games.join(pd.DataFrame(lb.fit_transform(games.pop('rating')),
                                            columns=lb.classes_,
                                            index=games.index))
games

Unnamed: 0,app_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,Mixed,Mostly Negative,Mostly Positive,Negative,Overwhelmingly Negative,Overwhelmingly Positive,Positive,Very Negative,Very Positive
0,13500,2008,1,0,0,84,2199,9.99,9.99,0.0,1,0,0,0,0,0,0,0,0,1
1,22364,2011,1,0,0,85,21,2.99,2.99,0.0,1,0,0,0,0,0,0,1,0,0
2,113020,2013,1,1,1,92,3722,14.99,14.99,0.0,1,0,0,0,0,0,0,0,0,1
3,226560,2014,1,0,0,61,873,14.99,14.99,0.0,1,1,0,0,0,0,0,0,0,0
4,249050,2014,1,1,0,88,8784,11.99,11.99,0.0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50867,2296380,2023,1,0,0,96,101,22.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1
50868,1272080,2023,1,0,0,38,29458,40.00,0.00,0.0,1,0,1,0,0,0,0,0,0,0
50869,1402110,2023,1,0,0,89,1128,30.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1
50870,2272250,2023,1,0,0,95,82,17.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1


In [19]:
games

Unnamed: 0,app_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,Mixed,Mostly Negative,Mostly Positive,Negative,Overwhelmingly Negative,Overwhelmingly Positive,Positive,Very Negative,Very Positive
0,13500,2008,1,0,0,84,2199,9.99,9.99,0.0,1,0,0,0,0,0,0,0,0,1
1,22364,2011,1,0,0,85,21,2.99,2.99,0.0,1,0,0,0,0,0,0,1,0,0
2,113020,2013,1,1,1,92,3722,14.99,14.99,0.0,1,0,0,0,0,0,0,0,0,1
3,226560,2014,1,0,0,61,873,14.99,14.99,0.0,1,1,0,0,0,0,0,0,0,0
4,249050,2014,1,1,0,88,8784,11.99,11.99,0.0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50867,2296380,2023,1,0,0,96,101,22.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1
50868,1272080,2023,1,0,0,38,29458,40.00,0.00,0.0,1,0,1,0,0,0,0,0,0,0
50869,1402110,2023,1,0,0,89,1128,30.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1
50870,2272250,2023,1,0,0,95,82,17.00,0.00,0.0,1,0,0,0,0,0,0,0,0,1


### Merge games with metadata

In [20]:
games_metadata_merged = pd.merge(games_metadata, games, on='app_id' )
games_metadata_merged = games_metadata_merged.drop('description', axis=1)

In [21]:
## Tags To "one hot vectors"
mlb = MultiLabelBinarizer()
games_metadata_merged = games_metadata_merged.join(pd.DataFrame(mlb.fit_transform(games_metadata_merged.pop('tags')),
                                            columns=mlb.classes_,
                                            index=games_metadata_merged.index))
games_metadata_merged

Unnamed: 0,app_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,price_original,discount,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
0,13500,2008,1,0,0,84,2199,9.99,9.99,0.0,...,0,0,0,0,0,0,0,0,0,0
1,22364,2011,1,0,0,85,21,2.99,2.99,0.0,...,0,0,0,0,0,0,0,0,0,0
2,113020,2013,1,1,1,92,3722,14.99,14.99,0.0,...,0,0,0,0,0,0,0,0,0,0
3,226560,2014,1,0,0,61,873,14.99,14.99,0.0,...,0,0,0,0,0,0,0,0,1,0
4,249050,2014,1,1,0,88,8784,11.99,11.99,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49623,2455060,2023,1,0,0,94,494,12.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
49624,1138640,2023,1,0,0,61,248,17.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
49625,2515460,2023,1,1,1,67,80,5.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
49626,1687000,2023,1,0,0,79,358,20.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0


### Merge games with recommendations to create one hot vectors of users

In [22]:
recommendations.drop(['helpful', 'funny', 'date', 'hours', 'review_id'], axis=1, inplace=True)
recommendations['is_recommended'] = recommendations['is_recommended'].astype(int) 
recommendations


Unnamed: 0,app_id,is_recommended,user_id
0,1850570,1,5361677
1,1544020,0,5361677
2,1475810,0,5361677
3,638230,1,5361677
4,637650,1,5361677
...,...,...,...
524924,1659600,1,6370545
524925,1328840,1,6370545
524926,1927720,1,6370545
524927,647830,1,6370545


In [23]:
recommendations_games_users = pd.merge(recommendations, games_metadata_merged, on='app_id' )
recommendations_games_users

Unnamed: 0,app_id,is_recommended,user_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
0,1850570,1,5361677,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,0,0
1,1850570,1,5117018,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,0,0
2,1850570,1,7379653,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,0,0
3,1850570,0,6505918,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,0,0
4,1850570,0,4266016,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524924,1038440,0,6370545,2019,1,0,0,70,20,5.99,...,0,0,0,0,0,0,0,0,0,0
524925,1721440,1,6370545,2022,1,0,0,70,10,49.99,...,0,0,0,0,0,0,0,0,0,0
524926,1628440,1,6370545,2021,1,1,1,100,40,1.99,...,0,0,0,0,0,0,0,0,0,0
524927,1270720,1,6370545,2021,1,0,0,86,15,29.99,...,0,0,0,0,0,0,0,0,0,0


In [24]:
recommendations_games_users['app_id'].unique().shape[0], recommendations_games_users['user_id'].unique().shape[0]

(32396, 1347)

### Convert users that recommended games to one hot vectors

In [25]:
## Create tuples of combined tags (we use tuples because lists can't be hashed when droping duplicates)
users_lists = recommendations_games_users.groupby('app_id')['user_id'].apply(list)
## Merge dataframes
recommendations_games_users = pd.merge(recommendations_games_users, users_lists, on='app_id')
## Users to One hot vectors 
mlb = MultiLabelBinarizer()
recommendations_games_users = recommendations_games_users.join(pd.DataFrame(mlb.fit_transform(recommendations_games_users.pop('user_id_y')),
                                            columns=mlb.classes_,
                                            index=recommendations_games_users.index))
recommendations_games_users.rename(columns={'user_id_x':'user_id'}, inplace=True)
recommendations_games_users

Unnamed: 0,app_id,is_recommended,user_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,...,14211777,14214108,14215849,14223043,14258281,14267398,14276007,14277326,14283059,14283841
0,1850570,1,5361677,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,1,0
1,1850570,1,5117018,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,1,0
2,1850570,1,7379653,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,1,0
3,1850570,0,6505918,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,1,0
4,1850570,0,4266016,2022,1,0,0,93,13621,19.99,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524924,1038440,0,6370545,2019,1,0,0,70,20,5.99,...,0,0,0,0,0,0,0,0,0,0
524925,1721440,1,6370545,2022,1,0,0,70,10,49.99,...,0,0,0,0,0,0,0,0,0,0
524926,1628440,1,6370545,2021,1,1,1,100,40,1.99,...,0,0,0,0,0,0,0,0,0,0
524927,1270720,1,6370545,2021,1,0,0,86,15,29.99,...,0,0,0,0,0,0,0,0,0,0


In [26]:
games_vectors = recommendations_games_users.drop(['is_recommended', 'user_id'], axis=1, inplace=False)
games_vectors.drop_duplicates(inplace=True)
games_vectors.sort_values(by='user_reviews', ascending=False, inplace=True)

games_vectors

Unnamed: 0,app_id,date_release,win,mac,linux,positive_ratio,user_reviews,price_final,price_original,discount,...,14211777,14214108,14215849,14223043,14258281,14267398,14276007,14277326,14283059,14283841
69982,304930,2017,1,1,1,91,515016,0.00,0.00,0.0,...,0,0,0,1,0,0,0,0,0,0
327740,433850,2018,1,0,0,55,207328,0.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
506173,1281930,2020,1,1,1,97,188684,0.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
443318,1468810,2023,1,0,0,52,185051,19.99,19.99,0.0,...,0,0,0,0,0,0,0,0,0,0
81865,698780,2017,1,1,0,96,184949,0.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524760,1794840,2022,1,0,0,20,10,0.00,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
228942,2015260,2022,1,0,0,100,10,8.99,29.99,70.0,...,0,0,0,0,0,0,0,0,0,0
519327,699820,2017,1,0,0,50,10,0.99,0.99,0.0,...,0,0,0,0,0,0,0,0,0,0
519343,1683670,2021,1,0,0,50,10,0.99,0.99,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
### Get games already played recommended (1) or not (0) by user
games_recommended = recommendations_games_users[recommendations_games_users['user_id'] == 5361677]
games_recommended_ids = games_recommended['app_id'].values
games_recommendations = games_recommended['is_recommended'].values
games_recommended = games_recommended.drop(['app_id', 'is_recommended', 'user_id'], axis=1)
games_recommended = np.array(games_recommended)
games_recommended /= np.linalg.norm(games_recommended, 2, axis=1)[:, None]
### Compute linear regression to estimate theta of an estimator
reg = LinearRegression().fit(games_recommended, games_recommendations)
theta_user = reg.coef_

In [28]:
### Create catalog of 2000 games not played by given user using most rated games
games_catalog = []
size_catalog = 0

games_catalog = games_vectors[~games_vectors['app_id'].isin(games_recommended_ids)]
games_catalog = np.array(games_catalog[:2000].drop('app_id', axis=1))
games_catalog

array([[2.017e+03, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.018e+03, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.020e+03, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [2.006e+03, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.018e+03, 1.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.019e+03, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [29]:
### Create catalog of 2000 games not played by given user using most rated games
games_catalog = []
size_catalog = 0

for g in games_vectors.values:
    if g[0] not in games_recommended_ids:
        games_catalog.append(g)
        size_catalog += 1
    if size_catalog == 2000:
        break
games_catalog = np.array(games_catalog)
games_catalog

array([[3.04930e+05, 2.01700e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.33850e+05, 2.01800e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.28193e+06, 2.02000e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.40000e+03, 2.00600e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [8.03600e+05, 2.01800e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.59510e+05, 2.01900e+03, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [30]:
del games
del recommendations
del games_metadata_merged
del games_metadata
del number_recomm

In [31]:
recomm = recommendations_games_users[['app_id', 'user_id', 'is_recommended']]
recomm

Unnamed: 0,app_id,user_id,is_recommended
0,1850570,5361677,1
1,1850570,5117018,1
2,1850570,7379653,1
3,1850570,6505918,0
4,1850570,4266016,0
...,...,...,...
524924,1038440,6370545,0
524925,1721440,6370545,1
524926,1628440,6370545,1
524927,1270720,6370545,1


In [32]:
recomm.to_csv('./recommendations_games_users.csv')
games_vectors.to_csv('./games_vectors.csv')

In [None]:
recommendations_games_users['']

In [16]:
import pandas as pd
recomm = pd.read_csv('./data/recommendations_games_users.csv')
games_vectors = pd.read_csv('./data/games_vectors.csv')

In [15]:
len(recomm['app_id'].unique()) - np.max(recomm.groupby('user_id')['app_id'].aggregate('size'))

26454

In [2]:
from tqdm import tqdm
import numpy as np 
from sklearn.linear_model import LinearRegression

users = recomm['user_id'].unique()
dict_theta_items = {user:{'theta':None, 'action_set':None} for user in users}
for user_id in tqdm(users):
    
    ### Get games already played recommended (1) or not (0) by user
    games_recommended = recomm[recomm['user_id'] == user_id]
    games_recommended_ids = games_recommended['app_id'].values
    games_recommendations = games_recommended['is_recommended'].values
    games_recommended = games_vectors[games_vectors['app_id'].isin(games_recommended_ids)]
    games_recommended = np.array(games_recommended.drop('app_id', axis=1, inplace=False))
    games_recommended /= np.linalg.norm(games_recommended, 2, axis=1)[:, None]
    ### Compute linear regression to estimate theta of an estimator
    reg = LinearRegression().fit(games_recommended, games_recommendations)
    theta_user = reg.coef_

    ### Create catalog of 2000 games not played by given user using most rated games
    size_catalog = 2000
    games_catalog = games_vectors[~games_vectors['app_id'].isin(games_recommended_ids)]
    games_catalog = np.array(games_catalog[:size_catalog].drop('app_id', axis=1))


    dict_theta_items[user_id]['theta'] = theta_user / np.linalg.norm(theta_user, 2)
    dict_theta_items[user_id]['action_set'] = games_catalog / np.linalg.norm(games_catalog, 2, axis=1)[:, None]



 15%|█▌        | 205/1347 [03:07<19:44,  1.04s/it]

: 

In [None]:
import json
json.dump(dict_theta_items, open('./steam_processed.json', 'w'))

In [34]:
### A chaque run, choisir l'id d'un utilisateur
##creer un fichier processing ou j'aurais pour chaque utilisateur
    # user i : {theta: [], action_set: [[games auxquels il n'a pas joué]]}