In [1]:
import pandas as pd 
import pickle
import os

# Calculate the similarity between the names
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Get the most similar between the df_reddit and the df by multiprocessing
from multiprocessing import Pool
from functools import partial

os.getcwd()

'/home/jliu/project'

In [12]:
# games.pkl is the detailed information about all the games on IGDB.
with open('data/games.pkl', 'rb') as f:
    igdb_games_list = pickle.load(f)

df_IGDB = pd.DataFrame.from_dict(igdb_games_list)

In [13]:
print(df_IGDB.shape)
df_IGDB.head()

(230222, 57)


Unnamed: 0,id,category,created_at,external_games,first_release_date,game_modes,genres,name,platforms,release_dates,...,franchises,franchise,multiplayer_modes,dlcs,expansions,ports,remakes,expanded_games,standalone_expansions,forks
0,35004,0,1495818975,"[8077, 1957344]",1437696000.0,"[2, 5]","[15, 32]",Demon Horde Master,[6],[80453],...,,,,,,,,,,
1,151599,0,1622835085,"[2060085, 2077186, 2593975, 2637093]",1623888000.0,[1],"[31, 32, 34]",Idol Days,"[6, 130]","[251428, 338327, 373398]",...,,,,,,,,,,
2,244927,0,1680957248,"[2690306, 2691475]",,,"[15, 32]",Wave of Elms,,,...,,,,,,,,,,
3,118008,0,1557063612,"[1720438, 1864275, 1918495]",1589501000.0,"[1, 2, 3]","[13, 15, 32]",Tabletop Playground,[6],[220820],...,,,,,,,,,,
4,95080,0,1521818623,[1989881],,,,Dotra,,,...,,,,,,,,,,


In [14]:
# Parse the columns to datetime: first_release_date, created_at, updated_at, release_dates
df_IGDB['first_release_date'] = pd.to_datetime(df_IGDB['first_release_date'], unit='s')


# Let the id be the index
df_IGDB.set_index('id', inplace=True)
# Let the name be the first column
df_IGDB = df_IGDB[['name','genres','game_modes'] + [col for col in df_IGDB.columns if col != 'name']]

# Drop the created_at, screenshots, url, websites, check_sum
df_IGDB.drop(['created_at','screenshots','url','websites','updated_at'], axis=1, inplace=True)

In [15]:
df_IGDB

Unnamed: 0_level_0,name,genres,game_modes,category,external_games,first_release_date,game_modes,genres,platforms,release_dates,...,franchises,franchise,multiplayer_modes,dlcs,expansions,ports,remakes,expanded_games,standalone_expansions,forks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35004,Demon Horde Master,"[15, 32]","[2, 5]",0,"[8077, 1957344]",2015-07-24,"[2, 5]","[15, 32]",[6],[80453],...,,,,,,,,,,
151599,Idol Days,"[31, 32, 34]",[1],0,"[2060085, 2077186, 2593975, 2637093]",2021-06-17,[1],"[31, 32, 34]","[6, 130]","[251428, 338327, 373398]",...,,,,,,,,,,
244927,Wave of Elms,"[15, 32]",,0,"[2690306, 2691475]",NaT,,"[15, 32]",,,...,,,,,,,,,,
118008,Tabletop Playground,"[13, 15, 32]","[1, 2, 3]",0,"[1720438, 1864275, 1918495]",2020-05-15,"[1, 2, 3]","[13, 15, 32]",[6],[220820],...,,,,,,,,,,
95080,Dotra,,,0,[1989881],NaT,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41228,Videocart-24: Pro Football,[14],,0,"[23278, 132874, 1878997]",1980-03-01,,[14],[127],[91771],...,,,,,,,,,,
46228,Championship Pro-Am,"[5, 10]",,0,"[45810, 134432, 1878050]",1992-02-01,,"[5, 10]",[29],[98470],...,,,,,,,,,,
48228,Tecmo Baseball,[14],,0,"[30728, 142215, 1186920]",1989-02-01,,[14],[18],[94847],...,,,,,,,,,,
49228,Doraemon Dokodemo Walker,"[15, 35]",[1],0,"[33047, 527571, 1895078]",2002-03-29,[1],"[15, 35]",[24],[459729],...,[1435],,,,,,,,,


In [17]:
# game_name.csv is the file from Rappaz et al. - 2017
# it stores all the id-game name in the dataset.
with open('./data/game_names.csv','r') as f:
    game_names = [line.strip('\n') for line in f.readlines()]
# Transform the list into a dataframe
df_reddit = pd.DataFrame(game_names, columns=['reddit_name'])
print(df_reddit.shape)
df_reddit.head()

(3472, 1)


Unnamed: 0,reddit_name
0,Black
1,Dead Space 2
2,Final Fantasy XIII-2
3,Star Wars: Battlefront II
4,Shadow of the Beast


In [23]:
def get_most_similar(_df_reddit, _df, column):
    # Get the most similar
    most_similar = process.extract(_df_reddit, _df[column], limit=1, scorer=fuzz.ratio)
    # Get the similarity
    similarity = most_similar[0][1]
    # Get the name
    name = most_similar[0][0]
    # Get the index
    index = _df[_df[column]==name].index[0]
    return similarity, index

In [37]:
# Get the most similar
with Pool(20) as p:
    base_similarity = p.map(partial(get_most_similar, _df=df_IGDB, column='name'), df_reddit['reddit_name'])
# Save the base_similarity
with open('./data/base_similarity.pkl','wb') as f:
    pickle.dump(base_similarity, f)

# Join the similarity with the df_reddit
df_reddit['similarity'] = [sim[0] for sim in base_similarity]
df_reddit['index'] = [sim[1] for sim in base_similarity]

# Join the df_reddit with the df by the index
df_test = df_reddit.join(df_IGDB, on='index')

df_reddit_new = pd.DataFrame(game_names, columns=['reddit_name'])
df_reddit_new['similarity'] = [sim[0] for sim in base_similarity]
df_reddit_new['index'] = [sim[1] for sim in base_similarity]
# Join by the index of df_reddit_new and df.index
df_test = df_reddit_new.join(df, on='index')
df_test.to_csv('./data/games_tbp.csv')

df_test = pd.read_csv('./data/games_tbp.csv', index_col=0)

games_tbp.csv contains the detailed information of the games in Rappaz et al. - 2017 dataset. 

In [38]:
df_test

Unnamed: 0,reddit_name,similarity,index,name,genres,game_modes,category,external_games,first_release_date,game_modes.1,...,franchises,franchise,multiplayer_modes,dlcs,expansions,ports,remakes,expanded_games,standalone_expansions,forks
0,Black,100,117992,Black,"[9, 32]",[1],0,,2018-09-27,[1],...,,,,,,,,,,
1,Dead Space 2,100,38,Dead Space 2,"[5, 31]","[1, 2, 3]",0,"[13940, 79384, 83156, 134007, 213408, 245892, ...",2011-01-25,"[1, 2, 3]",...,,,[14153],[20440],,,,,,
2,Final Fantasy XIII-2,100,384,Final Fantasy XIII-2,"[12, 31]",[1],0,"[11302, 25513, 79831, 84611, 126259, 213557, 2...",2011-12-15,[1],...,[4],4.0,,,,,,,,
3,Star Wars: Battlefront II,100,142,Star Wars: Battlefront II,[5],"[1, 2, 3, 4]",0,"[14940, 55945, 73741, 78088, 90457, 93148, 931...",2005-10-31,"[1, 2, 3, 4]",...,[1],1.0,[689],,,,,,,
4,Shadow of the Beast,100,5434,Shadow of the Beast,[8],[1],0,"[16532, 150031, 1985022]",1989-10-01,[1],...,,,,,,[42453],[11254],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3467,Pokémon Mystery Dungeon: Explorers of Darkness...,95,2322,Pokémon Mystery Dungeon: Explorers of Darkness,"[16, 31]","[1, 3]",0,"[28838, 1977680]",2007-09-13,"[1, 3]",...,"[60, 1287]",60.0,,,,,,,,
3468,Senran Kagura 2: Deep Crimson,100,11079,Senran Kagura 2: Deep Crimson,"[4, 25]",[1],0,"[28274, 113399, 220160, 1923899, 2609304]",2014-08-07,[1],...,[561],,,,,,,,,
3469,Ghost Trick: Phantom Detective,100,1348,Ghost Trick: Phantom Detective,"[9, 31]",[1],0,"[29148, 131658, 217825, 247986]",2010-06-19,[1],...,,,[19385],,,,,,,
3470,World Series Baseball,100,94110,World Series Baseball,[14],"[1, 2]",0,[127049],1995-10-01,"[1, 2]",...,[325],,[16337],,,,,,,
