# Imports

In [6]:
import pandas as pd
import json
import os
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
import plotly 
from collections import Counter
from tqdm import tqdm

In [92]:
with open('/home/msnow/site_configs/secrets.json', 'r') as fp:
    secrets = json.load(fp)

In [93]:
plotly.tools.set_credentials_file(username=secrets['plotly']['username'], api_key=secrets['plotly']['api_key'])

# Ancillary Functions

In [40]:
class EmbedText:

    def __init__(self):
        # self.df = df
        self.model_d2v = None
        self.train_corpus = None
        # self.text_col = text_col
        # self.class_col = class_col

    def d2v_setup(self, df: pd.DataFrame, text_col: str, class_col: str = None, emb_size: int = 100):
        self.train_corpus = []
        for idx, row in df.iterrows():
            if class_col is None:
                self.train_corpus.append(TaggedDocument(row[text_col], [idx]))
            else:
                self.train_corpus.append(TaggedDocument(row[text_col], [row[class_col]]))
        self.model_d2v = Doc2Vec(vector_size=emb_size, dm=0, min_count=2, workers=6, epochs=1, min_alpha=0.025,
                                 alpha=0.025)
        self.model_d2v.build_vocab(self.train_corpus)

    def d2v_fit(self, epochs: int = 10, decay_rate: float = 0.002):
#         print('Fitting Doc2Vec')
        for _ in tqdm(range(epochs)):
            self.model_d2v.train(self.train_corpus, total_examples=self.model_d2v.corpus_count, epochs=epochs)
            self.model_d2v.alpha -= decay_rate
            self.model_d2v.min_alpha = self.model_d2v.alpha
        return self.model_d2v

    def d2v_embed_save(self, loc: str):
        self.model_d2v.save(loc)

    def d2v_embed_load(self, model_loc: str):
        self.model_d2v = Doc2Vec.load(model_loc)

    def d2v_vectors(self, df: pd.DataFrame, text_col: str):
        return df.loc[:, text_col].map(lambda x: self.model_d2v.infer_vector(x).tolist())

    def d2v_wordvecs(self, df: pd.DataFrame, text_col: str) -> pd.Series:
        return df.loc[:, text_col].map(
            lambda x: np.array([self.model_d2v.wv.word_vec(i) for i in x if i in self.model_d2v.wv.vocab]))

# Load Data

In [29]:
with open('../src/data/game_info_190509.json', 'r') as fp:
    df_game = pd.DataFrame(json.load(fp))
df_game.shape    

(15000, 48)

In [96]:
with open('../src/data/user_comments_190511.json', 'r') as fp:
    comment_list = json.load(fp)
df_comments = pd.DataFrame(comment_list)
df_comments.shape, df_comments.game_id.nunique()

((2516474, 4), 14987)

In [97]:
with open('../src/data/user_ratings_190511.json', 'r') as fp:
    rating_list = json.load(fp)
df_ratings = pd.DataFrame(rating_list)
df_ratings.shape, df_ratings.game_id.nunique()

((12753513, 3), 14993)

# Merge Data

In [22]:
df_game.columns

Index(['AbstractGameRank', 'BoardGameRank', 'Children'sGameRank',
       'CustomizableRank', 'FamilyGameRank', 'PartyGameRank', 'RPGItemRank',
       'StrategyGameRank', 'ThematicRank', 'WarGameRank', 'average',
       'averageweight', 'bayesaverage', 'boardgameartist', 'boardgamecategory',
       'boardgamecompilation', 'boardgamedesigner', 'boardgameexpansion',
       'boardgamefamily', 'boardgameimplementation', 'boardgameintegration',
       'boardgamemechanic', 'boardgamepublisher', 'game_id', 'maxplayers',
       'maxplaytime', 'median', 'minage', 'minplayers', 'minplaytime', 'name',
       'numcomments', 'numweights', 'owned', 'playingtime', 'stddev',
       'trading', 'url', 'usersrated', 'wanting', 'wishing', 'yearpublished'],
      dtype='object')

In [53]:
df_ratings = df_ratings.merge(df_game[['name', 'game_id', 'BoardGameRank']], how='left', on='game_id')
df_comments = df_comments.merge(df_game[['name', 'game_id', 'BoardGameRank']], how='left', on='game_id')

In [54]:
df_ratings[['game_id', 'BoardGameRank', 'rating']].groupby(['game_id', 'BoardGameRank']).count().sort_values('rating', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
game_id,BoardGameRank,Unnamed: 2_level_1
13,325,85000
822,150,84600
30549,75,83000
68448,45,68800
36218,76,68100


# Create "sentences" from board game categories

In [70]:
# col_list = ['boardgameartist', 'boardgamedesigner', 'boardgamecategory', 'boardgamefamily', 'boardgamemechanic', 'boardgamepublisher', 'name', 'game_id']
col_list = ['boardgamedesigner', 'boardgamecategory', 'boardgamefamily', 'boardgamemechanic', 'name', 'game_id']
df_game_vecs = df_game[col_list].copy()

In [71]:
df_game_vecs['docs'] = df_game_vecs[col_list[0]]
for col in col_list[1:-2]:
    df_game_vecs['docs'] = df_game_vecs['docs'] + df_game_vecs[col]
df_game_vecs['id_names'] = df_game_vecs.name + df_game_vecs.game_id
    

In [75]:
d2v = EmbedText()
d2v.d2v_setup(df = df_game_vecs, text_col='docs', class_col='name')
d2v.d2v_fit()


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:02<00:21,  2.37s/it][A
 20%|██        | 2/10 [00:04<00:18,  2.37s/it][A
 30%|███       | 3/10 [00:07<00:16,  2.37s/it][A
 40%|████      | 4/10 [00:09<00:14,  2.37s/it][A
 50%|█████     | 5/10 [00:11<00:11,  2.38s/it][A
 60%|██████    | 6/10 [00:14<00:09,  2.38s/it][A
 70%|███████   | 7/10 [00:16<00:07,  2.38s/it][A
 80%|████████  | 8/10 [00:19<00:04,  2.37s/it][A
 90%|█████████ | 9/10 [00:21<00:02,  2.37s/it][A
100%|██████████| 10/10 [00:23<00:00,  2.38s/it][A

<gensim.models.doc2vec.Doc2Vec at 0x7fc5999d7160>

In [84]:
d2v.model_d2v.docvecs.most_similar('7 Wonders', topn=20)

[('7 Wonders Duel', 0.9467664361000061),
 ('Âge de Bronze', 0.7837046980857849),
 ('Tides of Time', 0.7803778648376465),
 ('Hadara', 0.7733749747276306),
 ('Peloponnes Card Game', 0.7644440531730652),
 ('The Sands of Time', 0.7624205350875854),
 ("A Fool's Fortune", 0.7558571100234985),
 ('Polis', 0.7505557537078857),
 ('Sobek', 0.7326430678367615),
 ('Elysium', 0.7305032014846802),
 ('Big Monster', 0.7264267206192017),
 ('Villannex', 0.7263643741607666),
 ('Diesel Demolition Derby', 0.7240949869155884),
 ('Innovation', 0.7218561768531799),
 ('Minute Realms', 0.7193061113357544),
 ('Pentos', 0.718259871006012),
 ('Hexemonia', 0.7176440954208374),
 ('CIV: Carta Impera Victoria', 0.7169272899627686),
 ('Uchronia', 0.7127091884613037),
 ('Citadels', 0.7122185230255127)]

In [87]:
d2v_array = d2v.model_d2v.docvecs.vectors_docs[:2000,:]
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(d2v_array)

In [95]:
vect_size = 2000
trace1 = go.Scatter(
    x=X_tsne[:,0],
    y=X_tsne[:,1],
    mode='text',
    text = d2v.model_d2v.docvecs.offset2doctag[:vect_size],
    textposition='bottom center',
)
data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
config = {'scrollZoom': True}
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='d2v_bgg', config=config)


Consider using IPython.display.IFrame instead



In [18]:
d2v_array = d2v.model_d2v.docvecs.vectors_docs

type(d2v_array)

numpy.ndarray

In [14]:
aa = d2v.model_d2v.wv.vectors

In [19]:
d2v_array.shape

(29744, 100)

In [100]:
d2v.model_d2v.docvecs.

[(324, 0.9941948652267456),
 (1685, 0.8426946401596069),
 ('Catan: Portable Edition', 0.8363494873046875),
 ('Struggle for Catan', 0.8137760162353516),
 (2636, 0.8131700754165649),
 (821, 0.7774920463562012),
 ('Catan Card Game', 0.7761682868003845),
 (6402, 0.738957941532135),
 (5765, 0.738196849822998),
 ('Catan: Family Edition', 0.7370287775993347),
 ('Catan Dice Game', 0.7352598905563354),
 ('Star Trek: Catan', 0.734115719795227),
 (897, 0.7331305742263794),
 ('Catan Histories: Rise of the Inkas', 0.7316294312477112),
 (4751, 0.729112982749939),
 (4103, 0.7221580743789673),
 ('Ataxx', 0.7213221788406372),
 (11963, 0.7177346348762512),
 ('Catan: Traveler – Compact Edition', 0.7165348529815674),
 ('Rivals for Catan', 0.7152861952781677)]

In [97]:
df_game_vecs.loc[df_game_vecs.name==74]

Unnamed: 0,boardgameartist,boardgamedesigner,boardgamecategory,boardgamefamily,boardgamemechanic,boardgamepublisher,name,game_id,docs


In [58]:
df_game.head(2).T

Unnamed: 0,0,1
AbstractGameRank,,
AccessoryRank,,
AmigaRank,,
ArcadeRank,,
AtariSTRank,,
BoardGameRank,1,2
Children'sGameRank,,
Commodore64Rank,,
CustomizableRank,,
FamilyGameRank,,
