In [None]:
import pandas as pd
from __future__ import division
from numbers import Number
import sys, codecs
import numpy as np
import sqlite3
import nltk
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import pickle

## Pickle Load: 
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load pickle files that you created yourself or otherwise trust.
filename = 'bgg_ratings_reviews_full_posttextcleaning.pkl'

# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    df_allgames = pickle.load(infile)

# Preview the first rows of the loaded object.
df_allgames.head()

Unnamed: 0,gameid,username,rating,value,name,n_ratings,pic_url,rev_LC_noNames,rev_cleaned
0,13,sinahero,10.0,The best game in the world,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,the best game in the world,"[best, game, world]"
1,13,Cayden101,10.0,I've played this game probably close to a hund...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,i've played this game probably close to a hund...,"['ve, played, game, probably, close, hundred, ..."
2,13,Spaceage Polymer,10.0,"I didn't want to give Catan a perfect ten, bec...",Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,"i didn't want to give a perfect ten, because ...","[n't, want, give, perfect, ten, ,, 's, even, f..."
3,13,asauve19,10.0,My all time favorite game. Even if you are rol...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,my all time favorite game. even if you are rol...,"[time, favorite, game, ., even, rolling, bad, ..."
4,13,Feelie,10.0,One of (if not) the best board game. Ever chan...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,one of (if not) the best board game. ever chan...,"[one, (, ), best, board, game, ., ever, changi..."


# Rev2Vec: Average Word2Vec vectors for each review  
#### Tools: Gensim and the Google News pretrained Word2Vec model  
  
**Word2Vec Model Selection:** I decided to use a pre-trained Word2Vec model (previously trained on text from Google News) because I was concerned I wouldn't have enough text within the reviews to create an accurate enough Word2Vec model capable of producing meaningful results. This was confirmed when I discussed my project with several people who were knowledgeable about Word2Vec - they all recommended using a pre-trained model.  
  
**Approach:** For each review, I create an average vector that represents an average of all the Word2Vec vectors for each word contained in the preprocessed version of the review. This average vector is then saved in the *'rev2vec'* column in the dataframe. 

In [19]:
import pickle

## Pickle Load: 
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load pickle files that you created yourself or otherwise trust.
filename = 'bgg_ratings_reviews_full_posttextcleaning.pkl'

# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    df_allgames = pickle.load(infile)

# Preview the first rows of the loaded object.
df_allgames.head()

In [14]:
### Using the Google News pre-trained Word2Vec model to convert words 
# into vectors (with Gensim):

import os

from gensim.models import KeyedVectors

# Location of the pre-trained binary. Set the GOOGLE_NEWS_W2V_PATH environment
# variable to point at your own copy; when it is unset, the original
# hard-coded location is used, so existing behaviour is unchanged.
GOOGLE_NEWS_W2V_PATH = os.environ.get(
    'GOOGLE_NEWS_W2V_PATH',
    '/Users/meredithjohnson/Downloads/GoogleNews-vectors-negative300.bin',
)

# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_W2V_PATH, binary=True)


In [15]:
### Functions borrowed from Bart:

# Apply pre-trained word2vec model to a single word. 
def _evaluate(word):
    """
    Look up the vector for `word` in the module-level `model`.

    Parameters
    ----------
    word : str or list
        A single token, or a list of tokens. A list is handed to
        __evaluate_set and that function's result is returned unchanged.

    Returns
    -------
    numpy.ndarray
        For a str: `model[word]`, or an all-NaN vector when that lookup
        raises KeyError. For a list: the result of __evaluate_set.

    Raises
    ------
    TypeError
        If `word` is neither a list nor a str.
    """
    if isinstance(word, list):
        return __evaluate_set(word)

    if isinstance(word, str):
        # Attempt to get the vectorial representation of the word.
        try:
            return model[word]
        except KeyError:
            # The lookup failed: return a NaN placeholder. Its length comes
            # from the model when it exposes `vector_size`; otherwise it
            # falls back to the original 300.
            return np.full([getattr(model, 'vector_size', 300)], np.nan)

    raise TypeError(
        "_evaluate expects a str or a list of str, got %s" % type(word).__name__
    )
            
# Apply the word2vec model to a set of words and average them. 
def __evaluate_set(words):
    """
    Average the vectors that the module-level `model` returns for `words`.

    Words whose lookup raises KeyError are skipped, as are vectors that
    contain NaN or infinite entries.

    Parameters
    ----------
    words : iterable
        The tokens to look up in `model`.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors that were kept, or an all-NaN
        vector when none were kept (including when `words` is empty).
    """
    kept = []
    for token in words:
        # Attempt to evaluate the vectorial representation of the word.
        try:
            vec = model[token]
        except KeyError:
            # The model has no entry for this token; leave it out of the mean.
            continue
        # Keep only vectors whose entries are all finite (no NaN, no inf).
        if np.isfinite(vec).all():
            kept.append(vec)

    # If nothing was valid, return NaN. The length comes from the model when
    # it exposes `vector_size`; otherwise it falls back to the original 300.
    if not kept:
        return np.full([getattr(model, 'vector_size', 300)], np.nan)

    # Return the average across the kept vectors.
    return np.mean(np.array(kept), axis=0)

In [18]:
#### New Version: 

def add_rev2vec_column(df_with_reviews = df_allgames, 
                      cleaned_reviews_col_name = 'rev_cleaned'):
    """
    Add a 'rev2vec' column holding one vector per review.

    Every value in the column named `cleaned_reviews_col_name` is passed to
    _evaluate (Bart's function, which creates a mean word2vec vector for an
    entire set of words by averaging across the word vectors in the set).
    The results are stored, in row order, in a single new column named
    'rev2vec' on `df_with_reviews`.

    Parameters
    ----------
    df_with_reviews : pandas.DataFrame
        Frame containing the reviews. It is modified in place: the
        'rev2vec' column is added to (or overwritten on) this object.
        The default is bound once, when this `def` is executed, to whatever
        object `df_allgames` referred to at that moment, so pass the frame
        explicitly if `df_allgames` may have been reassigned since.
    cleaned_reviews_col_name : str
        Name of the column holding the cleaned reviews.

    Returns
    -------
    pandas.DataFrame
        The same `df_with_reviews` object, now carrying the 'rev2vec' column.
    """
    # Iterate over the review column directly; iterrows() would build a
    # Series for every row just to read this one field.
    reviews = df_with_reviews[cleaned_reviews_col_name]
    rev2vecs = [_evaluate(review) for review in reviews]

    # Store the vectors for each review in a single column, aligned on the
    # frame's own index.
    df_with_reviews['rev2vec'] = pd.Series(rev2vecs, index=df_with_reviews.index)

    return df_with_reviews
        
#### Note: 
# Future version of this script could let user pick whether they want the 
# w2vecs added to the df, or if they want just the list of them (i.e., 
# return rev2vecs instead). 

In [19]:
### Apply function:
# add_rev2vec_column assigns the 'rev2vec' column on the frame it is given and
# returns that same frame, which is rebound to df_allgames here.
df_allgames = add_rev2vec_column(df_allgames)

# Preview the first rows, which now include the 'rev2vec' column.
df_allgames.head()

Unnamed: 0,gameid,username,rating,value,name,nrate,pic_url,rev_LC_noNames,rev_cleaned,rev2vec
0,13,sinahero,10.0,The best game in the world,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,the best game in the world,"[best, game, world]","[-0.025390625, 0.0476888, 0.18489583, 0.096547..."
1,13,Cayden101,10.0,I've played this game probably close to a hund...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,i've played this game probably close to a hund...,"['ve, played, game, probably, close, hundred, ...","[0.013619995, 0.07537842, 0.034295656, 0.15359..."
2,13,Spaceage Polymer,10.0,"I didn't want to give Catan a perfect ten, bec...",Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,"i didn't want to give a perfect ten, because ...","[n't, want, give, perfect, ten, ,, 's, even, f...","[0.020022582, 0.037402343, 0.0032592774, 0.119..."
3,13,asauve19,10.0,My all time favorite game. Even if you are rol...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,my all time favorite game. even if you are rol...,"[time, favorite, game, ., even, rolling, bad, ...","[0.071272224, 0.087642275, 0.019033367, 0.1054..."
4,13,Feelie,10.0,One of (if not) the best board game. Ever chan...,Catan,77596,https://cf.geekdo-images.com/micro/img/e0y6Bog...,one of (if not) the best board game. ever chan...,"[one, (, ), best, board, game, ., ever, changi...","[-0.041544598, -0.03446452, -0.0030263264, 0.1..."
