In [9]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [18]:
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
print(type(df))
print(df)

<class 'pandas.core.frame.DataFrame'>
       date_posted  funny  helpful  hour_played  is_early_access_review  \
0       2019-02-10      2        4          578                   False   
1       2019-02-10      0        0          184                   False   
2       2019-02-07      0        0          892                   False   
3       2018-06-14    126     1086          676                   False   
4       2017-06-20     85     2139          612                   False   
...            ...    ...      ...          ...                     ...   
434886  2018-11-17      1       37           10                   False   
434887  2018-11-17      3       41           38                   False   
434888  2018-11-20      0        0           36                   False   
434889  2018-11-18      1       44           12                   False   
434890  2019-01-21      1       28           20                   False   

       recommendation                                        

In [3]:
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* All ‘s in the data is preceded by a ‘\’ which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [4]:
subset_reviews = df['review'][:50].tolist()

In [5]:
# a lot of (), so I'm cleaning it out
# add space after "."
cleaned_reviews = []

for each_review in subset_reviews:
    each_cleaned_review = []
    words = each_review.split(" ")
    for idx, each_word in enumerate(words):
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "." 
        each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
cleaned_reviews[0]

"&gt Played as German Reich&gt Declare war on Belgium&gt Can't break Belgium so go through France&gt Capitulate France in order to get to Belgium&gt Get True Blitzkrieg achievementThis game is dad"

**Ben's Date Stuff**

In [25]:
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [26]:
df['Year'] = newColumns[0]
df['Month'] = newColumns[1]
df['Day'] = newColumns[2]
df.drop(columns = ['date_posted'], inplace = True)

df

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,Year,Month,Day
0,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,2019,02,10
1,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,2019,02,10
2,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,2019,02,07
3,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,2018,06,14
4,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,2017,06,20
...,...,...,...,...,...,...,...,...,...,...
434886,1,37,10,False,Recommended,YOUR FLESH WILL ROT AND DECAY.STEEL IS IMMORTA...,"Warhammer 40,000: Mechanicus",2018,11,17
434887,3,41,38,False,Recommended,Domini and Dominae I believe what we are deali...,"Warhammer 40,000: Mechanicus",2018,11,17
434888,0,0,36,False,Recommended,First off if you like X Com style of games you...,"Warhammer 40,000: Mechanicus",2018,11,20
434889,1,44,12,False,Recommended,As a disclaimer I'm an AdMech player on the ta...,"Warhammer 40,000: Mechanicus",2018,11,18


# Encoding text below

In [6]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}
for each_review in cleaned_reviews:
    word_list = each_review.split(" ")
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

vocab_size =  1841
max_seq_len =  1344


In [7]:
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in cleaned_reviews]
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

(50, 1344)

In [8]:
padded_docs

NameError: name 'padded_docs' is not defined

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(dataset):
    # dataset must be a np array
    root = "data/"
    savetxt(root + 'cleaned_steam_data.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_docs) # testing