In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

from sklearn.feature_extraction.text import TfidfVectorizer


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


# remove nan reviews

In [3]:
orig_len = len(df)
df.dropna(axis=0, inplace=True)
print ('dropped {} nan reviews'.format(orig_len - len(df)))

dropped 1514 nan reviews


In [4]:
import string
table = str.maketrans('', '', string.punctuation)
len(table)

32

In [18]:
table

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}

In [6]:
'@!testTEST!@#!@#'.translate(table).lower()

'testtest'

# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* All ‘s in the data is preceded by a ‘\’ which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [7]:
# subset_reviews = df['review'][:50].tolist()
subset_reviews = df['review'].tolist()

In [8]:
# a lot of (), so I'm cleaning it out
# add space after "."
cleaned_reviews = []

for each_review in subset_reviews:
    each_cleaned_review = []
    words = each_review.split(" ")
    for idx, each_word in enumerate(words):
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "." 
        cleaned = cleaned.translate(table).lower()
        each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
cleaned_reviews[0]

'gt played as german reichgt declare war on belgiumgt cant break belgium so go through francegt capitulate france in order to get to belgiumgt get true blitzkrieg achievementthis game is dad'

In [None]:
"""
france's
frances

soldier's gun --> soldiers gun
"""


In [9]:
# df['is_early_access_review'] = df['is_early_access_review'].astype('int')
# df['recommendation'] = df['recommendation'].map({'Recommended':1, 'Not Recommended':0})
# df = pd.get_dummies(df, columns = ['title'] )
df['cleaned_reviews'] = cleaned_reviews
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,cleaned_reviews
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,gt played as german reichgt declare war on bel...
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,yes
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,very good game although a bit overpriced in my...
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,out of all the reviews i wrote this one is pro...
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,disclaimer i survivor main i play games for fu...


In [15]:
cleaned_reviews

['gt played as german reichgt declare war on belgiumgt cant break belgium so go through francegt capitulate france in order to get to belgiumgt get true blitzkrieg achievementthis game is dad',
 'yes',
 'very good game although a bit overpriced in my opinion id prefer playing the game with mods historical accuracy so on although the vanilla version is good aswell 710',
 'out of all the reviews i wrote this one is probably the most serious one i wrote for starters the community of this game sucks just like every online game you dont wanna talk to them because theyre all just jerks who wanna say you suck or you camped or blah blah blah its rare to get a nice compliment out of people in this game but you dont have to talk to the community either unlike most online games this one doesnt have a mic system so you dont have to worry about hearing a twelve year old cry baby raging at you for nothing only time you can talk to people in the game is in the beginning before the match starts if you

In [17]:
str('reseña'.encode('utf8'))

"b'rese\\xc3\\xb1a'"

In [10]:
len(df)

433377

In [11]:
# clean out foreign language and bold/italics, etc
reviews = df['cleaned_reviews'].tolist()
deleted_rows = 0
bad_rows = []

for idx, eachreview in enumerate(reviews):
    word_list = eachreview.split(" ")
    for eachword in word_list:
        encoded_text = eachword.encode('utf8')
        if "\\" in str(encoded_text):
            bad_rows.append(idx)
            break


test_drop = df.drop(df.index[bad_rows])

In [12]:
len(df) - len(test_drop)
# dropped less, look into this

8070

In [13]:
raw_text = test_drop['cleaned_reviews']
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

feature_names = vectorizer.get_feature_names()
# dense = vectors.todense()
# denselist = dense.tolist()
# tf_idf = pd.DataFrame(vectors, columns=feature_names)

In [14]:
# feature_names[len(feature_names)-500:]
len(feature_names)

225606

In [16]:
reviews = raw_text.tolist()

deleted_rows = 0
nans = 0
for eachreview in reviews:
    try:
        word_list = eachreview.split(" ")
        for eachword in word_list:
            encoded_text = eachword.encode('utf8')
            if "\\" in str(encoded_text):
                deleted_rows+=1
                break
    except:
        nans+=1
        
        
print (deleted_rows)
print (nans) 

9082
1514


In [8]:
print(vectors.shape)
print(len(feature_names))
print(vectors)

(434891, 221024)
221024
  (0, 80564)	0.477604160118617
  (0, 131209)	0.06157526962731858
  (0, 20129)	0.05405914598020932
  (0, 76287)	0.1607889165432482
  (0, 143109)	0.18335691936876072
  (0, 48647)	0.1587967897741817
  (0, 184040)	0.12009617973874011
  (0, 122420)	0.045204821858718644
  (0, 26375)	0.5734606605415646
  (0, 33893)	0.04768001993933596
  (0, 30592)	0.1033859108185502
  (0, 158329)	0.04841371155240533
  (0, 77663)	0.06763355601093524
  (0, 171090)	0.08038321265507184
  (0, 70410)	0.34571139445779414
  (0, 34299)	0.20690441600455656
  (0, 90300)	0.04086868568602996
  (0, 124322)	0.10345399240500303
  (0, 172263)	0.06643569610726961
  (0, 76349)	0.0986422316908891
  (0, 174835)	0.1037987983547056
  (0, 28433)	0.20690441600455656
  (0, 12708)	0.2134156667069383
  (0, 73431)	0.02501313747197943
  (0, 93772)	0.034083972580775404
  :	:
  (434890, 88824)	0.08118860896370828
  (434890, 111490)	0.09961697176786455
  (434890, 24345)	0.10181130057710057
  (434890, 68015)	0.10274922

**Ben's Date Stuff**

In [25]:
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [26]:
df['Year'] = newColumns[0]
df['Month'] = newColumns[1]
df['Day'] = newColumns[2]
df.drop(columns = ['date_posted'], inplace = True)

df

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,Year,Month,Day
0,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,2019,02,10
1,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,2019,02,10
2,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,2019,02,07
3,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,2018,06,14
4,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,2017,06,20
...,...,...,...,...,...,...,...,...,...,...
434886,1,37,10,False,Recommended,YOUR FLESH WILL ROT AND DECAY.STEEL IS IMMORTA...,"Warhammer 40,000: Mechanicus",2018,11,17
434887,3,41,38,False,Recommended,Domini and Dominae I believe what we are deali...,"Warhammer 40,000: Mechanicus",2018,11,17
434888,0,0,36,False,Recommended,First off if you like X Com style of games you...,"Warhammer 40,000: Mechanicus",2018,11,20
434889,1,44,12,False,Recommended,As a disclaimer I'm an AdMech player on the ta...,"Warhammer 40,000: Mechanicus",2018,11,18


# Encoding text below

In [6]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}
for each_review in cleaned_reviews:
    word_list = each_review.split(" ")
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

vocab_size =  1841
max_seq_len =  1344


In [7]:
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in cleaned_reviews]
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

(50, 1344)

In [None]:
# the fox jumped over the log
# 
# the=1, fox=2, jumped=3... log=6


In [8]:
padded_docs

NameError: name 'padded_docs' is not defined

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(dataset):
    # dataset must be a np array
    root = "data/"
    savetxt(root + 'cleaned_steam_data.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_docs) # testing