# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [1]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import re, itertools
import string
from collections import OrderedDict
from operator import itemgetter
import time

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

# nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# text cleanup
from pattern.en import suggest
import enchant
from enchant.checker import SpellChecker
from spellchecker import SpellChecker

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


# remove nan reviews

In [3]:
orig_len = len(df)
df.dropna(axis=0, inplace=True)
print ('dropped {} nan reviews'.format(orig_len - len(df)))

dropped 1516 nan reviews


In [4]:
table = str.maketrans('', '', string.punctuation)
len(table)

32

# one-hot encode title & early access & recommendation

In [5]:
df['is_early_access_review'] = df['is_early_access_review'].astype('int')
df['recommendation'] = df['recommendation'].map({'Recommended':1, 'Not Recommended':0})
df = pd.get_dummies(df, columns=['title'])
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,...,title_Subnautica,title_Subnautica: Below Zero,title_Survivor Pass: Vikendi,title_Tannenberg,title_Terraria,title_The Elder Scrolls V: Skyrim Special Edition,title_Tom Clancy's Rainbow Six® Siege,title_Wallpaper Engine,title_Wargroove,"title_Warhammer 40,000: Mechanicus"
0,2019-02-10,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2019-02-10,0,0,184,0,1,yes.,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-02-07,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-06-14,126,1086,676,0,1,Out of all the reviews I wrote This one is pro...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-06-20,85,2139,612,0,1,Disclaimer I survivor main. I play games for f...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Only keep reviews <= 300 words
* Need to calculate 10th, 25th, 50th, 75th, 90th percentile of helpful/funny votes on the removed reviews to unders

In [6]:
# remove reviews over a limit
removed_idx = []
for i in range(len(df)):
    review = df.iloc[i].review
    words = review.split(" ")
    if len(words) > 200:
        removed_idx.append(i)


df.drop(df.index[removed_idx], inplace=True)
print ("removed {} reviews > 200 words".format(len(removed_idx)))

removed 15916 reviews > 300 words


# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* All ‘s in the data is preceded by a ‘\’ which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [7]:
subset_reviews = df['review'].tolist()

In [8]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [9]:
cleaned_reviews = []
remove_rows = []
stop_words = set(stopwords.words('english'))

orig_len = len(df)
start_time = time.time()
for i, each_review in enumerate(subset_reviews):
    each_cleaned_review = []
    words = each_review.split(" ")
    for idx, each_word in enumerate(words):
        encoded_text = each_word.encode('utf8')
        if "\\" in str(encoded_text):
            remove_rows.append(i)
            break
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "." 
        cleaned = cleaned.translate(table).lower()
        cleaned = reduce_lengthening(cleaned)
        if len(cleaned) != 0 and cleaned not in stop_words:
            #figure out how to stop empty strings
            each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
    if len(each_cleaned_review) == 0:
#         cleaned_reviews.append(" ".join(each_cleaned_review))
#     else:
        remove_rows.append(i)

cleaned_reviews = [i for j, i in enumerate(cleaned_reviews) if j not in remove_rows]
df.drop(df.index[remove_rows], inplace=True)
print ("Removed {} rows".format(orig_len - len(df)))
print ("time took: ".format(time.time() - start_time))
print(cleaned_reviews)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
print (len(cleaned_reviews))
print (len(df))

405671
405671


In [11]:
# this cell takes ~20 minutes to run
unknowns = {} 
chkr = enchant.checker.SpellChecker("en_EN")

start_time = time.time()

for i, review in enumerate(cleaned_reviews):
    if i % 50000 == 0:
        print("Took {} for 50000 reviews ".format(time.time() - start_time))
        start_time = time.time()
    each_cleaned_review = []
    words = review.split(" ")
    for idx, each_word in enumerate(words):
        if len(each_word) == 0: # to handle double spaces
            continue
        if not chkr.check(each_word):
            if not chkr.check(each_word[0].upper()+each_word[1:]): # check proper nouns
                if each_word not in unknowns.keys():
                    unknowns[each_word] = 1
                else:
                    unknowns[each_word] = unknowns[each_word] + 1

common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

# print(common_unknowns[0:20])

Took 0.0009996891021728516 for 50000 reviews 
Took 125.6769585609436 for 50000 reviews 
Took 83.14071822166443 for 50000 reviews 
Took 119.57928419113159 for 50000 reviews 
Took 102.48810195922852 for 50000 reviews 
Took 128.31521463394165 for 50000 reviews 
Took 110.00389075279236 for 50000 reviews 
Took 111.78918862342834 for 50000 reviews 
Took 113.54140734672546 for 50000 reviews 


In [12]:
common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

print(common_unknowns)



In [13]:
slang_dict = {}
with open("data/slangdict.txt") as f:
    for line in f:
        slang = line.split("-")
        if len(slang) > 1:
            key = slang[0].strip().translate(table).lower()
            val = slang[1].strip().translate(table).lower()
            slang_dict[key] = val
# print(slang_dict)

In [14]:
spell = SpellChecker(distance=1)
final_cleaned_reviews = []
unk_counter = 0

start_time = time.time()
for i, review in enumerate(cleaned_reviews):
    if i%50000 == 0:
        print ("For 50000, took {} seconds".format(time.time() - start_time))
        start_time = time.time()
    each_cleaned_review = []
    words = review.split(" ")
    for idx, each_word in enumerate(words):
        if each_word in unknowns.keys():
            if unknowns[each_word] >= 50:
                continue
            else:
                if each_word in slang_dict.keys():
                    continue
                    # words[idx] =slang_dict[each_word]:
                # fix = suggest(each_word)
                # w1 = fix[0][0]
                w1 = spell.correction(each_word)
                if w1 != each_word:
                    words[idx] = w1
                else:
                    words[idx] = 'UNK'
                    unk_counter+=1
                    
    final_cleaned_reviews.append((" ".join(words)))

For 50000, took 0.0 seconds
For 50000, took 27.515434503555298 seconds
For 50000, took 17.21400260925293 seconds
For 50000, took 23.624836206436157 seconds
For 50000, took 21.182337999343872 seconds
For 50000, took 26.620850801467896 seconds
For 50000, took 22.688340425491333 seconds
For 50000, took 22.935685634613037 seconds
For 50000, took 24.11748504638672 seconds


In [15]:
unk_counter

146805

In [27]:
cleaned_reviews

['gt played german reichgt declare war belgiumgt cant break belgium go francegt capitulate france order get belgiumgt get true blitzkrieg achievementthis game dad',
 'yes',
 'good game although bit overpriced opinion id prefer playing game mods historical accuracy although vanilla version good aswell 710',
 'disclaimer survivor main play games fun competition dbd community doesnt really get get bad killer face camps oh well die move next game get good killer finds immediately hooks bc cant juke whoops patrol camps im 100 ok biggie next game please think lot community salt killers comes dealing rank one survivors dont actually play game properly run around flash killer face legitimate strategy takes away fun game opinion as killer horrible one ive also called names survivors tell get gud older players ruin newer players players dont put 100s hours relax fun game dont pip always next game said game crazy fun dont let saltiness community keep buying game grab fun ignore anyone might give 

#todo spellcheck

In [16]:
len(df)

405671

In [17]:
len(final_cleaned_reviews)

405671

In [18]:
df['cleaned_reviews'] = final_cleaned_reviews

# tf-idf below

In [None]:
raw_text = test_drop['review']
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

feature_names = vectorizer.get_feature_names()

In [None]:
# dense = vectors.todense()
vectorlist = vectors.tolist()
tf_idf = pd.DataFrame(vectorlist, columns=feature_names)

In [None]:
print(vectors.shape)
print(feature_names)

**Ben's Date Stuff**

In [19]:
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [20]:
df['Year'] = newColumns[0]
df['Month'] = newColumns[1]
df['Day'] = newColumns[2]
df.drop(columns = ['date_posted'], inplace = True)

df.head()

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,title_Battlefleet Gothic: Armada 2,...,title_Terraria,title_The Elder Scrolls V: Skyrim Special Edition,title_Tom Clancy's Rainbow Six® Siege,title_Wallpaper Engine,title_Wargroove,"title_Warhammer 40,000: Mechanicus",cleaned_reviews,Year,Month,Day
0,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,0,...,0,0,0,0,0,0,gt played german UNK declare war UNK cant brea...,2019,2,10
1,0,0,184,0,1,yes.,0,0,0,0,...,0,0,0,0,0,0,yes,2019,2,10
2,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,0,...,0,0,0,0,0,0,good game although bit overpriced opinion id p...,2019,2,7
7,295,219,71,0,1,I have never been told to kill myself more tha...,0,0,0,0,...,0,0,0,0,0,0,never told kill playing game,2018,12,24
9,380,271,414,0,1,if you think cs go is toxic try this game,0,0,0,0,...,0,0,0,0,0,0,think cs go toxic try game,2018,12,5


# Encoding text below

In [22]:
# remove reviews over a limit
removed_idx = []
for i in range(len(df)):
    review = df.iloc[i].cleaned_reviews
    words = review.split(" ")
    if len(words) > 200:
        removed_idx.append(i)


df_ = df.drop(df.index[removed_idx])
print ("removed {} reviews > 200 words".format(len(removed_idx)))

removed 2 reviews > 200 words


In [24]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}
for each_review in df_.cleaned_reviews.tolist():
    word_list = each_review.split(" ")
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)
# vocab_size =  52716
# max_seq_len =  7984

vocab_size =  41248
max_seq_len =  194


In [25]:
final_cleaned_reviews = df_.cleaned_reviews.tolist()

In [26]:
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in final_cleaned_reviews]
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

(405669, 194)

In [27]:
padded_reviews

array([[    0,     0,     0, ..., 25557, 14122, 23610],
       [    0,     0,     0, ...,     0,     0, 30552],
       [    0,     0,     0, ..., 40182,  6951, 39745],
       ...,
       [    0,     0,     0, ..., 16098, 28024, 35506],
       [    0,     0,     0, ..., 36768,  8813, 18763],
       [    0,     0,     0, ...,  2336, 17223, 23064]])

In [29]:
encoded_text_cols = padded_reviews.shape[1]
for col_idx in range(encoded_text_cols):
    df_.insert(len(df_.columns), "encoded_{}".format(col_idx+1), padded_reviews[:, col_idx])
df_.head()

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,title_Battlefleet Gothic: Armada 2,...,encoded_185,encoded_186,encoded_187,encoded_188,encoded_189,encoded_190,encoded_191,encoded_192,encoded_193,encoded_194
0,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,0,...,33572,15335,18484,25557,18484,12429,8541,25557,14122,23610
1,0,0,184,0,1,yes.,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30552
2,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,0,...,14122,39490,27271,39764,28167,8875,1494,40182,6951,39745
7,295,219,71,0,1,I have never been told to kill myself more tha...,0,0,0,0,...,0,0,0,0,0,34155,14517,19363,40032,14122
9,380,271,414,0,1,if you think cs go is toxic try this game,0,0,0,0,...,0,0,0,0,10818,2776,9421,19118,3218,14122


In [30]:
df_.shape

(405669, 252)

# Code below is to save cleaned dataset -- don't edit

In [31]:
def save_cleaned_dataset(final_df):
    # dataset must be a pd dataframe
    root = "data/"
    final_df.to_csv(root + "cleaned_steam_data_3-29.csv")
save_cleaned_dataset(df_) # testing

In [64]:
def save_cleaned_dataset(dataset):
    # dataset must be a np array
    root = "data/"
    cols_added = dataset.shape[1]
    col_names = []
    for col_idx in range(cols_added):
        col_names.append("embedded_{}".format(col_idx))
    dataset_df = pd.DataFrame(data=dataset, columns=col_names)
    dataset_df.to_csv("steam_text_data_3-28.csv")
#     savetxt(root + 'cleaned_steam_data_3-28.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_reviews) # testing

KeyboardInterrupt: 