# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [19]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt
import re, itertools
import string
from collections import OrderedDict
from operator import itemgetter
import time

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

# nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# text cleanup
from pattern.en import suggest
import enchant
from enchant.checker import SpellChecker
from spellchecker import SpellChecker

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


# remove nan reviews

In [28]:
orig_len = len(df)
df.dropna(axis=0, inplace=True)
print ('dropped {} nan reviews'.format(orig_len - len(df)))

dropped 1516 nan reviews


In [29]:
table = str.maketrans('', '', string.punctuation)
len(table)

32

# one-hot encode title & early access & recommendation

In [30]:
df['is_early_access_review'] = df['is_early_access_review'].astype('int')
df['recommendation'] = df['recommendation'].map({'Recommended':1, 'Not Recommended':0})
df = pd.get_dummies(df, columns=['title'])
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,...,title_Subnautica,title_Subnautica: Below Zero,title_Survivor Pass: Vikendi,title_Tannenberg,title_Terraria,title_The Elder Scrolls V: Skyrim Special Edition,title_Tom Clancy's Rainbow Six® Siege,title_Wallpaper Engine,title_Wargroove,"title_Warhammer 40,000: Mechanicus"
0,2019-02-10,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2019-02-10,0,0,184,0,1,yes.,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-02-07,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-06-14,126,1086,676,0,1,Out of all the reviews I wrote This one is pro...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-06-20,85,2139,612,0,1,Disclaimer I survivor main. I play games for f...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* All ‘s in the data is preceded by a ‘\’ which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [31]:
subset_reviews = df['review'].tolist()

In [32]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [33]:
cleaned_reviews = []
remove_rows = []
stop_words = set(stopwords.words('english'))

orig_len = len(df)
start_time = time.time()
for i, each_review in enumerate(subset_reviews):
    each_cleaned_review = []
    words = each_review.split(" ")
    for idx, each_word in enumerate(words):
        encoded_text = each_word.encode('utf8')
        if "\\" in str(encoded_text):
            remove_rows.append(i)
            break
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "." 
        cleaned = cleaned.translate(table).lower()
        cleaned = reduce_lengthening(cleaned)
        if len(cleaned) != 0 and cleaned not in stop_words:
            #figure out how to stop empty strings
            each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
    if len(each_cleaned_review) == 0:
#         cleaned_reviews.append(" ".join(each_cleaned_review))
#     else:
        remove_rows.append(i)

cleaned_reviews = [i for j, i in enumerate(cleaned_reviews) if j not in remove_rows]
df.drop(df.index[remove_rows], inplace=True)
print ("Removed {} rows".format(orig_len - len(df)))
print ("time took: ".format(time.time() - start_time))
print(cleaned_reviews)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
print (len(cleaned_reviews))
print (len(df))

420952
420952


In [35]:
# this cell takes ~20 minutes to run
unknowns = {} 
chkr = enchant.checker.SpellChecker("en_EN")

start_time = time.time()

for i, review in enumerate(cleaned_reviews):
    if i % 50000 == 0:
        print("Took {} for 50000 reviews ".format(time.time() - start_time))
        start_time = time.time()
    each_cleaned_review = []
    words = review.split(" ")
    for idx, each_word in enumerate(words):
        if len(each_word) == 0: # to handle double spaces
            continue
        if not chkr.check(each_word):
            if not chkr.check(each_word[0].upper()+each_word[1:]): # check proper nouns
                if each_word not in unknowns.keys():
                    unknowns[each_word] = 1
                else:
                    unknowns[each_word] = unknowns[each_word] + 1

common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

# print(common_unknowns[0:20])

Took 0.0 for 50000 reviews 
Took 201.74360156059265 for 50000 reviews 
Took 98.34306955337524 for 50000 reviews 
Took 170.25280809402466 for 50000 reviews 
Took 136.42627453804016 for 50000 reviews 
Took 174.168310880661 for 50000 reviews 
Took 141.93176817893982 for 50000 reviews 
Took 140.22408533096313 for 50000 reviews 
Took 166.5477135181427 for 50000 reviews 


In [44]:
common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

print(common_unknowns)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [39]:
slang_dict = {}
with open("data/slangdict.txt") as f:
    for line in f:
        slang = line.split("-")
        if len(slang) > 1:
            key = slang[0].strip().translate(table).lower()
            val = slang[1].strip().translate(table).lower()
            slang_dict[key] = val
# print(slang_dict)

In [40]:
spell.correction('flashlights1010')

NameError: name 'spell' is not defined

In [45]:
spell = SpellChecker(distance=1)
final_cleaned_reviews = []
unk_counter = 0

start_time = time.time()
for i, review in enumerate(cleaned_reviews):
    if i%50000 == 0:
        print ("For 50000, took {} seconds".format(time.time() - start_time))
        start_time = time.time()
    each_cleaned_review = []
    words = review.split(" ")
    for idx, each_word in enumerate(words):
        if each_word in unknowns.keys():
            if unknowns[each_word] >= 10:
                continue
            else:
                if each_word in slang_dict.keys():
                    continue
                    # words[idx] =slang_dict[each_word]:
                # fix = suggest(each_word)
                # w1 = fix[0][0]
                w1 = spell.correction(each_word)
                if w1 != each_word:
                    words[idx] = w1
                else:
                    words[idx] = 'UNK'
                    unk_counter+=1
                    
    final_cleaned_reviews.append((" ".join(words)))

For 50000, took 0.0 seconds
For 50000, took 30.151373624801636 seconds
For 50000, took 15.430725812911987 seconds
For 50000, took 24.0816171169281 seconds
For 50000, took 21.93634867668152 seconds
For 50000, took 26.08426022529602 seconds
For 50000, took 21.73989200592041 seconds
For 50000, took 20.700653314590454 seconds
For 50000, took 25.330251693725586 seconds


In [46]:
unk_counter

154578

In [47]:
cleaned_reviews

['gt played german reichgt declare war belgiumgt cant break belgium go francegt capitulate france order get belgiumgt get true blitzkrieg achievementthis game dad',
 'yes',
 'good game although bit overpriced opinion id prefer playing game mods historical accuracy although vanilla version good aswell 710',
 'reviews wrote one probably serious one wrote starters community game sucks like every online game dont wanna talk theyre jerks wanna say suck camped blah blah blah rare get nice compliment people game but dont talk community either unlike online games one doesnt mic system dont worry hearing twelve year old cry baby raging nothing time talk people game beginning match starts survivor end match killer survivor end chat one killer talk everyone match though mostly say gg good game as survivor basically play every man sneaky say screw players try good team mate rescue get hook someone jerk chat match know save get hooked mostly play killer find fun playing survivor someones gotta play

#todo spellcheck

In [48]:
len(df)

420952

In [49]:
len(final_cleaned_reviews)

420952

In [50]:
df['cleaned_reviews'] = final_cleaned_reviews

In [None]:
raw_text = test_drop['review']
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

feature_names = vectorizer.get_feature_names()

In [None]:
# dense = vectors.todense()
vectorlist = vectors.tolist()
tf_idf = pd.DataFrame(vectorlist, columns=feature_names)

In [None]:
print(vectors.shape)
print(feature_names)

**Ben's Date Stuff**

In [52]:
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [53]:
df['Year'] = newColumns[0]
df['Month'] = newColumns[1]
df['Day'] = newColumns[2]
df.drop(columns = ['date_posted'], inplace = True)

df.head()

Unnamed: 0,funny,helpful,hour_played,is_early_access_review,recommendation,review,title_ACE COMBAT™ 7: SKIES UNKNOWN,title_ARK: Survival Evolved,title_ASTRONEER,title_Battlefleet Gothic: Armada 2,...,title_Terraria,title_The Elder Scrolls V: Skyrim Special Edition,title_Tom Clancy's Rainbow Six® Siege,title_Wallpaper Engine,title_Wargroove,"title_Warhammer 40,000: Mechanicus",cleaned_reviews,Year,Month,Day
0,2,4,578,0,1,&gt Played as German Reich&gt Declare war on B...,0,0,0,0,...,0,0,0,0,0,0,gt played german UNK declare war UNK cant brea...,2019,2,10
1,0,0,184,0,1,yes.,0,0,0,0,...,0,0,0,0,0,0,yes,2019,2,10
2,0,0,892,0,1,Very good game although a bit overpriced in my...,0,0,0,0,...,0,0,0,0,0,0,good game although bit overpriced opinion id p...,2019,2,7
3,126,1086,676,0,1,Out of all the reviews I wrote This one is pro...,0,0,0,0,...,0,0,0,0,0,0,reviews wrote one probably serious one wrote s...,2018,6,14
4,85,2139,612,0,1,Disclaimer I survivor main. I play games for f...,0,0,0,0,...,0,0,0,0,0,0,disclaimer survivor main play games fun compet...,2017,6,20


# Encoding text below

In [54]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}
for each_review in final_cleaned_reviews:
    word_list = each_review.split(" ")
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

vocab_size =  52716
max_seq_len =  7984


In [55]:
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in final_cleaned_reviews]
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

(420952, 7984)

In [57]:
padded_reviews

array([[    0,     0,     0, ..., 37261, 29001,  1333],
       [    0,     0,     0, ...,     0,     0, 13974],
       [    0,     0,     0, ...,  1558, 48883, 52226],
       ...,
       [    0,     0,     0, ..., 49683, 33130, 15486],
       [    0,     0,     0, ..., 16438, 11520, 50366],
       [    0,     0,     0, ...,  3311, 30022,  4011]])

In [59]:
encoded_text_cols = padded_reviews.shape[1]
for col_idx in range(encoded_text_cols):
    df.insert(len(df.columns), "encoded_{}".format(col_idx+1), padded_reviews[:, col_idx])
padded_reviews.head()

MemoryError: Unable to allocate 6.39 GiB for an array with shape (4075, 420952) and data type int32

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(final_df):
    # dataset must be a pd dataframe
    root = "data/"
    final_df.to_csv("cleaned_steam_data_3-28")
save_cleaned_dataset(padded_reviews) # testing

In [64]:
def save_cleaned_dataset(dataset):
    # dataset must be a np array
    root = "data/"
    cols_added = dataset.shape[1]
    col_names = []
    for col_idx in range(cols_added):
        col_names.append("embedded_{}".format(col_idx))
    dataset_df = pd.DataFrame(data=dataset, columns=col_names)
    dataset_df.to_csv("steam_text_data_3-28.csv")
#     savetxt(root + 'cleaned_steam_data_3-28.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_reviews) # testing

KeyboardInterrupt: 