# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

from sklearn.feature_extraction.text import TfidfVectorizer

import re, itertools

import nltk

from nltk.corpus import stopwords

from pattern.en import suggest

import enchant

from enchant.checker import SpellChecker
from spellchecker import SpellChecker

from IPython.core.display import display, HTML
# Inject a CSS rule that sets the `.container` element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Relative path of the raw reviews CSV (despite the name, this is a file path, not a directory).
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
# print(type(df))
# Preview the first rows of the loaded frame.
df.head()

# Remove NaN reviews

In [None]:
# Remember the row count so the number of dropped rows can be reported.
orig_len = len(df)
# NOTE(review): no `subset` is given, so rows with a NaN in ANY column are dropped,
# not only rows with a missing review — confirm that this is intended.
df.dropna(axis=0, inplace=True)
print ('dropped {} nan reviews'.format(orig_len - len(df)))

In [None]:
import string

# Translation table mapping every character in string.punctuation to None,
# so that str.translate(table) deletes those characters.
table = str.maketrans(dict.fromkeys(string.punctuation))
len(table)

In [None]:
# table

In [None]:
# Sanity check of the translation table: punctuation is removed, then the rest is lower-cased.
'@!testTEST!@#!@#'.translate(table).lower()

# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* Every `'s` in the data is preceded by a `\`, which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [None]:
# For a quick experiment on the first 50 rows, use this line instead:
# subset_reviews = df['review'][:50].tolist()
# Plain Python list of every value in the 'review' column, in row order.
subset_reviews = df['review'].tolist()

In [None]:
# a lot of (), so I'm cleaning it out
# add space after "."
# cleaned_reviews = []

# for each_review in subset_reviews:
#     each_cleaned_review = []
#     words = each_review.split(" ")
#     for idx, each_word in enumerate(words):
#         cleaned = each_word.replace("(", "").replace(")", "") # remove ()
#         if "." in cleaned:
#             if cleaned.split(".")[1] not in ["", "'"]:
#                 cleaned = cleaned.replace(".", ". ") # add space after "." 
#         cleaned = cleaned.translate(table).lower()
#         each_cleaned_review.append(cleaned)
#     cleaned_reviews.append(" ".join(each_cleaned_review))
# cleaned_reviews[0]

In [None]:
def reduce_lengthening(text):
    """Shorten elongated character runs in `text`.

    Any character repeated three or more times in a row is reduced to exactly
    two occurrences (e.g. "MEHHHH" -> "MEHH"); runs of one or two characters
    are left unchanged.

    Args:
        text: the string to normalise.

    Returns:
        A new string with long runs collapsed.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)


# word = "pls"
# word_wlf = reduce_lengthening(word) #calling function defined above
# print(word_wlf) #word lengthening isn't being able to fix it completely

# correct_word = suggest(word_wlf) 
# print(correct_word)

# from autocorrect import Speller

# spell = Speller(lang='en')
# print(spell(word))

In [None]:
# words = ['&gt', 'Played', 'as', 'German', 'Reich&gt', 'Declare', 'war', 'on', 'Belgium&gt', "Can't", 'break', 'Belgium', 'so', 'go', 'through', 'France&gt', 'Capitulate', 'France', 'in', 'order', 'to', 'get', 'to', 'Belgium&gt', 'Get', 'True', 'Blitzkrieg', 'achievementThis', 'game', 'is', 'dad']
# for idx, each_word in enumerate(words):
#     print(each_word)

In [None]:
from nltk.corpus import stopwords
from collections import OrderedDict
from operator import itemgetter

# Clean every review word by word:
#   * strip parentheses, add a space after a mid-word ".", remove punctuation,
#     lower-case, and collapse elongated character runs;
#   * drop empty tokens and English stop words;
#   * discard the WHOLE review when any word shows a backslash in the repr of
#     its UTF-8 bytes (non-ASCII bytes, a literal "\", escaped control chars),
#     or when nothing is left after cleaning.
cleaned_reviews = []
# Positions (indices into subset_reviews) of the reviews that were discarded.
remove_rows = []
stop_words = set(stopwords.words('english'))


for i, each_review in enumerate(subset_reviews):
    each_cleaned_review = []
    discard_review = False
    for each_word in each_review.split(" "):
        encoded_text = each_word.encode('utf8')
        if "\\" in str(encoded_text):
            # Stop scanning: the review as a whole is rejected, so nothing
            # collected so far may be kept.
            discard_review = True
            break
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "."
        cleaned = cleaned.translate(table).lower()
        cleaned = reduce_lengthening(cleaned)
        # A skipped stop word or empty token does not disqualify the review.
        if len(cleaned) != 0 and cleaned not in stop_words:
            each_cleaned_review.append(cleaned)
    if discard_review or len(each_cleaned_review) == 0:
        # Record the review position (not a word position) exactly once.
        remove_rows.append(i)
    else:
        cleaned_reviews.append(" ".join(each_cleaned_review))

# Keep df row-aligned with cleaned_reviews so the list can later be attached
# as a column. The length guard makes re-running this cell harmless and skips
# the drop when subset_reviews is only a sample of df.
if remove_rows and len(df) == len(subset_reviews):
    df = df.drop(df.index[remove_rows])
# print(cleaned_reviews)

In [None]:
# Spot-check a single cleaned review.
# NOTE(review): 4775 is a hard-coded position; this raises IndexError if fewer reviews survive cleaning.
print(cleaned_reviews[4775])

In [None]:
# Frequency table of words the enchant dictionary rejects: word -> number of
# occurrences across all cleaned reviews.
unknowns = {}

chkr = enchant.checker.SpellChecker("en_EN")

for review in cleaned_reviews:
    for each_word in review.split(" "):
        if len(each_word) == 0:
            continue
        if chkr.check(each_word):
            continue
        # Retry with the first letter upper-cased before counting the word as
        # unknown (presumably so lower-cased proper nouns still pass — verify).
        if chkr.check(each_word[0].upper() + each_word[1:]):
            continue
        unknowns[each_word] = unknowns.get(each_word, 0) + 1



In [None]:
# Order the unknown-word counts from most to least frequent.
common_unknowns = OrderedDict(
    sorted(unknowns.items(), key=lambda word_count: word_count[1], reverse=True)
)

print(common_unknowns)


In [None]:
# Scratch cell: compare the available spell-checking tools on one misspelled word.
word_wlf = 'pervect'

# enchant verdict for the word (checker created in the unknown-word counting cell).
print(chkr.check(word_wlf))

# pattern.en `suggest`; indexed below as correct_word[0][1], i.e. a sequence of pairs.
correct_word = suggest(word_wlf) 
print(correct_word)
print(correct_word[0][1])

# from autocorrect import Speller

# spells = Speller(lang='en')
# print(spells(word_wlf))

# `SpellChecker` resolves to the class imported from `spellchecker`: that import
# comes after `from enchant.checker import SpellChecker` and shadows it.
spell = SpellChecker(distance=1)
print(list(spell.unknown([word_wlf])))
print(list(spell.known([word_wlf])))
# NOTE(review): the [0] raises IndexError if `unknown` returns nothing for this word.
w = list(spell.unknown([word_wlf]))[0]
print(spell.correction(w))
print(spell.candidates(w))


In [None]:
# Build a slang -> meaning lookup from "slang - meaning" lines.
# Both sides are stripped, de-punctuated with `table`, and lower-cased;
# lines without a "-" are ignored, and only the first two "-"-separated
# fields of a line are used.
slang_dict = {}
with open("data/slangdict.txt") as slang_file:
    for raw_line in slang_file:
        fields = raw_line.split("-")
        if len(fields) <= 1:
            continue
        term, meaning = (
            field.strip().translate(table).lower() for field in fields[:2]
        )
        slang_dict[term] = meaning
print(slang_dict)

In [None]:
spell = SpellChecker(distance=1)

# Unknown words seen at least this many times are left untouched.
COMMON_UNKNOWN_THRESHOLD = 500

# word -> replacement token. Each distinct word is looked up once instead of
# on every occurrence, since both spell-checkers are called with the word alone.
corrections = {}

# Replace rare unknown words in place: use the suggested spelling when
# pattern's `suggest` and `spell.correction` agree, otherwise the token 'UNK'.
# Frequent unknown words and known slang terms are kept as they are.
for i, review in enumerate(cleaned_reviews):
    words = review.split(" ")
    for idx, each_word in enumerate(words):
        if each_word not in unknowns:
            continue
        if unknowns[each_word] >= COMMON_UNKNOWN_THRESHOLD:
            continue
        if each_word in slang_dict:
            continue
        if each_word not in corrections:
            w1 = suggest(each_word)[0][0]
            w2 = spell.correction(each_word)
            corrections[each_word] = w1 if w1 == w2 else 'UNK'
        words[idx] = corrections[each_word]
    cleaned_reviews[i] = (" ".join(words))
# print(cleaned_reviews)

#todo spellcheck

In [None]:
# df['is_early_access_review'] = df['is_early_access_review'].astype('int')
# df['recommendation'] = df['recommendation'].map({'Recommended':1, 'Not Recommended':0})
# df = pd.get_dummies(df, columns = ['title'] )
# Attach the cleaned text as a new column.
# NOTE(review): assigning a list requires len(cleaned_reviews) == len(df); confirm that
# every review skipped during cleaning has also been removed from df.
df['cleaned_reviews'] = cleaned_reviews
df.head()

In [None]:
# Number of rows currently in df.
len(df)

In [None]:
# clean out foreign language and bold/italics, etc
# reviews = df['cleaned_reviews'].tolist()
# reviews = df['review'].tolist()
# deleted_rows = 0
# bad_rows = []

# for idx, eachreview in enumerate(reviews):
#     word_list = eachreview.split(" ")
#     for eachword in word_list:
#         encoded_text = eachword.encode('utf8')
#         if "\\" in str(encoded_text):
#             bad_rows.append(idx)
#             break


# test_drop = df.drop(df.index[bad_rows])

In [None]:
# NOTE(review): `test_drop` is only assigned inside the commented-out cell above, so on a
# fresh kernel this line raises NameError — restore that cell or remove this check.
len(df) - len(test_drop)
# dropped less, look into this

In [None]:
# Fit a TF-IDF model on the raw review text.
# `test_drop` is only created by a cell that is commented out, so it does not
# exist on a fresh kernel; the reviews are read from `df` instead.
raw_text = df['review']
vectorizer = TfidfVectorizer(use_idf=True)
# Coerce every value to a string first so non-string entries cannot break tokenisation.
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it, and fall back for older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# `vectors` comes from TfidfVectorizer.fit_transform, i.e. a SciPy sparse matrix,
# which has no .tolist(). Build a sparse-backed DataFrame directly instead of
# densifying a (documents x vocabulary) matrix in memory.
tf_idf = pd.DataFrame.sparse.from_spmatrix(vectors, columns=feature_names)

In [None]:
# Shape of the TF-IDF matrix (presumably documents x vocabulary terms).
print(vectors.shape)
# NOTE(review): prints the whole vocabulary; the output can be very long.
print(feature_names)

**Ben's Date Stuff**

In [None]:
# Split date_posted on "-" (at most two splits) into separate columns 0, 1 and 2;
# the next cell stores them as Year, Month and Day.
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [None]:
# Store the three pieces of the split date as their own columns, then remove
# the original date_posted column.
for position, column_name in enumerate(['Year', 'Month', 'Day']):
    df[column_name] = newColumns[position]
df.drop(columns = ['date_posted'], inplace = True)

df

# Encoding text below

In [None]:
# Scan the cleaned reviews once to get the two sizes needed for encoding:
# the number of distinct tokens and the token count of the longest review.
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# token -> occurrence count over all cleaned reviews
all_words = {}
for each_review in cleaned_reviews:
    word_list = each_review.split(" ")
    # remember the longest review seen so far (kept in `sent` for inspection)
    if MAX_SEQ_LEN < len(word_list):
        MAX_SEQ_LEN, sent = len(word_list), word_list

    for token in word_list:
        all_words[token] = all_words.get(token, 0) + 1
VOCAB_SIZE = len(all_words)
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

In [None]:
# Encode each cleaned review with keras `one_hot`, passing VOCAB_SIZE as its size argument.
# NOTE(review): per the Keras docs this is hash-based, so distinct words may collide — confirm that is acceptable.
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in cleaned_reviews]
# Pad at the front ('pre') up to MAX_SEQ_LEN, the longest review length measured earlier.
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

In [None]:
# Display the padded, encoded reviews (the array is named `padded_reviews`;
# `padded_docs` is never defined in this notebook).
padded_reviews

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(dataset):
    """Write `dataset` to data/cleaned_steam_data.csv as comma-separated text.

    Args:
        dataset: the array to save; passed straight to numpy's `savetxt`.

    NOTE(review): no `fmt` is passed to `savetxt`, so numpy's default number
    format applies even to integer data — confirm that this is the wanted
    file format.
    """
    # dataset must be a np array
    root = "data/"
    savetxt(root + 'cleaned_steam_data.csv', dataset, delimiter=',')
# Save the padded, encoded reviews (`padded_docs` is never defined in this
# notebook; the array is named `padded_reviews`).
save_cleaned_dataset(padded_reviews) # testing