# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [None]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import re, itertools
import string
from collections import OrderedDict
from operator import itemgetter
import time

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

# nltk
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('stopwords')

# text cleanup
# from pattern.en import suggest
import enchant
# NOTE(review): the two imports below bind the same name; the second one wins, so after this
# cell `SpellChecker` refers to the `spellchecker` module's class and the enchant checker is
# only reachable through its qualified name `enchant.checker.SpellChecker`.
from enchant.checker import SpellChecker
from spellchecker import SpellChecker

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Load the raw Steam reviews into `df` and preview the first rows.
# Note that `data_dir` holds the path of the CSV file itself, not of a directory.
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
df.head()

# Percentile graphing below (warning: this cell may take a very long time to run)

In [None]:
# Explore the "helpful" vote counts and the balance of the recommendation label.
# perc = np.percentile(df['helpful'], 99) #10,25,50,75,90
d = df['helpful']
print(np.max(d))

# Upper tail only: ten evenly spaced percentiles between the 90th and the 100th.
p = np.linspace(90, 100, 10)
# p = [10.0,25.0,50.0,75.0,90.0]
perc = np.percentile(d, p)

# Raw vote count of every review, in row order.
fig_votes, ax_votes = plt.subplots()
ax_votes.plot(d)
ax_votes.set_ylabel('# of Helpful Votes')
ax_votes.set_xlabel('Reviews')
plt.show()

# Vote count at each of the percentiles computed above.
fig, ax = plt.subplots()
ax.plot(p, perc)
ax.set_ylabel('# of Votes')
ax.set_xlabel('Percentile')
plt.show()

rec = df['recommendation']
# When the notebook runs top to bottom the label is still the raw text here (it is mapped
# to 1/0 in a later cell), so summing the column does not produce a count. Matching both
# encodings makes the cell work before and after that mapping.
yes = int(rec.isin(['Recommended', 1]).sum())
no = len(rec) - yes
print(yes)
print(no)

# Class balance of the label.
x = ("Recommended", "Not Recommended")
y = [yes, no]
fig2, ax2 = plt.subplots()
ax2.bar(x, y, align='center', alpha=0.5)
ax2.set_ylabel('# of Reviews')
plt.show()

# remove nan reviews

In [None]:
# Drop every row that has a missing value in any column and report how many went away.
# NOTE(review): this is not restricted to the `review` column -- confirm that is intended.
orig_len = df.shape[0]
df.dropna(axis='index', how='any', inplace=True)
print ('dropped {} nan reviews'.format(orig_len - df.shape[0]))

In [None]:
# Translation table for str.translate(): every ASCII punctuation character maps to None,
# i.e. it is deleted from the translated string.
table = {ord(symbol): None for symbol in string.punctuation}
len(table)

# Moved the below code after removing nan reviews, also refactored for speed

In [None]:
# Number of space-separated tokens in each raw review, then the mean and the total.
r = df["review"].tolist()
r1 = [len(text.split(" ")) for text in r]
print (np.mean(r1))
print (np.sum(r1))

# one-hot encode title & early access & recommendation

In [None]:
# Turn the early-access flag into an int, the recommendation text into 1/0, and the game
# title into one indicator column per title.
# NOTE(review): running this cell a second time on the same `df` will not work as-is:
# the label values no longer match the mapping keys and the `title` column is gone.
recommendation_codes = {'Recommended':1, 'Not Recommended':0}
df = pd.get_dummies(
    df.assign(
        is_early_access_review=df['is_early_access_review'].astype('int'),
        recommendation=df['recommendation'].map(recommendation_codes),
    ),
    columns=['title'],
)
df.head()

# Only keep reviews <= 200 words
* Need to calculate 10th, 25th, 50th, 75th, 90th percentile of helpful/funny votes on the removed reviews to unders

In [None]:
# remove reviews over a limit
# Reviews with more space-separated tokens than this are dropped.
MAX_RAW_WORDS = 200

# Count tokens for the whole column at once instead of building one row Series per
# review with df.iloc[i], which is very slow on a large frame.
review_word_counts = df['review'].str.split(" ").str.len()

# Positional (0-based) row numbers of the reviews that exceed the limit.
removed_idx = np.flatnonzero((review_word_counts > MAX_RAW_WORDS).values).tolist()

df.drop(df.index[removed_idx], inplace=True)
print ("removed {} reviews > {} words".format(len(removed_idx), MAX_RAW_WORDS))

# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* Every ‘s in the data is preceded by a ‘\’, which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [None]:
# Snapshot of the `review` column as a plain Python list; positions in this list line up
# with the row positions of `df` at the time this cell runs.
subset_reviews = df['review'].tolist()

In [None]:
def reduce_lengthening(text):
    """Shorten exaggerated character runs in ``text``.

    Any character repeated three or more times in a row is collapsed to exactly
    two occurrences (``"soooo"`` -> ``"soo"``); runs of one or two are left alone.
    Newlines are never collapsed because ``.`` does not match them here.

    Returns the rewritten string.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

In [None]:
# Clean every review token by token: strip parentheses and punctuation, lower-case,
# squeeze exaggerated character runs, drop stop words and lemmatize what is left.
# Reviews that end up empty, or that contain a token whose UTF-8 byte repr needs a
# backslash escape, are removed from both `cleaned_reviews` and `df`.
lemmatizer = WordNetLemmatizer()
cleaned_reviews = []
remove_rows = []
stop_words = set(stopwords.words('english'))

orig_len = len(df)
start_time = time.time()
for i, each_review in enumerate(subset_reviews):
    each_cleaned_review = []
    drop_review = False
    for each_word in each_review.split(" "):
        # str() of a bytes object is its repr, so a backslash shows up for non-ASCII
        # bytes, control characters and literal backslashes; such reviews are discarded.
        if "\\" in str(each_word.encode('utf8')):
            drop_review = True
            break
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "."
        cleaned = cleaned.translate(table).lower()
        cleaned = reduce_lengthening(cleaned)
        if len(cleaned) != 0 and cleaned not in stop_words:
            cleaned = lemmatizer.lemmatize(cleaned)
            each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
    # Record each position once, even when a review is both flagged and empty.
    if drop_review or len(each_cleaned_review) == 0:
        remove_rows.append(i)

# Set membership keeps this filter linear in the number of reviews.
rows_to_remove = set(remove_rows)
cleaned_reviews = [text for j, text in enumerate(cleaned_reviews) if j not in rows_to_remove]
df.drop(df.index[remove_rows], inplace=True)
print ("Removed {} rows".format(orig_len - len(df)))
# The original format string had no placeholder, so the elapsed time was never shown.
print ("time took: {}".format(time.time() - start_time))
# Show a small sample only; printing the whole corpus floods the notebook output.
print(cleaned_reviews[:5])

In [None]:
print (len(cleaned_reviews))
print (len(df))

In [None]:
# this cell takes ~20 minutes to run
#DOESNT LOOK LIKE WE ARE USING THIS?
# unknowns = {} 
# chkr = enchant.checker.SpellChecker("en_EN")

# start_time = time.time()

# for i, review in enumerate(cleaned_reviews):
#     if i % 50000 == 0:
#         print("Took {} for 50000 reviews ".format(time.time() - start_time))
#         start_time = time.time()
#     each_cleaned_review = []
#     words = review.split(" ")
#     for idx, each_word in enumerate(words):
#         if len(each_word) == 0: # to handle double spaces
#             continue
#         if not chkr.check(each_word):
#             if not chkr.check(each_word[0].upper()+each_word[1:]): # check proper nouns
#                 if each_word not in unknowns.keys():
#                     unknowns[each_word] = 1
#                 else:
#                     unknowns[each_word] = unknowns[each_word] + 1

# common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

# print(common_unknowns[0:20])

In [None]:
# common_unknowns = OrderedDict(sorted(unknowns.items(), key = itemgetter(1), reverse = True))

# print(common_unknowns)

In [None]:
# print(len(common_unknowns))

In [None]:
# slang_dict = {}
# with open("data/slangdict.txt") as f:
#     for line in f:
#         slang = line.split("-")
#         if len(slang) > 1:
#             key = slang[0].strip().translate(table).lower()
#             val = slang[1].strip().translate(table).lower()
#             slang_dict[key] = val
# print(slang_dict)

In [None]:
# Frequency of every space-separated token across all cleaned reviews.
word_count_dict = {}
# Start the timer in this cell; otherwise the first progress line reports the time
# elapsed since whichever earlier cell last assigned `start_time`.
start_time = time.time()
for i, review in enumerate(cleaned_reviews):
    if i% 50000 == 0:
        print ("For 50000, took {} seconds".format(time.time() - start_time))
        start_time = time.time()
    for each_word in review.split(" "):
        word_count_dict[each_word] = word_count_dict.get(each_word, 0) + 1

In [None]:
# Keep only the 10,000 most frequent tokens, ordered from most to least frequent.
ordered_word_count_dict = OrderedDict(sorted(word_count_dict.items(), key = itemgetter(1), reverse = True)[:10000])

In [None]:
print(len(ordered_word_count_dict))

In [None]:
# Rewrite every cleaned review so that any token outside the retained vocabulary
# becomes the placeholder 'UNK'.
final_cleaned_reviews = []
unk_counter = 0  # only referenced by the commented-out spell-correction variant of this cell
vocab_10k = ordered_word_count_dict.keys()

start_time = time.time()
for i, review in enumerate(cleaned_reviews):
    if i% 50000 == 0:
        print ("For 50000, took {} seconds".format(time.time() - start_time))
        start_time = time.time()
    # Build the replacement text directly instead of mutating the split list in place.
    final_cleaned_reviews.append(
        " ".join(word if word in vocab_10k else 'UNK' for word in review.split(" "))
    )

### For steam dataset, set count to < 10 for 15K word dictionary size

In [None]:
# spell = SpellChecker(distance=1)
# final_cleaned_reviews = []
# unk_counter = 0

# start_time = time.time()
# for i, review in enumerate(cleaned_reviews):
#     if i% 50000 == 0:
#         print ("For 50000, took {} seconds".format(time.time() - start_time))
#         start_time = time.time()
#     each_cleaned_review = []
#     words = review.split(" ")
#     for idx, each_word in enumerate(words):
#         if word_count_dict[each_word] < 10:
#             words[idx] = 'UNK'
# #         if each_word in unknowns.keys():
# #             if unknowns[each_word] < 10000000: #trying to cut down words
# #                 if each_word in slang_dict.keys():
# #                     continue
# #                 w1 = spell.correction(each_word)
# #                 if w1 != each_word:
# #                     words[idx] = w1
# #                 else:
# #                 words[idx] = 'UNK'
# #                 unk_counter+=1
                    
#     final_cleaned_reviews.append((" ".join(words)))

In [None]:
final_cleaned_reviews

In [None]:
# find out vocab size after this cleaning portion
# Distinct tokens of the final reviews, stored as dict keys (every value is 1).
word_set = dict.fromkeys(
    (token for text in final_cleaned_reviews for token in text.split(" ")),
    1,
)

In [None]:
# cleaned vocab size
# 1K min word count --> 38K vocab size
# 2k min word count --> 38k vocab size
# 10K min word count --> 30K


len(word_set.keys())

In [None]:
len(word_set)

# todo spellcheck

In [None]:
len(df)

In [None]:
len(final_cleaned_reviews)

In [None]:
df['cleaned_reviews'] = final_cleaned_reviews

# tf-idf below

In [None]:
# Fit a TF-IDF model on the raw review text.
# NOTE(review): this cell used to read from `test_drop`, a name that is never defined in
# this notebook, so it failed on a fresh kernel. `df` is assumed to be the intended
# frame -- confirm.
raw_text = df['review']
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

# Newer scikit-learn releases only provide get_feature_names_out(); older ones only
# provide get_feature_names(). Support both and always end up with a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# dense = vectors.todense()
# `vectors` is a scipy sparse matrix: it has no .tolist(), and turning it into a dense
# documents-by-vocabulary table would need far more memory than the sparse form.
# Wrap it in a DataFrame with sparse columns instead.
tf_idf = pd.DataFrame.sparse.from_spmatrix(vectors, columns=feature_names)

In [None]:
print(vectors.shape)
print(feature_names)

**Ben's Date Stuff**

In [None]:
# Split `date_posted` on "-" (at most two splits) into up to three string columns, 0..2.
# NOTE(review): presumably the dates are formatted year-month-day -- confirm against the data.
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [None]:
# Attach the three split date parts as their own columns, then retire the original column.
for part_position, part_name in enumerate(('Year', 'Month', 'Day')):
    df[part_name] = newColumns[part_position]
df.drop(columns = ['date_posted'], inplace = True)

df.head()

# Encoding text below

In [None]:
# Word-count distribution of the cleaned reviews.
# Token counts are computed for the whole column at once rather than one df.iloc[i]
# row at a time, which is very slow on a large frame.
review_lengths = df['cleaned_reviews'].str.split(" ").str.len()
len_set = sorted(review_lengths.tolist())
max_len = len_set[-1] if len_set else -1

# Print a compact summary instead of every single length.
print("reviews:", len(len_set), "| max words:", max_len)
if len_set:
    summary_percentiles = [50, 75, 90, 95, 99]
    print(dict(zip(summary_percentiles, np.percentile(len_set, summary_percentiles))))

In [None]:
print(len_set)

In [None]:
# remove reviews over a limit
# Cleaned reviews with more space-separated tokens than this are left out of `df_`.
MAX_CLEANED_WORDS = 171

cleaned_word_counts = df['cleaned_reviews'].str.split(" ").str.len()

# Positional (0-based) row numbers of the reviews that exceed the limit.
removed_idx = np.flatnonzero((cleaned_word_counts > MAX_CLEANED_WORDS).values).tolist()

# `df` itself is left untouched; the filtered copy lives in `df_`.
df_ = df.drop(df.index[removed_idx])
# The message used to say "> 200 words" although the threshold applied here is 171.
print ("removed {} reviews > {} words".format(len(removed_idx), MAX_CLEANED_WORDS))

In [None]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# One pass over the kept reviews: tally how often each token occurs and remember the
# longest review (its token list is kept in `sent`).
all_words = {}
for review_text in df_.cleaned_reviews.tolist():
    tokens = review_text.split(" ")
    if MAX_SEQ_LEN < len(tokens):
        MAX_SEQ_LEN, sent = len(tokens), tokens
    for token in tokens:
        all_words[token] = all_words.get(token, 0) + 1

# find vocab_size
VOCAB_SIZE = len(all_words)
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)
# vocab_size =  52716
# max_seq_len =  7984

In [None]:
final_cleaned_reviews = df_.cleaned_reviews.tolist()

In [None]:
# Turn every cleaned review into a list of integers with Keras' one_hot, using VOCAB_SIZE
# as the size argument.
# NOTE(review): one_hot is presumably hashing-based, in which case different words can
# share an index -- confirm this is acceptable for the model.
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in final_cleaned_reviews]
# Bring all sequences to MAX_SEQ_LEN entries, padding at the front ('pre').
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

In [None]:
padded_reviews

In [None]:
# Append the padded integer sequences to `df_` as columns encoded_1 .. encoded_N.
encoded_text_cols = padded_reviews.shape[1]
encoded_col_names = ["encoded_{}".format(col_idx+1) for col_idx in range(encoded_text_cols)]

# Build all new columns as one frame and join once: inserting them one by one fragments
# the DataFrame and fails with "already exists" when the cell is run a second time.
encoded_df = pd.DataFrame(padded_reviews, index=df_.index, columns=encoded_col_names)
df_ = pd.concat(
    [df_.drop(columns=encoded_col_names, errors="ignore"), encoded_df],
    axis=1,
)
df_.head()

In [None]:
df_.shape

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(final_df):
    """Write ``final_df`` as CSV into the ``data/`` folder.

    ``final_df`` must be a pandas DataFrame. The file name is fixed, so an
    existing export with the same name is overwritten. Returns nothing.
    """
    output_dir = "data/"
    file_name = "cleaned_steam_data_4-15_15Kwords.csv"
    destination = output_dir + file_name
    final_df.to_csv(destination)
save_cleaned_dataset(df_) # testing

In [None]:
def save_cleaned_dataset(dataset):
    """Write a 2-D numpy array as CSV with columns ``embedded_0 .. embedded_{k-1}``.

    ``dataset`` must be a numpy array; one column header is generated per array
    column. Returns nothing.

    NOTE(review): this definition reuses the name of the DataFrame-based saver
    defined in the previous cell and therefore replaces it once this cell runs.
    """
    # NOTE(review): `root` is assigned but not used below, so the file lands in the
    # current working directory rather than under data/ -- confirm that is intended.
    root = "data/"
    n_columns = dataset.shape[1]
    headers = ["embedded_{}".format(position) for position in range(n_columns)]
    pd.DataFrame(data=dataset, columns=headers).to_csv("steam_text_data_4-15.csv")
#     savetxt(root + 'cleaned_steam_data_3-28.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_reviews) # testing

# AMAZON DATASET CLEANING

In [None]:
import gzip

def parse(path):
    """Yield one record per line of the gzip-compressed file at ``path``.

    Each line is evaluated as a Python expression and the resulting object is
    yielded. The file is closed when the generator is exhausted or closed.
    """
    # NOTE(security): eval() executes whatever the file contains, so only use this on
    # dumps from a trusted source. If the lines turn out to be strict JSON, json.loads
    # would be the safer parser -- TODO confirm the file format before switching.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

def getDF(path):
    """Load every record of the gzip file at ``path`` into a DataFrame.

    Rows are labelled 0, 1, 2, ... in file order; each record is expected to be a
    dict whose keys become the columns.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

df = getDF('data/reviews_Video_Games_5.json.gz')

In [None]:
df.head()