In [2]:
import numpy as np
from numpy import savetxt
import pandas as pd
import matplotlib.pyplot as plt

# keras imports
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

from sklearn.feature_extraction.text import TfidfVectorizer

# import nltk

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
data_dir = "data/steam_reviews.csv"
df = pd.read_csv(data_dir)
print(type(df))
print(df)

<class 'pandas.core.frame.DataFrame'>
       date_posted  funny  helpful  hour_played  is_early_access_review  \
0       2019-02-10      2        4          578                   False   
1       2019-02-10      0        0          184                   False   
2       2019-02-07      0        0          892                   False   
3       2018-06-14    126     1086          676                   False   
4       2017-06-20     85     2139          612                   False   
...            ...    ...      ...          ...                     ...   
434886  2018-11-17      1       37           10                   False   
434887  2018-11-17      3       41           38                   False   
434888  2018-11-20      0        0           36                   False   
434889  2018-11-18      1       44           12                   False   
434890  2019-01-21      1       28           20                   False   

       recommendation                                        

In [4]:
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


# Clean up textual data (try this stuff later, do vanilla run first)
* Remove common stopwords?
* Lower case everything 
* All ‘s in the data is preceded by a ‘\’ which should be cleaned out
* Maybe don't lower case everything... ex: 
    * "I HIGHLY RECOMMEND THIS GAME CAUSE THERE ARE SOME KILLERS WILL FOLLOW YOU NO MATTER WHERE YOU GO IN WHICH YOU'LL GO LOOPING EACH OTHER Who said this game is a horror game but MEHHHH Lieesssss ahhahahaha xDDDI never laugh so hard cause of this game I LOVE IT &lt 3333"

In [None]:
subset_reviews = df['review'][:50].tolist()

In [None]:
# a lot of (), so I'm cleaning it out
# add space after "."
cleaned_reviews = []

for each_review in subset_reviews:
    each_cleaned_review = []
    words = each_review.split(" ")
    for idx, each_word in enumerate(words):
        cleaned = each_word.replace("(", "").replace(")", "") # remove ()
        if "." in cleaned:
            if cleaned.split(".")[1] not in ["", "'"]:
                cleaned = cleaned.replace(".", ". ") # add space after "." 
        each_cleaned_review.append(cleaned)
    cleaned_reviews.append(" ".join(each_cleaned_review))
cleaned_reviews[0]

In [None]:
# df['is_early_access_review'] = df['is_early_access_review'].astype('int')
# df['recommendation'] = df['recommendation'].map({'Recommended':1, 'Not Recommended':0})
# df = pd.get_dummies(df, columns = ['title'] )
df.head()


In [5]:
raw_text = df['review']
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(raw_text.apply(lambda x: np.str_(x))) #.apply(lambda x: np.str_(x))

# X_as_array = X.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
# print(len(X_as_array))

feature_names = vectorizer.get_feature_names()
# dense = vectors.todense()
# denselist = dense.tolist()
tf_idf = pd.DataFrame(vectors, columns=feature_names)


ValueError: Shape of passed values is (434891, 1), indices imply (434891, 221024)

In [6]:
print(vectors.shape)
print(len(feature_names))
print(vectors)

(434891, 221024)
221024
  (0, 80564)	0.477604160118617
  (0, 131209)	0.06157526962731858
  (0, 20129)	0.05405914598020932
  (0, 76287)	0.1607889165432482
  (0, 143109)	0.18335691936876072
  (0, 48647)	0.1587967897741817
  (0, 184040)	0.12009617973874011
  (0, 122420)	0.045204821858718644
  (0, 26375)	0.5734606605415646
  (0, 33893)	0.04768001993933596
  (0, 30592)	0.1033859108185502
  (0, 158329)	0.04841371155240533
  (0, 77663)	0.06763355601093524
  (0, 171090)	0.08038321265507184
  (0, 70410)	0.34571139445779414
  (0, 34299)	0.20690441600455656
  (0, 90300)	0.04086868568602996
  (0, 124322)	0.10345399240500303
  (0, 172263)	0.06643569610726961
  (0, 76349)	0.0986422316908891
  (0, 174835)	0.1037987983547056
  (0, 28433)	0.20690441600455656
  (0, 12708)	0.2134156667069383
  (0, 73431)	0.02501313747197943
  (0, 93772)	0.034083972580775404
  :	:
  (434890, 88824)	0.08118860896370828
  (434890, 111490)	0.09961697176786455
  (434890, 24345)	0.10181130057710057
  (434890, 68015)	0.10274922

**Ben's Date Stuff**

In [None]:
newColumns = df["date_posted"].str.split("-", n = 2, expand = True) 

In [None]:
df['Year'] = newColumns[0]
df['Month'] = newColumns[1]
df['Day'] = newColumns[2]
df.drop(columns = ['date_posted'], inplace = True)

df

# Encoding text below

In [None]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}
for each_review in cleaned_reviews:
    word_list = each_review.split(" ")
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

In [None]:
encoded_reviews = [one_hot(x, VOCAB_SIZE) for x in cleaned_reviews]
padded_reviews = pad_sequences(encoded_reviews, maxlen=MAX_SEQ_LEN, padding='pre')
padded_reviews.shape

In [None]:
padded_docs

# Code below is to save cleaned dataset -- don't edit

In [None]:
def save_cleaned_dataset(dataset):
    # dataset must be a np array
    root = "data/"
    savetxt(root + 'cleaned_steam_data.csv', dataset, delimiter=',')
save_cleaned_dataset(padded_docs) # testing