 
## XY Hacker Data pull and initial cleaning

In [1]:
import pandas as pd

# pd.set_option('display.max_colwidth', -1)
# pd.set_option('display.max_rows', 10000)

## Import data and convert to pandas DF

In [4]:
%%time
import gzip
import json

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

data_ratings = getDF('/Users/lorinfields/Dropbox/Lambda/mlai/xyhacker/localFolder/data/reviews_Grocery_and_Gourmet_Food.json.gz')

print(data_ratings.shape)
print(data_ratings.dtypes)
print(data_ratings.head(4))
df_ratings = pd.DataFrame(data_ratings)

(1297156, 9)
reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
dtype: object
       reviewerID        asin                  reviewerName helpful  \
0  A1ZQZ8RJS1XVTX  0657745316                      gsxrgirl  [0, 0]   
1  A31W38VGZAUUM4  0700026444                      FIFA Lvr  [1, 1]   
2  A3I0AV0UJX5OH0  1403796890                      Alicia b  [0, 0]   
3  A3QAAOLIXKV383  1403796890  Danny K. Tilley "Dan Tilley"  [0, 0]   

                                          reviewText  overall  \
0  No sugar, no GMO garbage, no fillers that come...      5.0   
1  This is my absolute, undisputed favorite tea r...      5.0   
2  I ordered spongbob slippers and I got John Cen...      1.0   
3  The cart is fine and works for the purpose for...      3.0   

                      summary  unixReviewTime   reviewTim

## Import and enable gender_guesser

In [5]:
! pip install --upgrade pip
! pip3 install gender-guesser

Requirement already up-to-date: pip in /Users/lorinfields/anaconda3/lib/python3.6/site-packages (18.0)


In [6]:
import gender_guesser.detector as gender
d = gender.Detector()

In [7]:
print(d.get_gender('Carla'))

female


## Simple filter and initial counts

In [6]:
# capitalize all fist initials as required by gender_guesser
df = df_ratings
df['reviewerName'] = df['reviewerName'].str.capitalize()
df['first_name'] = df['reviewerName'].str.split(' ', expand=True)[0]

In [7]:
df['first_name'].head()

0    Gsxrgirl
1        Fifa
2      Alicia
3       Danny
4     Chelmic
Name: first_name, dtype: object

In [8]:
df['gender'] = [d.get_gender(str(first_name)) for first_name in df['first_name']]
 

In [9]:
df['gender'][2]

'female'

In [10]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,first_name,gender
0,A1ZQZ8RJS1XVTX,0657745316,Gsxrgirl,"[0, 0]","No sugar, no GMO garbage, no fillers that come...",5.0,Best vanilla I've ever had,1381449600,"10 11, 2013",Gsxrgirl,unknown
1,A31W38VGZAUUM4,0700026444,Fifa lvr,"[1, 1]","This is my absolute, undisputed favorite tea r...",5.0,Terrific Tea!,1354752000,"12 6, 2012",Fifa,unknown
2,A3I0AV0UJX5OH0,1403796890,Alicia b,"[0, 0]",I ordered spongbob slippers and I got John Cen...,1.0,grrrrrrr,1385942400,"12 2, 2013",Alicia,female
3,A3QAAOLIXKV383,1403796890,"Danny k. tilley ""dan tilley""","[0, 0]",The cart is fine and works for the purpose for...,3.0,Storage on Wheels Cart,1307836800,"06 12, 2011",Danny,male
4,AB1A5EGHHVA9M,141278509X,Chelmic,"[1, 1]",This product by Archer Farms is the best drink...,5.0,The best drink mix,1332547200,"03 24, 2012",Chelmic,unknown


In [11]:
df_names = df[['reviewerName', 'first_name', 'gender']]
print(df_names['gender'].value_counts())
df_names.head(500)

unknown          664240
female           320425
male             233629
mostly_female     42778
mostly_male       22126
andy              13958
Name: gender, dtype: int64


Unnamed: 0,reviewerName,first_name,gender
0,Gsxrgirl,Gsxrgirl,unknown
1,Fifa lvr,Fifa,unknown
2,Alicia b,Alicia,female
3,"Danny k. tilley ""dan tilley""",Danny,male
4,Chelmic,Chelmic,unknown
5,Tr-rhodeisland,Tr-rhodeisland,unknown
6,"Trevor l ""god is on the side of the army with...",Trevor,male
7,Ixalmida,Ixalmida,unknown
8,"Mary s ""one800mary""",Mary,mostly_female
9,"Antaeus feldspar ""af""",Antaeus,unknown


In [12]:
# checking data
# df_males = df_names.loc[df_names['gender'] == 'male']
df_andy = df.loc[df['first_name'] == 'Andy']
df_andy.head()
# andy looks like mostly male but will ignore as we have enough high probobility data

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,first_name,gender
1554,A1BSIOE091CUR8,B00008CQRK,Andy,"[0, 2]","Good coffee, nothing special. We frequently or...",4.0,Coffee,1288569600,"11 1, 2010",Andy,male
4006,A3OR8HXP22QPBQ,B0000CNU5D,Andy p. jung,"[4, 5]",...is because a major ingredient that ruins it...,2.0,The reason why it is bad...,1274832000,"05 26, 2010",Andy,male
7124,A2VQHXGJQEXNIB,B0000DID5R,Andy,"[2, 3]","If you're looking for a HOT sauce, this will d...",5.0,This sauce is no joke,1400198400,"05 16, 2014",Andy,male
20012,A1V7F8UYZDJMKK,B00016AQS4,Andy shed,"[1, 1]",The tea has great flavor and just the right am...,5.0,Great product,1367712000,"05 5, 2013",Andy,male
23115,A1C8YXCO6FV6RY,B00018CWSY,Andy s,"[1, 3]","Barry Farm, in my humble opinion, produces the...",5.0,Excellent Product,1354752000,"12 6, 2012",Andy,male


In [112]:
# df_males.head(5000)

### combine mostly male/female into male/female. Filter out unknows and andy

In [13]:
df = df[df['gender'] != 'unknown']
df = df[df['gender'] != 'andy']

print(df['gender'].value_counts())

female           320425
male             233629
mostly_female     42778
mostly_male       22126
Name: gender, dtype: int64


In [14]:
df = df.replace({'gender' : 'mostly_male'}, 'male')
df = df.replace({'gender' : 'mostly_female'}, 'female')
print(df['gender'].value_counts())
print(df.shape)

female    363203
male      255755
Name: gender, dtype: int64
(618958, 11)


In [15]:
# convert male/female to catagories
df['gender_cat'] = df['gender']
df['gender_cat'] = pd.get_dummies(df['gender_cat'])
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,first_name,gender,gender_cat
2,A3I0AV0UJX5OH0,1403796890,Alicia b,"[0, 0]",I ordered spongbob slippers and I got John Cen...,1.0,grrrrrrr,1385942400,"12 2, 2013",Alicia,female,1
3,A3QAAOLIXKV383,1403796890,"Danny k. tilley ""dan tilley""","[0, 0]",The cart is fine and works for the purpose for...,3.0,Storage on Wheels Cart,1307836800,"06 12, 2011",Danny,male,0
6,A3LZA698SQPCXE,1453060464,"Trevor l ""god is on the side of the army with...","[0, 0]",My wife picked some of this up on sale. I usu...,3.0,Tastes a Bit like Cough Syrup,1374019200,"07 17, 2013",Trevor,male,0
8,A2MWO0CISKXJ9,1603112251,"Mary s ""one800mary""","[0, 0]",I had a martini at a local distillery that use...,3.0,Interesting bitters,1391904000,"02 9, 2014",Mary,female,1
10,A11WU9TYDLB1RG,1613170416,Jon stoffel,"[3, 3]",And they're pretty nice! One set looks like ic...,4.0,They're dice!,1392163200,"02 12, 2014",Jon,male,0


## Import / Export data

In [2]:
# df.to_csv('600kClean_lorinData.csv')

df = pd.read_csv('localFolder/600kClean_lorinData.csv')

### Build baseline from RNN with NLP 

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

ModuleNotFoundError: No module named 'tensorflow'

In [4]:
# import from Keras.
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [5]:
# create x y
x = df['reviewText'].astype(str)
y = df['gender_cat']

In [6]:
# create train and test sets
from sklearn.model_selection import train_test_split

print('Loading data...')
x_train_text, x_test_text, y_train, y_test = train_test_split(
      x, y, test_size=0.5, random_state=42)
 
print(len(x_train_text), 'training sequences')
print(len(x_test_text), 'testing sequences')

Loading data...
309479 training sequences
309479 testing sequences


In [7]:
print(y.value_counts())
print(y_train.value_counts())

1    363203
0    255755
Name: gender_cat, dtype: int64
1    181433
0    128046
Name: gender_cat, dtype: int64


In [8]:
# combine text for total corpus for tokenizing later
data_text = x_train_text + x_test_text



In [9]:
x_train_text.head(1)
# x_train_text.shape

366949    These packets were not that expensive and are ...
Name: reviewText, dtype: object

In [10]:
y_train.dtypes

dtype('int64')

In [11]:
x_train_text.dtypes

dtype('O')

In [153]:
# # text is reading as object. need to convert to strings for analysis.
# df['reviewText'] = df['reviewText'].astype(str)
# x_train_text[1]

## Tokenizer


In [12]:
num_words = 15000

In [13]:
tokenizer = Tokenizer(num_words=num_words)

In [14]:
%time
tokenizer.fit_on_texts(x)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


In [15]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'and': 3,
 'a': 4,
 'it': 5,
 'to': 6,
 'of': 7,
 'this': 8,
 'is': 9,
 'in': 10,
 'for': 11,
 'my': 12,
 'that': 13,
 'but': 14,
 'was': 15,
 'with': 16,
 'not': 17,
 'you': 18,
 'have': 19,
 'are': 20,
 'as': 21,
 'on': 22,
 'they': 23,
 'like': 24,
 'so': 25,
 'good': 26,
 'these': 27,
 'great': 28,
 'taste': 29,
 'them': 30,
 'very': 31,
 'coffee': 32,
 'be': 33,
 'just': 34,
 'product': 35,
 'at': 36,
 'or': 37,
 'flavor': 38,
 'one': 39,
 'if': 40,
 'love': 41,
 'all': 42,
 'tea': 43,
 'from': 44,
 'will': 45,
 "it's": 46,
 'more': 47,
 'when': 48,
 'me': 49,
 'can': 50,
 'would': 51,
 'had': 52,
 'we': 53,
 'has': 54,
 'use': 55,
 'than': 56,
 'out': 57,
 'get': 58,
 'really': 59,
 'some': 60,
 'no': 61,
 'other': 62,
 'were': 63,
 'much': 64,
 'too': 65,
 'buy': 66,
 'only': 67,
 'time': 68,
 'up': 69,
 'about': 70,
 'best': 71,
 'price': 72,
 'an': 73,
 'find': 74,
 "don't": 75,
 'little': 76,
 'make': 77,
 'what': 78,
 'your': 79,
 'also': 80,
 'am': 81,


In [16]:
x_train_tokens  = tokenizer.texts_to_sequences(x_train_text)

In [17]:
x_train_text.head(1)

366949    These packets were not that expensive and are ...
Name: reviewText, dtype: object

In [18]:
np.array(x_train_tokens[1])

array([2341,  536,  652,  699,   32,    9,  215,   18,   51,   17,   98,
        146,   13,    5,    9,  536,  250,    1,  301,   45,  188,  207,
         86,   10,    1,  734])

In [19]:
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

## padding and truncating data
The Recurrent Neural Network can take sequences of arbitrary length as input, but in order to use a whole batch of data, the sequences need to have the same length. There are two ways of achieving this: (A) Either we ensure that all sequences in the entire data-set have the same length, or (B) we write a custom data-generator that ensures the sequences have the same length within each batch.



In [20]:
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

In [21]:

print(np.mean(num_tokens))
print(np.max(num_tokens))

56.16136797650245
3364


In [22]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

173

In [23]:
# this covers around 96% of the data
np.sum((num_tokens  < max_tokens) / len(num_tokens))

0.9604528901799477

 When padding or truncating the sequences that have a different length, we need to determine if we want to do this padding or truncating 'pre' or 'post'. If a sequence is truncated, it means that a part of the sequence is simply thrown away. If a sequence is padded, it means that zeros are added to the sequence.

So the choice of 'pre' or 'post' can be important because it determines whether we throw away the first or last part of a sequence when truncating, and it determines whether we add zeros to the beginning or end of the sequence when padding. This may confuse the Recurrent Neural Network.

In [24]:
pad = 'pre'

In [25]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [26]:
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [27]:
# num_classes = np.max(y_train) + 1
# print(num_classes, 'classes')

from keras.utils import to_categorical  # Makes "one-hot" encoding from label

y_train_hot = to_categorical(y_train, num_classes=2)
y_test_hot = to_categorical(y_test, num_classes=2)
print(y_train_hot[4])

[0. 1.]


Using TensorFlow backend.


In [28]:
# now we have transformed the training and test set into a big matrix of tokens
print(x_train_pad.shape)
print(x_test_pad.shape)

(309479, 173)
(309479, 173)


In [29]:
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 2341,  536,  652,  6

## Bring in clean data

In [2]:
import numpy as np

y_train = np.load('localFolder/y_train.npy')
y_test = np.load('localFolder/y_test.npy')
x_train = np.load('localFolder/x_train.npy')
x_test = np.load('localFolder/x_test.npy')



#  method for exporting array and importing back
# np.save('y_train', y_train)
# np.save('y_test', y_test)
# np.save('x_train', x_train_pad)
# np.save('x_test', x_test_pad)


## create small test set

In [3]:
y_train_small = y_train[0:40000] 
y_test_small = y_test[0:40000] 
x_train_small = x_train[0:40000] 
x_test_small = x_test[0:40000] 
print(y_train_small[10])
# print(y_test_small[1])
# print(x_train_small[1])
# print(x_test_small[1])
print(x_test_small.shape[1])
x_test_small.shape

1
173


(40000, 173)

## Create the Recurrent Neural Network


In [4]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters


In [5]:
model = Sequential()
embedding_size = 100
max_tokens = x_test_small.shape[1]
num_words = 15000

In [6]:
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [7]:
# model.add(GRU(units=64, return_sequences=True))
# model.add(GRU(units=32, return_sequences=True))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))



In [8]:
optimizer  = 'Adam'

In [9]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 173, 100)          1500000   
_________________________________________________________________
gru (GRU)                    (None, 173, 16)           5616      
_________________________________________________________________
gru_1 (GRU)                  (None, 173, 8)            600       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 1,506,377
Trainable params: 1,506,377
Non-trainable params: 0
_________________________________________________________________


## Train the RNN

In [12]:
# train a small one first
%time
model.fit(x_train, y_train,
          validation_split=0.04, epochs=3, batch_size=32)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 9.3 µs
Train on 297099 samples, validate on 12380 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0xb3b8a0208>

## test model


In [13]:
%time
result =  model.evaluate(x_test, y_test)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


In [14]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 67.49%
