In [171]:
# Importing packages
import json
import numpy as np
import pandas as pd
import os
# Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint
# Convert Data to Array
from tqdm import tqdm
from keras_bert import Tokenizer
# Build Custom Model
from tensorflow.python import keras
from keras_bert import AdamWarmup, calc_train_steps

In [4]:
# Reading in JSON file and putting into a list
runway = []
for line in open("renttherunway_final_data.json", 'r'):
    runway.append(json.loads(line))

In [17]:
# Looking at the structure of the data
# We will train using the review text as feature and rating as label
runway[0]

{'age': '28',
 'body type': 'hourglass',
 'bust size': '34d',
 'category': 'romper',
 'fit': 'fit',
 'height': '5\' 8"',
 'item_id': '2260466',
 'rating': '10',
 'rented for': 'vacation',
 'review_date': 'April 20, 2016',
 'review_summary': 'So many compliments!',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'size': 14,
 'user_id': '420272',
 'weight': '137lbs'}

In [18]:
# Looking at the length of the dataset
len(runway)

192544

In [64]:
# Isolating labels
rating = []
for i in range(192544):
    rating.append(runway[i]['rating'])

In [23]:
# Isolating features
review_text = []
for j in range(192544):
    review_text.append(runway[j]['review_text'])

In [113]:
# Defining a preprocessing function for the review text
def split_to_tokens(text):
    '''Function takes in text and preprocesses it for use in Bert'''
    for x in string.punctuation:
        text = [item.replace(x, '') for item in text]
    for y in string.digits:
        text = [item.replace(y, '8') for item in text]
    text = [item.lower() for item in text]
    return(text)

In [114]:
# Preprocessing the review text
review_processed = split_to_tokens(review_text)
review_processed[0:4]

['an adorable romper belt and zipper were a little hard to navigate in a full day of wearbathroom use but thats to be expected wish it had pockets but other than that absolutely perfect i got a million compliments',
 'i rented this dress for a photo shoot the theme was hollywood glam and big beautiful hats the dress was very comfortable and easy to move around in it is definitely on my list to rent again for another formal event ',
 'this hugged in all the right places it was a perfect dress for my event and i received so many compliments on it not to mention customer service was great getting this to me in less than 88 hours',
 'i rented this for my companys black tie awards banquet  i liked that this dress was short but was a little fancier with the sequins i generally dont care for long dresses  i would describe the color as more rose gold than yellow gold  i have blonde hair and fair skin and the color was very flattering  this is a very forgiving dress its form fitting without mak

In [46]:
# Looking at the different categories of ratings available
set(rating_look)

{'10', '2', '4', '6', '8', None}

In [66]:
# Put in dataframe and one-hot encode
review_df = pd.DataFrame()
review_df['Text'] = review_text
review_df['Rating'] = rating

one_hot = pd.get_dummies(review_df["Rating"])
review_one_hot = pd.DataFrame()
review_one_hot = pd.concat([review_df['Text'], one_hot], axis=1)
review_one_hot.head()

Unnamed: 0,Text,10,2,4,6,8
0,An adorable romper! Belt and zipper were a lit...,1,0,0,0,0
1,I rented this dress for a photo shoot. The the...,1,0,0,0,0
2,This hugged in all the right places! It was a ...,1,0,0,0,0
3,I rented this for my company's black tie award...,0,0,0,0,1
4,I have always been petite in my upper body and...,1,0,0,0,0


In [55]:
# Defining hyperparameters
SEQ_LEN = 128
BATCH_SIZE = 128
EPOCHS = 5
LR = 1e-4

In [60]:
# These files must be put in current directory for paths to work
# Need to install wget via homebrew before can download pretrained model
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# TF_KERAS must be added to environment variables in order to use TPU
os.environ['TF_KERAS'] = '1'

In [62]:
# Load Basic Model
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
        
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [139]:
len(review_processed)

192544

In [147]:
# Convert data to Array
# Tokenizer class is used for splitting texts and generating indices
# Question: do you need to tokenize and encode, or just encode
tokenizer = Tokenizer(token_dict)

token_list = []
for i in tqdm(range(0,len(review_processed))):
    tokens = tokenizer.tokenize(review_processed[i])
    token_list.append(tokens)

100%|██████████| 192544/192544 [02:03<00:00, 1557.71it/s]


In [149]:
# token_list contains the tokenized version of processed user review text
len(token_list)

192544

In [159]:
token_indices, review_indices = [], []
for j in tqdm(range(0,len(token_list))):
    for k in range(0,len(token_list[j])):
        ids, segments = tokenizer.encode(token_list[j][k], second=None, max_len=SEQ_LEN)
        token_indices.append(ids)
        review_indices.append(segments)

100%|██████████| 192544/192544 [20:51<00:00, 153.79it/s]  


In [161]:
# the shape and data within these lists seem wrong -- John (thoughts)?
len(token_indices)
len(review_indices)

12295617

In [173]:
# Build custom model
# Had to hard code the first argument because it is a list not array (review_processed)
# Again, unsure of whether it is the right shape to put here or what right shape should be
inputs = model.inputs[:2]
# NSP > Next Sentence Prediction Dense Layer
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=5, activation='softmax')(dense)
decay_steps, warmup_steps = calc_train_steps(
    192544,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,)

model = keras.models.Model(inputs, outputs)
model.compile(
    AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'])