In [1]:
import json
import numpy as np
import pandas as pd
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
import nltk
import string
from nltk.stem.porter import *
from nltk.corpus import stopwords

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meiyihe/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
len(set(stopwords.words('english')))

179

In [11]:
file = "../data/renttherunway_final_data.json"
df = pd.read_json(file, lines=True)

In [13]:
# strip punctuations
review = []
table = str.maketrans({key: None for key in string.punctuation})
for idx, row in df.iterrows():
    tmp = row['review_summary'] + " " +row['review_text']
    tmp = tmp.translate(table)
    review.append(tmp)

In [14]:
review[:5]

['So many compliments An adorable romper Belt and zipper were a little hard to navigate in a full day of wearbathroom use but thats to be expected Wish it had pockets but other than that absolutely perfect I got a million compliments',
 'I felt so glamourous I rented this dress for a photo shoot The theme was Hollywood Glam and Big Beautiful Hats The dress was very comfortable and easy to move around in It is definitely on my list to rent again for another formal event ',
 'It was a great time to celebrate the almost completion of my first year of law school This hugged in all the right places It was a perfect dress for my event and I received so many compliments on it Not to mention customer service was great getting this to me in less than 24 hours',
 'Dress arrived on time and in perfect condition  I rented this for my companys black tie awards banquet  I liked that this dress was short but was a little fancier with the sequins I generally dont care for long dresses  I would describ

In [19]:
# let's try text mining + prediction here

def getTopWords(data, num):
    stemmer = PorterStemmer()
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in data:
        r = ''.join([c.lower() for c in data if not c in punctuation])
        for w in r.split():
            w = stemmer.stem(w)
            if w not in stopwords.words("english"):
                wordCount[w] += 1
    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort(reverse=True)
    print(counts[:10])
    wordList = [x[1] for x in counts[:num]]
        
    return wordList

In [20]:
getTopWords(review[:100], 100)[:10]

[(16600, 'dress'), (16400, 'wa'), (11200, 'thi'), (7400, 'fit'), (5000, 'size'), (4500, 'would'), (4200, 'love'), (3900, 'great'), (3700, 'perfect'), (3600, 'veri')]


['dress',
 'wa',
 'thi',
 'fit',
 'size',
 'would',
 'love',
 'great',
 'perfect',
 'veri']

In [3]:
df.head()

Unnamed: 0,age,body type,bust size,category,fit,height,item_id,rating,rented for,review_date,review_summary,review_text,size,user_id,weight
0,28.0,hourglass,34d,romper,fit,"5' 8""",2260466,10.0,vacation,"April 20, 2016",So many compliments!,An adorable romper! Belt and zipper were a lit...,14,420272,137lbs
1,36.0,straight & narrow,34b,gown,fit,"5' 6""",153475,10.0,other,"June 18, 2013",I felt so glamourous!!!,I rented this dress for a photo shoot. The the...,12,273551,132lbs
2,116.0,,,sheath,fit,"5' 4""",1063761,10.0,party,"December 14, 2015",It was a great time to celebrate the (almost) ...,This hugged in all the right places! It was a ...,4,360448,
3,34.0,pear,34c,dress,fit,"5' 5""",126335,8.0,formal affair,"February 12, 2014",Dress arrived on time and in perfect condition.,I rented this for my company's black tie award...,8,909926,135lbs
4,27.0,athletic,34b,gown,fit,"5' 9""",616682,10.0,wedding,"September 26, 2016",Was in love with this dress !!!,I have always been petite in my upper body and...,12,151944,145lbs


In [16]:
# Pre-process Data 

# print original data shape
print(df.shape)
# drop NANs 
df = df.dropna()
print(df.shape)
# map 'body type' to numerical value
labels = df['body type'].astype('category').cat.categories.tolist()
replace_map = {'body type' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
print('--- map body type to numerical value ---')
print(replace_map)
df.replace(replace_map, inplace=True)

# remove the string after 'bust size'
df['bust size'] = df['bust size'].str.extract('(\d+)').astype(int)

# parse height to usable numerical format
def parse_height(ht):
    ht_ = ht.split("' ")
    ft_ = float(ht_[0])
    in_ = float(ht_[1].replace("\"",""))
    return (12*ft_) + in_
df['height'] = df['height'].apply(lambda x:parse_height(x))

# map 'category' to numerical value
# ------ I think this part needs more cleansing -----
labels = df['category'].astype('category').cat.categories.tolist()
replace_map = {'category' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}

print('--- map category to numerical value ---')
print(replace_map)
df.replace(replace_map, inplace=True)


# map 'fit' to numerical value
labels = df['fit'].astype('category').cat.categories.tolist()
replace_map = {'fit' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
print('--- map fit to numerical value ---')
print(replace_map)
df.replace(replace_map, inplace=True)

# remove strings after ':' in 'party: cocktail'
df['rented for'] = df['rented for'].str.split(':').str[0]

# remove lbs after 'weight'
df['weight'] = df['weight'].str.extract('(\d+)').astype(int)


# map 'rented for' to numerical value
labels = df['rented for'].astype('category').cat.categories.tolist()
replace_map = {'rented for' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
print('--- map rented for to numerical value ---')
print(replace_map)
df.replace(replace_map, inplace=True)

(192544, 15)
(146381, 15)
--- map body type to numerical value ---
{'body type': {'apple': 1, 'athletic': 2, 'full bust': 3, 'hourglass': 4, 'pear': 5, 'petite': 6, 'straight & narrow': 7}}
--- map category to numerical value ---
{'category': {'ballgown': 1, 'blazer': 2, 'blouse': 3, 'blouson': 4, 'bomber': 5, 'buttondown': 6, 'caftan': 7, 'cami': 8, 'cape': 9, 'cardigan': 10, 'coat': 11, 'combo': 12, 'crewneck': 13, 'culotte': 14, 'culottes': 15, 'down': 16, 'dress': 17, 'duster': 18, 'for': 19, 'frock': 20, 'gown': 21, 'henley': 22, 'hoodie': 23, 'jacket': 24, 'jeans': 25, 'jogger': 26, 'jumpsuit': 27, 'kaftan': 28, 'kimono': 29, 'knit': 30, 'legging': 31, 'leggings': 32, 'maxi': 33, 'midi': 34, 'mini': 35, 'overalls': 36, 'overcoat': 37, 'pant': 38, 'pants': 39, 'parka': 40, 'peacoat': 41, 'poncho': 42, 'print': 43, 'pullover': 44, 'romper': 45, 'sheath': 46, 'shift': 47, 'shirt': 48, 'shirtdress': 49, 'skirt': 50, 'skirts': 51, 'skort': 52, 'suit': 53, 'sweater': 54, 'sweatershirt'

In [17]:
df.head()

Unnamed: 0,age,body type,bust size,category,fit,height,item_id,rating,rented for,review_date,review_summary,review_text,size,user_id,weight
0,28.0,4,34,45,1,68.0,2260466,10.0,6,"April 20, 2016",So many compliments!,An adorable romper! Belt and zipper were a lit...,14,420272,137
1,36.0,7,34,21,1,66.0,153475,10.0,4,"June 18, 2013",I felt so glamourous!!!,I rented this dress for a photo shoot. The the...,12,273551,132
3,34.0,5,34,17,1,65.0,126335,8.0,3,"February 12, 2014",Dress arrived on time and in perfect condition.,I rented this for my company's black tie award...,8,909926,135
4,27.0,2,34,21,1,69.0,616682,10.0,7,"September 26, 2016",Was in love with this dress !!!,I have always been petite in my upper body and...,12,151944,145
5,45.0,2,32,17,1,68.0,364092,8.0,1,"April 30, 2016",Traditional with a touch a sass,Didn't actually wear it. It fit perfectly. The...,8,734848,138


In [18]:
df.dtypes

age               float64
body type           int64
bust size           int64
category            int64
fit                 int64
height            float64
item_id             int64
rating            float64
rented for          int64
review_date        object
review_summary     object
review_text        object
size                int64
user_id             int64
weight              int64
dtype: object

In [19]:
df_train, df_test = train_test_split(df, test_size=0.33, random_state=42)

In [20]:
len(df_train)

98075

In [21]:
label = 'rented for'

X_train = df_train.drop([label, 'review_date','review_summary', 'review_text'],axis=1)
y_train = df_train[label]
X_test = df_test.drop([label, 'review_date','review_summary', 'review_text'],axis=1)
y_test = df_test[label]

In [22]:
X_train.head()

Unnamed: 0,age,body type,bust size,category,fit,height,item_id,rating,size,user_id,weight
114785,37.0,4,34,46,1,67.0,1851598,10.0,8,516231,135
126377,32.0,4,32,17,1,64.0,174391,10.0,1,254634,130
117721,37.0,2,34,17,1,65.0,1316534,8.0,8,995881,130
167358,35.0,2,34,17,1,66.0,1055399,10.0,8,87081,130
97435,28.0,4,36,21,1,64.0,242782,8.0,24,838084,155


In [12]:
# import machine learning models
from sklearn.ensemble import RandomForestClassifier

In [25]:
clf = RandomForestClassifier(criterion = 'entropy', random_state = 42)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
pred_test = clf.predict(X_test)
accuracy_score(y_test, pred_test)

# probably needs text mining instead of using other features

0.40336604148552974