In [46]:
import json
from collections import defaultdict
import random
import string
import numpy as np
import operator
import math
import re

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
data =[]

with open('renttherunway_final_data.json', 'r') as file:
    for line in file:
        line_data = json.loads(line)
        data.append(line_data)


In [21]:
# drop rows that have incomplete columns
clean_data = []
for d in data:
    if len(d) == 15:
        clean_data.append(d)

In [22]:
print(len(clean_data))
print(clean_data[0])

146431
{'fit': 'fit', 'user_id': '441940', 'bust size': '34a', 'item_id': '148690', 'weight': '140lbs', 'rating': '10', 'rented for': 'wedding', 'review_text': 'I saw another reviewer with a similar figure to mine - I have a small waist and flat chest but curvier hips/butt and often find dresses to be too loose in the top and too tight at the bottom. Well thank you to all those who review because the 8 fit well and the 6 fit like a glove. I could have worn either but chose the 6 and just wore little inserts instead of a bra. Loved this fun LBD. ', 'body type': 'pear', 'review_summary': 'Perfect fit! I felt amazing in this dress. ', 'category': 'dress', 'height': '5\' 9"', 'size': 12, 'age': '29', 'review_date': 'January 20, 2015'}


In [23]:
# train, val, test split
random.seed(7)
random.shuffle(clean_data)

train_data = clean_data[:int(0.8*len(clean_data))]
valid_data = clean_data[int(0.8*len(clean_data)):int(0.9*len(clean_data))]
test_data = clean_data[int(0.9*len(clean_data)):]

In [26]:
def feature(d):
    f = [1]  #offset

    size = d['size']
    f.append(size)
    
    height_transform = re.findall(r'\d+', d['height'])
    height = int(height_transform[0]) * 12 + int(height_transform[1])
    f.append(height)

    weight = int(d['weight'][:-3])
    f.append(weight)
    
    bust = int(d['bust size'][:2])
    f.append(bust)
    cup = ord(d['bust size'][2])
    f.append(cup)

    age = int(d['age'])
    f.append(age)
    
    
    
    # one-hot encoding for rented for and body type
    rented_for_dict = {'date':0, 'everyday':1, 'formal affair':2, 'other':3, 'party':4, 'party: cocktail':5, 'vacation':6, 'wedding':7, 'work':8}
    f.append(1.0 * (d['rented for'] == 'date'))
    f.append(1.0 * (d['rented for'] == 'everyday'))
    f.append(1.0 * (d['rented for'] == 'formal affair'))
    f.append(1.0 * (d['rented for'] == 'other'))
    f.append(1.0 * (d['rented for'] == 'party'))
    f.append(1.0 * (d['rented for'] == 'party: cocktail'))
    f.append(1.0 * (d['rented for'] == 'vacation'))
    f.append(1.0 * (d['rented for'] == 'wedding'))
    f.append(1.0 * (d['rented for'] == 'work'))
    
    body_type_dict = {'apple':0, 'athletic':1, 'full bust':2, 'hourglass':3, 'pear':4, 'petite':5, 'straight & narrow':6}
    f.append(1.0 * (d["body type"] == 'apple'))
    f.append(1.0 * (d["body type"] == 'athletic'))
    f.append(1.0 * (d["body type"] == 'full bust'))
    f.append(1.0 * (d["body type"] == 'hourglass'))
    f.append(1.0 * (d["body type"] == 'pear'))
    f.append(1.0 * (d["body type"] == 'petite'))
    f.append(1.0 * (d["body type"] == 'straight & narrow'))
    

    ### add more features?
    ### processed review_text prediction results?
    ### e.g. f.append(sentiment_model.predict(d["review_text"]))

    
    # label: change 3 categories to 
    if d['fit'] == 'fit':
        label = 1
    else:
        label = 0      # not fit

    return f, label

    

In [27]:
train_X = []
train_Y = []
for d in train_data:
    x, y = feature(d)
    train_X.append(x)
    train_Y.append(y)

valid_X = []
valid_Y = []
for d in valid_data:
    x, y = feature(d)
    valid_X.append(x)
    valid_Y.append(y)


test_X = []
test_Y = []
for d in test_data:
    x, y = feature(d)
    test_X.append(x)
    test_Y.append(y)


In [33]:
def calculate_accuracy(y, pred):
    return sum([a==b for a, b in zip(y, pred)]) / len(y)

In [None]:
### logistic regression

In [31]:
lr_mod = LogisticRegression(C=1)
lr_mod.fit(train_X, train_Y)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
lr_predictions = lr_mod.predict(test_X)
lr_accuracy = calculate_accuracy(test_Y, lr_predictions)


In [35]:
lr_accuracy

0.7379814258399344

In [39]:
lr_mod.coef_[0]

array([-1.45257755e-04, -2.79333135e-02,  2.13074835e-03,  6.40706699e-03,
        2.22749188e-02, -6.74038629e-03,  5.30430154e-03, -4.60227278e-02,
       -1.51773507e-01,  2.24701150e-01, -4.75935813e-03, -5.24035712e-02,
        1.01261420e-04, -2.55906835e-02,  8.89504166e-02, -3.33482377e-02,
       -5.95857380e-03,  2.65305562e-02, -4.38648734e-02,  4.27063498e-02,
       -4.64063228e-02,  1.98587816e-02,  6.98882464e-03])

In [None]:
### random forest

In [42]:
rf_model = RandomForestClassifier()
rf_model.fit(train_X, train_Y)

In [43]:
rf_predictions = rf_model.predict(test_X)
rf_accuracy = calculate_accuracy(test_Y, rf_predictions)

In [44]:
rf_accuracy

0.7026085768915596

In [None]:
### use grid search to improve random forest classifier

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500, 800, 1600],
    'random_state': [6, 42, 100, 277, 832, 2022]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on your training data
grid_search.fit(train_X, train_Y)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)