In [10]:
import json
from collections import defaultdict
import random
import string
import numpy as np
import operator

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize


In [11]:
data =[]

with open('renttherunway_final_data.json', 'r') as file:
    for line in file:
        line_data = json.loads(line)
        data.append(line_data)


In [12]:
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [13]:
# only use 1 category: dress for fit/notfit prediction

In [16]:
dress_data = [d for d in data if d["category"] == "dress" ]

In [17]:
len(dress_data)

92884

In [18]:
# train, val, test split
random.seed(6)
random.shuffle(dress_data)

train_data = dress_data[:int(0.8*len(dress_data))]
valid_data = dress_data[int(0.8*len(dress_data)):int(0.9*len(dress_data))]
test_data = dress_data[int(0.9*len(dress_data)):]


In [19]:
len(train_data), len(valid_data), len(test_data)

(74307, 9288, 9289)

In [20]:
item_data = {}
item_index = {}
user_index = {}
user_data = {}
u_index = 0
i_index = 0
for r in train_data:
    if r['item_id'] + '|' + str(r['size']) not in item_data:
        item_data[r['item_id'] + '|' + str(r['size'])] = [r]
        item_index[r['item_id'] + '|' + str(r['size'])] = i_index
        i_index += 1
    else:
        item_data[r['item_id'] + '|' + str(r['size'])].append(r)
        
    if r['user_id'] not in user_data:
        user_data[r['user_id']] = [r]
        user_index[r['user_id']] = u_index
        u_index += 1
    else:
        user_data[r['user_id']].append(r)

In [21]:
len(user_data), len(user_index), len(item_data), len(item_index)

(49329, 49329, 12678, 12678)

In [37]:
def calc_accuracy():
    correct_predictions = 0
    total_predictions = 0

    for r in test_data:
        try:
            user_idx = user_index[r['user_id']]
            item_key = r['item_id'] + '|' + str(r['size'])
            if item_key in item_index:
                item_idx = item_index[item_key]
                s = true_size_cust[user_idx]
                t = true_size_item[item_idx]

                prediction = predict_fit(s, t)
                if prediction == r['fit']:
                    correct_predictions += 1

                total_predictions += 1
        except KeyError:
            pass  # Handle missing data or keys

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print('Accuracy:', accuracy, " Total Predictions: ", total_predictions)

In [25]:
def predict_fit(user_size, item_size):
    if f(user_size, item_size) < b_1:
        return 'small'
    elif f(user_size, item_size) >= b_1 and f(user_size, item_size) <= b_2:
        return 'fit'
    else:
        return 'large'

In [26]:
def f(s,t):
    return w*(s-t)

In [27]:
def cal_loss_user(user, cust_size):
    loss = 0
    for r in user_data[user]:
        if 'small' in r['fit']:
            loss += max(0, 1 - f(cust_size, true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]) + b_2)
        elif 'fit' in r['fit']:
            loss += max(0, 1 + f(cust_size, true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]) - b_2)
            loss += max(0, 1 - f(cust_size, true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]) + b_1)
        elif 'large' in r['fit']:
            loss += max(0, 1 + f(cust_size, true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]) - b_1)
    return loss
            
def cal_loss_item(item, product_size):
    loss = 0
    for r in item_data[item]:
        if 'small' in r['fit']:
            loss += max(0, 1 - f(true_size_cust[user_index[r['user_id']]], product_size) + b_2)
        elif 'fit' in r['fit']:
            loss += max(0, 1 + f(true_size_cust[user_index[r['user_id']]], product_size) - b_2)
            loss += max(0, 1 - f(true_size_cust[user_index[r['user_id']]], product_size) + b_1)
        elif 'large' in r['fit']:
            loss += max(0, 1 + f(true_size_cust[user_index[r['user_id']]], product_size) - b_1)
    return loss

In [28]:
def total_loss():
    loss = 0
    for item in item_data:
        for r in item_data[item]:
            product_size = true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]
            if 'small' in r['fit']:
                loss += max(0, 1 - f(true_size_cust[user_index[r['user_id']]], product_size) + b_2)
            elif 'fit' in r['fit']:
                loss += max(0, 1 + f(true_size_cust[user_index[r['user_id']]], product_size) - b_2)
                loss += max(0, 1 - f(true_size_cust[user_index[r['user_id']]], product_size) + b_1)
            elif 'large' in r['fit']:
                loss += max(0, 1 + f(true_size_cust[user_index[r['user_id']]], product_size) - b_1)
    return loss

In [29]:
### training

In [38]:
true_size_item = np.zeros(len(item_data))
true_size_cust = np.zeros(len(user_data))
w = 1; b_1 = -1; b_2 = 1; lamda = 2

for item in item_data:
    true_size_item[item_index[item]] = int(item.split('|')[1])

In [39]:
num_iter = 300
lr = 0.000005

for iterr in range(0,num_iter):
    
    ## Phase 1
    for user in user_data:
        candidate_sizes = []
        for r in user_data[user]:
            if 'small' in r['fit']:
                candidate_sizes.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]] + ((b_2+1)/w))
            elif 'fit' in r['fit']:
                candidate_sizes.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]] + ((b_1+1)/w))
                candidate_sizes.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]] + ((b_2-1)/w))
            elif 'large' in r['fit']:
                candidate_sizes.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]] + ((b_1-1)/w))

        flag = 0
        candidate_sizes = list(set(candidate_sizes))
        candidate_sizes = sorted(candidate_sizes)

        if len(candidate_sizes) == 1:
            true_size_cust[user_index[user]] = candidate_sizes[0]
        else:
            for s in range(1, len(candidate_sizes)):
                slope = (cal_loss_user(user, candidate_sizes[s]) - cal_loss_user(user, candidate_sizes[s-1]))/(candidate_sizes[s] - candidate_sizes[s-1])
                if slope>=0:
                    flag=1
                    true_size_cust[user_index[user]] = candidate_sizes[s-1]
                    break

            if flag==0:
                true_size_cust[user_index[user]] = candidate_sizes[-1]

    ## Phase 2
    for item in item_data:
        candidate_sizes = []
        for r in item_data[item]:
            if 'small' in r['fit']:
                candidate_sizes.append(true_size_cust[user_index[r['user_id']]] - ((b_2+1)/w))
            elif 'fit' in r['fit']:
                candidate_sizes.append(true_size_cust[user_index[r['user_id']]] - ((b_1+1)/w))
                candidate_sizes.append(true_size_cust[user_index[r['user_id']]] - ((b_2-1)/w))
            elif 'large' in r['fit']:
                candidate_sizes.append(true_size_cust[user_index[r['user_id']]] - ((b_1-1)/w))

        flag = 0
        candidate_sizes = list(set(candidate_sizes))
        candidate_sizes = sorted(candidate_sizes)
        if len(candidate_sizes) == 1:
            true_size_item[item_index[item]] = candidate_sizes[0]
        else:
            for s in range(1, len(candidate_sizes)):
                slope = (cal_loss_item(item, candidate_sizes[s]) - cal_loss_item(item, candidate_sizes[s-1]))/(candidate_sizes[s] - candidate_sizes[s-1])
                if slope>=0:
                    flag=1
                    true_size_item[item_index[item]] = candidate_sizes[s-1]
                    break

            if flag==0:
                true_size_item[item_index[item]] = candidate_sizes[-1]

    ## Phase 3
    learning_rate = lr/np.sqrt(iterr+1)
    grad_w = 0
    grad_b1 = 0
    grad_b2 = 0
    for r in train_data:
        s = true_size_cust[user_index[r['user_id']]]
        t = true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]]

        if 'small' in r['fit']:
            A = 1 - f(s, t) + b_2
            if A>0:
                grad_w += -1*(s - t)
                grad_b2 += 1
        elif 'fit' in r['fit']:
            B = 1 + f(s, t) - b_2
            C = 1 - f(s, t) + b_1
            if B>0:
                grad_w += (s - t)
                grad_b2 += -1
            if C>0:
                grad_w += -1*(s - t)
                grad_b1 += 1
        elif 'large' in r['fit']:
            D = 1 + f(s, t) - b_1
            if D>0:
                grad_w += (s - t)
                grad_b1 += -1

    w -= learning_rate*(grad_w + 2*lamda*w)
    b_1 -= learning_rate*(grad_b1 + 2*lamda*b_1)
    b_2 -= learning_rate*(grad_b2 + 2*lamda*b_2)
    if iterr%5 == 0:
        print(iterr, total_loss())
        calc_accuracy()

0 50663.96705474177
Accuracy: 0.40741588464179446  Total Predictions:  4369
5 49527.74946358187
Accuracy: 0.34721904325932706  Total Predictions:  4369
10 47268.631635746475
Accuracy: 0.34561684596017395  Total Predictions:  4369
15 46713.53486077828
Accuracy: 0.3394369420920119  Total Predictions:  4369
20 47502.421197272306
Accuracy: 0.3378347447928588  Total Predictions:  4369


KeyboardInterrupt: 

In [97]:
train_features = []; train_labels = []
for r in train_data:
    fe = []
    fe.append(true_size_cust[user_index[r['user_id']]])
    fe.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]])
    train_features.append(fe)

    if 'small' in r['fit']:
        train_labels.append(0)
    elif 'fit' in r['fit']:
        train_labels.append(1)
    elif 'large' in r['fit']:
        train_labels.append(2)

c = 1
clf_1LV = LogisticRegression(fit_intercept=True, multi_class='ovr', C=c)
clf_1LV.fit(train_features, train_labels)

test_features = []; test_labels = []; test_labels_auc = []
for r in valid_data:
    fe = []
    try:
        u = user_index[r['user_id']]
        fe.append(true_size_cust[u])
    except KeyError:
        fe.append(np.mean(true_size_cust))
    try:
        fe.append(true_size_item[item_index[r['item_id'] + '|' + str(r['size'])]])
    except KeyError:
        fe.append(np.mean(true_size_item))

    test_features.append(fe)
    label = [0, 0, 0]
    if 'small' in r['fit']:
        test_labels.append(0)
        label[0] = 1
    elif 'fit' in r['fit']:
        test_labels.append(1)
        label[1] = 1
    elif 'large' in r['fit']:
        test_labels.append(2)
        label[2] = 1
    test_labels_auc.append(label)

test_labels_auc = np.array(test_labels_auc)

predictions = clf_1LV.predict(test_features)
correct_predictions = sum(predictions == test_labels)
total_predictions = len(test_labels)
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print('Accuracy:', accuracy)

Accuracy: 0.48320413436692505
