In [134]:
import json
import numpy as np
import pandas as pd
import random
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [110]:
file = "/Users/shutinggu/Desktop/1001-project/renttherunway_final_data.json"
df = pd.read_json(file,lines=True)


In [111]:
# transfer bust size into bust size and cups
df=df.dropna()
df['cups'] = df['bust size'].str.extract(r'([a-z])')
labels = df['cups'].astype('category').cat.categories.tolist()
replace_map = {'cups' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
print('--- map cups to numerical value ---')
print(replace_map)
df.replace(replace_map, inplace=True)
df['bust size'] = df['bust size'].str.extract('(\d+)').astype(int)
# remove lbs after 'weight'
df['weight'] = df['weight'].str.extract('(\d+)').astype(int)
# parse height to usable numerical format
def parse_height(ht):
    ht_ = ht.split("' ")
    ft_ = float(ht_[0])
    in_ = float(ht_[1].replace("\"",""))
    return (12*ft_) + in_
df['height'] = df['height'].apply(lambda x:parse_height(x))
# change 'party: cocktail' in 'rented for' to 'others'
df.loc[df['rented for'] == 'party: cocktail', df.columns == 'rented for'] = 'other'

--- map cups to numerical value ---
{'cups': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9}}


In [112]:
df=df.drop(columns=['user_id', 'item_id','review_date','review_summary','review_text','fit'])

In [113]:
df['category'].unique()

array(['romper', 'gown', 'dress', 'sheath', 'leggings', 'sweater',
       'jacket', 'shirtdress', 'jumpsuit', 'shift', 'top', 'shirt',
       'mini', 'skirt', 'maxi', 'pants', 'suit', 'coat', 'blouse',
       'trench', 'cape', 'bomber', 'blazer', 'vest', 'duster', 'ballgown',
       'tank', 'poncho', 'frock', 'tunic', 'cardigan', 'down', 'culottes',
       'midi', 'legging', 'print', 'pant', 'knit', 'culotte',
       'sweatshirt', 'peacoat', 'trouser', 'kaftan', 'overalls', 'jogger',
       'tee', 'combo', 'henley', 'blouson', 'pullover', 'turtleneck',
       'trousers', 'overcoat', 'hoodie', 't-shirt', 'caftan', 'tight',
       'kimono', 'cami', 'for', 'crewneck', 'skirts', 'parka',
       'buttondown', 'skort', 'sweatershirt', 'jeans', 'sweatpants'],
      dtype=object)

In [114]:
df.loc[df['category'].isin(['sweater','shirt','blouse','vest','tank','tunic','print','sweatshirt',
                            'tee','blouson','turtleneck','hoodie','t-shirt','cami','crewneck',
                            'buttondown','sweatershirt'
                           ]),df.columns=='category']='tops'
df.loc[df['category'].isin(['pants','down','culottes','pant','trouser','culotte','jogger',
                            'trousers','jeans', 'sweatpants'
                           ]),df.columns=='category']='bottoms'
df.loc[df['category'].isin(['ballgown']),df.columns=='category']='gown'
df.loc[df['category'].isin(['leggings','legging']),df.columns=='category']='active'
df.loc[df['category'].isin(['jacket','trench','cape','bomber','blazer','duster','poncho',
                            'cardigan','peacoat','pullover','overcoat','parka'
                           ]),df.columns=='category']='coat'
df.loc[df['category'].isin(['mini','midi','skirts']),df.columns=='category']='skirt'
df.loc[df['category'].isin(['jumpsuit']),df.columns=='category']='romper'
df.loc[df['category'].isin(['sheath','shift','maxi','shirtdress','frock','kaftan','caftan'
                           ]),df.columns=='category']='dress'
df.loc[df['category'].isin(['overalls','combo','henley','tight','kimono','for','skort']),df.columns=='category']='other'

In [115]:
df.info()
df= pd.get_dummies(df, columns=['body type','category','rented for'])


<class 'pandas.core.frame.DataFrame'>
Int64Index: 146381 entries, 0 to 192543
Data columns (total 10 columns):
age           146381 non-null float64
body type     146381 non-null object
bust size     146381 non-null int64
category      146381 non-null object
height        146381 non-null float64
rating        146381 non-null float64
rented for    146381 non-null object
size          146381 non-null int64
weight        146381 non-null int64
cups          146381 non-null int64
dtypes: float64(3), int64(4), object(3)
memory usage: 12.3+ MB


In [116]:
df.head()

Unnamed: 0,age,bust size,height,rating,size,weight,cups,body type_apple,body type_athletic,body type_full bust,...,category_top,category_tops,rented for_date,rented for_everyday,rented for_formal affair,rented for_other,rented for_party,rented for_vacation,rented for_wedding,rented for_work
0,28.0,34,68.0,10.0,14,137,4,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,36.0,34,66.0,10.0,12,132,2,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,34.0,34,65.0,8.0,8,135,3,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,27.0,34,69.0,10.0,12,145,2,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,45.0,32,68.0,8.0,8,138,2,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [117]:
x=df.drop(columns=['rating'],axis=1)
y=df['rating']
x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.33)

In [118]:
#Linear Regression
lg=LinearRegression(normalize=True).fit(x_train,y_train)
lg_train_pred=lg.predict(x_train)
lg_train_mse=mean_squared_error(y_train, lg_train_pred)
print('Linear Regression Train MSE:',lg_train_mse)
lg_test_pred=lg.predict(x_test)
lg_test_mse=mean_squared_error(y_test, lg_test_pred)
print('Linear Regression Test MSE:',lg_test_mse)

Linear Regression Train MSE: 2.04277823371119
Linear Regression Test MSE: 2.0303368217796143


In [155]:
#Decision Tree Regressor
dt=DecisionTreeRegressor(min_samples_split=10,min_samples_leaf=10, max_depth=10).fit(x_train,y_train)
dt_train_pred=dt.predict(x_train)
dt_train_mse=mean_squared_error(y_train, dt_train_pred)
print('Decision Tree Regressor Train MSE:',dt_train_mse)
dt_test_pred=dt.predict(x_test)
dt_test_mse=mean_squared_error(y_test, dt_test_pred)
print('Decision Tree Regressor Test MSE:',dt_test_mse)

Decision Tree Regressor Train MSE: 1.9919434427017608
Decision Tree Regressor Test MSE: 2.0673287226618458


In [129]:
#Random Forest regressor
rm=RandomForestRegressor(max_depth=10,n_estimators=100,min_samples_split=10,min_samples_leaf=10).fit(x_train,y_train)
rm_train_pred=rm.predict(x_train)
rm_train_mse=mean_squared_error(y_train, rm_train_pred)
print('Random Forest Regressor Train MSE:',rm_train_mse)
rm_test_pred=rm.predict(x_test)
rm_test_mse=mean_squared_error(y_test, rm_test_pred)
print('Random Forest Regressor Test MSE:',rm_test_mse)

Random Forest Train MSE: 1.969289156665057
Random Forest Test MSE: 2.022420601355452


In [150]:
#Gradient Boosting Regressor
gb=GradientBoostingRegressor(loss='ls', learning_rate=0.001, n_estimators=100, min_samples_split=10, max_depth=10).fit(x_train,y_train)
gb_train_pred=gb.predict(x_train)
gb_train_mse=mean_squared_error(y_train, gb_train_pred)
print('Gradient Boosting Regressor Train MSE:',gb_train_mse)
gb_test_pred=gb.predict(x_test)
gb_test_mse=mean_squared_error(y_test, gb_test_pred)
print('Gradient Boosting Regressor Test MSE:',gb_test_mse)

Gradient Boosting Regressor Train MSE: 2.0585457329425094
Gradient Boosting Regressor Test MSE: 2.046469875371044
