In [1]:
import os

import gzip
import numpy as np
import pandas as pd

import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE,RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline

import torch
import torch.nn as nn
import torch.optim as optim
import spacy

from spacy.lang.en import English

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield json.loads(l)

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [4]:
rent_runway = pd.DataFrame(list(readGz("./renttherunway_final_data.json.gz")))

In [5]:
# Data Cleaning Cell
rent_runway["bust size"] = rent_runway["bust size"].str[2:]
rent_runway["weight"] = rent_runway["weight"].str[:-3].astype(float)
rent_runway["height"] = (rent_runway["height"].str[0].astype(float) * 12) + rent_runway["height"].str[2:-1].astype(float)
rent_runway["age"] = rent_runway["age"].astype(float)
rent_runway["rating"] = rent_runway["rating"].astype(float)
rent_runway["review_date"] = pd.to_datetime(rent_runway["review_date"])
rent_runway["year"] = rent_runway["review_date"].dt.year
rent_runway["month"] = rent_runway["review_date"].dt.month
rent_runway.loc[rent_runway["rented for"] == "party: cocktail", "rented for"] = "other"
rent_runway = rent_runway.drop(columns="review_date")
filtered_runway = rent_runway.dropna()

# EDA

- Turn `category` columns into values that inform the `rented for` column
    - If an item is not largely associated with a particular category, label it as "general" or "other"
- Look for correlated features and point them out
- If you have time, perform DSC80 style permutation tests with the TVD test stat to get pvalues that represent if a correlation exists with the `rented for` column
    - These would include the following columns: \['fit', 'body type', 'age', 'year', 'month'\]
- Any extra stuff would be sick

# Model Dev

In [6]:
X = filtered_runway.drop(columns=["rented for","user_id","item_id"])
X["age"] = pd.qcut(X["age"],q=10).astype(str)
y = filtered_runway["rented for"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=57)

In [7]:
# Baseline model
categories = ['other','vacation', 'formal', 'wedding', 'date', 'everyday', 'party', 'work']
preds = pd.DataFrame(columns=categories)
for category in categories[1:]:
    preds[category] = filtered_runway["review_text"].str.count(category)
preds = preds.fillna(0)
preds = preds.T.idxmax()
preds = preds.str.replace('formal',"formal affair")

In [8]:
balanced_accuracy_score(y,preds)

0.2640868832151243

## Real Model Dev

In [None]:
sizes = ['a', 'aa', 'b', 'c', 'd', 'd+', 'dd', 'ddd/e', 'f', 'g', 'h', 'i', 'j']
one_hot_vars = ['fit', 'body type', 'age', 'year', 'month']
numerical_cols = ["weight","height","size","rating"]
preper = ColumnTransformer([
    ("one-hot",OneHotEncoder(), one_hot_vars),
    ("ordinal", OrdinalEncoder(categories=[sizes]), ['bust size']),
    ("identity", FunctionTransformer(), numerical_cols)
])
pipe = Pipeline([
    ("preproc", preper),
#     ("standard", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=2000))
])

In [13]:
X_preproc = preper.fit_transform(X)

In [14]:
selector = RFECV(LogisticRegression(), min_features_to_select=10, step=2, cv=5, n_jobs=-1)
selector = selector.fit(X_preproc, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [16]:
selector.support_

array([ True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True])

In [15]:
selector.score(X_preproc,y)

0.36396800131164564

## Costin's NLP Zone

In [9]:
nlp = spacy.load('en_core_web_sm')
tokens = nlp("Costin's rock.")

In [10]:
nlp = spacy.load('en_core_web_sm')
def tokenize_sentence(sent):
    return [token.text for token in nlp(sent)]

In [11]:
def get_data(text_col):
    corpus = load_corpus()
    sents = segment_and_tokenize(corpus)
    word_to_ix = make_word_to_ix(sents)
    vectorized_sents = vectorize_sents(sents,word_to_ix)

    vocab_size = len(word_to_ix)

    return vectorized_sents, vocab_size