In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action="ignore", category=DataConversionWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
df = pd.read_csv("reviews.csv.gz", compression="gzip")

In [6]:
df.shape

(48136, 15)

In [7]:
df.describe(include='all')

Unnamed: 0,age,body type,bust size,category,fit,height,item_id,rating,rented for,review_date,review_summary,review_text,size,user_id,weight
count,47894.0,44544,43567,48136,48136,47952,48136.0,48116.0,48133,48136,48136,48136,48136.0,48136.0,40594
unique,,7,95,62,3,24,,,8,2120,41417,47935,,,173
top,,hourglass,34b,dress,fit,"5' 4""",,,wedding,"June 15, 2016",b'Stylist Review',b'.',,,130lbs
freq,,13887,6732,23179,35430,7100,,,14411,208,230,18,,,3576
mean,33.86658,,,,,,1043875.0,9.084255,,,,,12.248255,499091.94447,
std,8.018446,,,,,,806755.6,1.436092,,,,,8.52307,288853.095706,
min,0.0,,,,,,123373.0,2.0,,,,,0.0,47.0,
25%,29.0,,,,,,194182.0,8.0,,,,,8.0,249516.0,
50%,32.0,,,,,,945880.0,10.0,,,,,12.0,498811.0,
75%,37.0,,,,,,1678888.0,10.0,,,,,16.0,749178.75,


<p>So from the above I could see that there was 48136 examples in the dataset however many of the columns were missing values </p>
<p>Also some possible outliers can be seen, such as the maximum age is 117</p>
<p>Also the height and weight which should be numbers are strings/objects</p>
<p>The most common review is empty so those are not needed and should be removed</p>

In [8]:
df.dropna(subset=['weight'], inplace=True) # maybe impute cause 7000 examples gone 

In [9]:
df['weight'] = df['weight'].str.strip('lbs')

In [10]:
df['weight'] = df['weight'].astype(int)

<p>here I am converting the weight to an int. But something I noticed but i didn't include here was that removing the nan's ended up removing about 7000 examples so this is one feature maybe could be imputed</p>

In [11]:
df['age'].unique()

array([ 36.,  37.,  33.,  31.,  23.,  47.,  29.,  44.,  42.,  32.,  30.,
        28.,  54.,  41.,  38.,  25.,  27.,  46.,  48.,  40.,  39.,  63.,
        56.,  34.,  26.,  22.,  53.,  43.,  60.,  45.,  59.,  35.,  50.,
        21.,  75.,  66.,  24.,  69.,  17.,  57.,  16.,  49.,  20.,  64.,
        51.,  52.,  nan,  19.,  87.,  18.,  67.,  70.,  65.,  58.,  55.,
        61.,  62.,  72.,   0.,  85.,  15.,   9.,  68., 116.,  76.,   1.,
         4.,   2., 117.,   3.,  14.,  91.,   5.,  73.,  99., 112.,  77.])

In [12]:
df = df[(df['age'] <= 99) & (df['age'] >= 10)]

<p>Removed the exmaples where the ages were outside these values because could be outliers</p>

In [13]:
df.dropna(subset=['age'], inplace=True)

In [14]:
df['height'].unique()

array(['5\' 6"', '5\' 5"', '5\' 3"', '5\' 7"', '5\' 4"', '5\' 1"',
       '5\' 2"', '5\' 8"', '5\' 10"', '5\' 9"', '5\' 11"', '5\' 0"',
       '4\' 11"', '6\' 2"', '6\' 0"', nan, '4\' 10"', '6\' 1"', '4\' 9"',
       '4\' 8"', '4\' 7"', '4\' 6"', '6\' 3"', '6\' 6"', '6\' 4"'],
      dtype=object)

In [15]:
df.dropna(subset=['height'], inplace=True)

In [16]:
df['height'] = df['height'].str.strip("\"")
df['height'] = df['height'].str.replace("'", '.')
df['height'] = df['height'].str.replace(" ", '')

In [17]:
df['height'] = df['height'].astype(float)

<p>Converting the height to a float because the height of a person should be an int/float and the 'distance' betweens heights are related and important for example, if the dress is too short or too long because of their height</p>

In [18]:
df['fit'].unique()

array(['fit', 'large', 'small'], dtype=object)

In [19]:
df['body type'].unique()

array(['athletic', 'full bust', 'petite', 'hourglass',
       'straight & narrow', 'pear', 'apple', nan], dtype=object)

In [20]:
df.dropna(subset=['body type'], inplace=True)

In [21]:
df['bust size'].unique()

array(['32ddd/e', '36d', '34b', '34c', '36c', '32b', '36aa', '32c', '36b',
       '34dd', '32d', '36a', '34d', nan, '38ddd/e', '38d', '34a', '38c',
       '36ddd/e', '34d+', '32a', '34ddd/e', '32d+', '30dd', '34g', '38b',
       '40dd', '32dd', '34h', '36d+', '36h', '36dd', '30c', '34aa', '44b',
       '38dd', '30d', '32g', '38d+', '34f', '36g', '28dd', '32aa', '32f',
       '28a', '30ddd/e', '40d', '30b', '38a', '30a', '44g', '32h', '38f',
       '42g', '28c', '42ddd/e', '42d', '42dd', '36f', '40f', '28aa',
       '38g', '30f', '28d', '42b', '30g', '40c', '42c', '40ddd/e', '38aa',
       '30h', '28ddd/e', '36i', '28b', '44d', '40h', '40g', '44f', '32j',
       '38i', '28f', '36j', '30aa', '44c', '44ddd/e', '38j', '38h',
       '44dd', '40b'], dtype=object)

In [22]:
df.dropna(subset=['bust size'], inplace=True) 

In [23]:
df['category'].unique()

array(['sweater', 'gown', 'dress', 'sheath', 'maxi', 'romper', 'top',
       'mini', 'shirtdress', 'blouse', 'jumpsuit', 'shift', 'pants',
       'coat', 'culotte', 'jacket', 'tank', 'culottes', 'skirt', 'tunic',
       'blazer', 'sweatshirt', 'down', 'frock', 'vest', 'overalls',
       'skirts', 'cape', 'cardigan', 'bomber', 'shirt', 'suit', 'henley',
       'hoodie', 'poncho', 'for', 'kimono', 'blouson', 'pullover',
       'trousers', 'turtleneck', 'kaftan', 'pant', 't-shirt', 'ballgown',
       'knit', 'legging', 'print', 'trench', 'cami', 'leggings', 'duster',
       'trouser', 'tee', 'midi', 'peacoat', 'combo', 'skort', 'parka',
       'buttondown', 'crewneck'], dtype=object)

In [24]:
df['category'] = df['category'].str.replace("culote", "culottes")
df['category'] = df['category'].str.replace("leggings", "leggings")
df['category'] = df['category'].str.replace("pant", "pants")
df['category'] = df['category'].str.replace("skirt", "skirts")
df['category'] = df['category'].str.replace("trousers", "trouser")

<p>Correcting some mistakes</p>

In [25]:
df['rented for'].unique()

array(['everyday', 'wedding', 'work', 'other', 'formal affair', 'party',
       'date', 'vacation', nan], dtype=object)

In [26]:
df.dropna(subset=['rented for'], inplace=True)

In [27]:
df['size'].unique()

array([ 8, 20, 12,  1,  4, 15, 16, 32, 24, 13, 11, 14, 58,  9, 35, 17, 28,
        5,  7, 45, 39, 29,  3,  0, 25, 21,  2, 26, 36, 40, 51, 57, 19, 48,
       33, 54, 42, 27, 46, 10, 38, 22, 52, 23, 43, 49,  6, 37, 34, 18, 30])

In [28]:
df['rating'].unique()

array([10.,  8.,  6.,  4.,  2., nan])

In [29]:
df.dropna(subset=['rating'], inplace=True)

In [30]:
df.reset_index(drop=True, inplace=True)

In [31]:
y = df["rating"]

<p>So I tried out 8 differernt models, 2 with standardscaler and onehotencoding, 2 with imputing the weights, 2 with tfidf, 2 with PCA all as both regressor and classifier to see which would retrun the best values. PCA because many features see irrelevent to me such as the item_id and the rented_for features</p>
<p>I used tfidf to see if I could find similarities in the reviews that would help to see a pattern between the other features</p>
<p>And imputing obviously because dropping nan's in weight removes about 7000 examples</p>

In [37]:
numeric_features_1 = ["age", "size", "user_id","item_id", "weight", "height" ]
nominal_features_1 = ["body type", "bust size", "category", "fit", "rented for", "review_date"]

numeric_features_2 = ["age", "size", "user_id","item_id", "weight", "height" ]
nominal_features_2 = ["body type", "bust size", "category", "fit", "rented for", "review_date", 'review_summary', 'review_text']

preprocessor_with_impute = ColumnTransformer([
        ("num", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
                          ("scaler", StandardScaler())]), 
                numeric_features_2),
        ("nom", Pipeline([("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")), 
                          ("binarizer", OneHotEncoder(handle_unknown="ignore"))]), 
                nominal_features_2)], 
        remainder="drop")

preprocessor_with_tfidf = (ColumnTransformer([        
    ("num", StandardScaler(), numeric_features_1),
    ("tfidf", TfidfVectorizer(), 'review_summary'), 
    ("tfidf1",TfidfVectorizer(), 'review_text'), 
    ("nom", OneHotEncoder(handle_unknown='ignore'),nominal_features_1)]))

preprocessor_with_PCA = ColumnTransformer([
        ("num", PCA(n_components=0.9), numeric_features_2),
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_features_2)], 
        remainder="drop")

preprocessor = ColumnTransformer([
("num", StandardScaler(), numeric_features_2),
("nom", OneHotEncoder(handle_unknown="ignore"), nominal_features_2)],
remainder="drop")

pipeline_reg = Pipeline([
("pre", preprocessor),
("est", LinearRegression())])

pipeline_reg_PCA = Pipeline([
("pre", preprocessor_with_PCA),
("est", LinearRegression())])

pipeline_reg_impute = Pipeline([
("pre", preprocessor_with_impute),
("est", LinearRegression())])

pipeline_reg_tfidf = Pipeline([
("pre", preprocessor_with_tfidf),
("est", LinearRegression())])

pipeline_class = Pipeline([
("pre", preprocessor),
("est", LogisticRegression(multi_class="multinomial", solver="newton-cg"))])

pipeline_class_PCA = Pipeline([
("pre", preprocessor_with_PCA),
("est", LogisticRegression(multi_class="multinomial", solver="newton-cg"))])

pipeline_class_impute = Pipeline([
("pre", preprocessor_with_impute),
("est", LogisticRegression(multi_class="multinomial", solver="newton-cg"))])

pipeline_class_tfidf = Pipeline([
("pre", preprocessor_with_tfidf),
("est", LogisticRegression(multi_class="multinomial", solver="newton-cg"))])


maj_pipeline_reg = DummyRegressor(strategy = "mean")
maj_pipeline_class = DummyClassifier(strategy = "most_frequent")

In [38]:
ss = ShuffleSplit(n_splits=1, train_size=0.8)

In [None]:
print("neg mean : ", cross_val_score(pipeline_reg_tfidf, df, y, scoring="neg_mean_absolute_error", cv=ss))
#print("classifier : ",cross_val_score(pipeline_class, df, y, scoring="accuracy", cv=ss))
print("dummy : ", cross_val_score(maj_pipeline_reg, df, y, scoring="neg_mean_absolute_error", cv=ss))
#print("dummy : ", cross_val_score(maj_pipeline_class, df, y, scoring="accuracy", cv=ss))

<h1>Other Models</h1>
<p>I did other models for example one where I imputed that value of weight as 7000 examples had a 'nan' value</p>

<p>These are the results of hte regressor with imputing the values of weight </p>
<p>array([-1.0752906 , -1.05740605, -1.08392036, -1.07584011, -1.07038251,
       -1.08051338, -1.07397215, -1.09999074, -1.09103645, -1.07517578]</p>

<p>And for the Classifier it ended up being worse than the dummy class</p>

<p>classifier : [0.6369583 0.64221434 0.64630488 0.64385056 0.6425648 0.64283765 0.64529332 0.63383356 0.63673581 0.6345524 ]</p>

<p>Another model was with PCA regressor on the features </p>
<p>neg mean :  [-1.10998851 -1.0870953  -1.12144093 -1.11844097 -1.10428622 -1.12002683
 -1.09118082 -1.12969712 -1.12047193 -1.10897752]</p>

<p>And running the first set of features on the classifier</p>
<p>classifier :  [0.6369583  0.64221434 0.64630488 0.64385056 0.6425648  0.64283765
 0.64529332 0.63383356 0.63673581 0.6345524 ]</p>