In [0]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [3]:
cd "drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_csv('data/shoes_prices_men.csv', low_memory=False)

In [5]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(feautures, model=DecisionTreeRegressor(max_depth=5)):
  X = df[ feautures ].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')

  return np.mean(scores), np.std(scores)

In [13]:
df['brand_cat'] = df['brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [14]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [15]:
df.head

<bound method NDFrame.head of                          id asins  ... weight brand_cat
0      AVpfHrJ6ilAPnD_xVXOI   NaN  ...    NaN         0
1      AVpfHrJ6ilAPnD_xVXOI   NaN  ...    NaN         0
2      AVpfHsWP1cnluZ0-eVZ7   NaN  ...    NaN         1
3      AVpfHsWP1cnluZ0-eVZ7   NaN  ...    NaN         1
4      AVpfHsWP1cnluZ0-eVZ7   NaN  ...    NaN         1
...                     ...   ...  ...    ...       ...
18275  AVpfdSjlilAPnD_xcGPm   NaN  ...    NaN      1731
18276  AVpf3bFWilAPnD_xjrQ2   NaN  ...    NaN       299
18277  AVpf0fJXLJeJML43EVe9   NaN  ...    NaN       399
18278  AVpf0fJXLJeJML43EVe9   NaN  ...    NaN       399
18279  AVpf0fJXLJeJML43EVe9   NaN  ...    NaN       399

[18280 rows x 49 columns]>

In [16]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [21]:
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]'
literal_eval(str_dict)

[{'key': 'Gender', 'value': ['Men']},
 {'key': 'Color', 'value': ['Black']},
 {'key': 'Shipping Weight (in pounds)', 'value': ['0.45']},
 {'key': 'Condition', 'value': ['New']},
 {'key': 'Brand', 'value': ['SERVUS BY HONEYWELL']},
 {'key': 'manufacturer_part_number', 'value': ['ZSR101BLMLG']}]

In [0]:
def parse_feautures(x):
  output_dict = {}
  if str(x) == 'nan': return {}

  features =  literal_eval(x.replace('\\"','"'))
  
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()

    output_dict[key] = value

  return output_dict


df['features_parsed'] = df['features'].map(parse_feautures)

In [49]:
keys = set()

df['features_parsed'].map(lambda x: keys.update(x.keys()))

len(keys)

476

In [34]:
df.features_parsed.head().values

array([{'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'shoe size': 'm', 'shoe category': "men's shoes", 'color': 'multicolor', 'manufacturer part number': '8190-w-navy-7.5', 'brand': 'josmo'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'},
       {'gender': 'men', 'color': 'black', 'shipping weight (in pounds)': '0.45', 'condition': 'new', 'brand': 'servus by honeywell', 'manufacturer_part_number': 'zsr101blmlg'}],
      dtype=object)

In [60]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [61]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_applicable', 'feat_frame type', 'feat_country of manufacturer',
       'feat_occasion', 'feat_animal type', 'feat_item style',
       'feat_eye protection type', 'feat_has mercury', 'feat_has paper wood',
       'feat_international shipping'],
      dtype='object', length=526)

In [63]:
df[ df['feat_athlete'].isnull()].shape

(18272, 526)

In [64]:
df.shape

(18280, 526)

In [76]:
df[ False == df['feat_athlete'].isnull()].shape[0] / df.shape[0] *100

0.0437636761487965

In [0]:
keys_stat = {}

for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull()].shape[0] / df.shape[0] *100

In [91]:
{k:v for k,v in keys_stat.items() if v > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [117]:
df[ df.brand != df.feat_brand ][ ['brand', 'feat_brand'] ].head()

Unnamed: 0,brand,feat_brand
0,Josmo,josmo
1,Josmo,josmo
2,SERVUS BY HONEYWELL,servus by honeywell
3,SERVUS BY HONEYWELL,servus by honeywell
4,SERVUS BY HONEYWELL,servus by honeywell


In [119]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())
df[ df.brand != df.feat_brand ][ ['brand', 'feat_brand'] ].head()

Unnamed: 0,brand,feat_brand
21,rubies,generic
22,rubies,generic
23,rubies,generic
24,unbranded,
31,american fighter,


In [170]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(['brand_cat'], model)
result

(-57.27178415954161, 4.151746188287394)

In [0]:
feats_cat = [x for x in df.columns if 'cat' in x]
feats_cat

In [168]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_shape_cat', 'feat_metal type_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_sport_cat', 'feat_style_cat']

model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats, model)

(-57.257545821087525, 4.215740016225139)

In [171]:
X = df[ feats ].values
y = df[ 'prices_amountmin' ].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)

print(result)
perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

(-57.27178415954161, 4.151746188287394)


Weight,Feature
0.2557  ± 0.0079,brand_cat
0.1032  ± 0.0075,feat_material_cat
0.0252  ± 0.0026,feat_gender_cat
0.0177  ± 0.0007,feat_brand_cat
0.0127  ± 0.0012,feat_shape_cat
0.0096  ± 0.0017,feat_metal type_cat
0.0032  ± 0.0005,feat_style_cat
0.0002  ± 0.0000,feat_sport_cat


In [165]:
df.brand.value_counts(normalize=True)

nike                     0.097210
puma                     0.033315
ralph lauren             0.028775
vans                     0.021116
new balance              0.020295
                           ...   
gelato mens              0.000055
under armor              0.000055
ryan seacrest            0.000055
eddie bauer              0.000055
professional's choice    0.000055
Name: brand, Length: 1732, dtype: float64

In [166]:
df[ df['brand'] == 'nike' ].features_parsed.sample(5).values

array([{'gender': 'men', 'brand': 'nike', 'color': 'white/white-wolf grey-pure platinum'},
       {'sport': 'basketball, fitness, football, lacrosse, running', 'style': 'training shorts', 'material': 'polyester', 'country/region of manufacture': 'vietnam', 'color': 'blue/orange', 'condition': 'new with tags'},
       {'material': 'leather', 'gender': 'men', 'size': '11', 'color': 'black', 'model': '807219 001', 'manufacturer part number': '807219 001', 'brand': 'nike', 'age group': 'adult'},
       {'sport': 'soccer', 'main color': 'chrome'},
       {'style': 'trainers', 'sole': 'rubber', 'lining': 'textile', 'upper material': 'textile', 'colour': 'blue', 'condition': 'new without box'}],
      dtype=object)

In [167]:
df['feat_age group'].value_counts()

adult               4563
men                  350
child                 77
men's                 33
unisex                 6
infant                 4
mens                   4
toddler                4
boys'                  3
women                  2
women ,�� unisex       2
youth                  2
men||women             2
adult ,�� teen         1
12 up                  1
Name: feat_age group, dtype: int64

In [169]:
df.weight.unique() #TODO

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.