In [77]:
%load_ext autoreload
%autoreload
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
import xgboost as xgb
from sklearn.metrics import classification_report

import sys
sys.path.append('../')
import support_utils as sup
import data_explore_utils as dex

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows',300)

In [79]:
from matplotlib import style
style.use('seaborn-dark')

## Read in Candsets With Features

In [133]:
# see DocString of readDataInDictionary() function for information

# ***CHANGE THE FOLLOWING VARIABLES IF STH CHANGED***
path_to_directory = '../../candsets/home/_archive/v6/'
pattern_of_filename = 'candset_feature_(.{3,5}_.{2,4})'
sep = ','
# ***********************************************

candsets_feature = sup.readDataInDictionary(path_to_directory,pattern_of_filename,sep)

In [134]:
candsets_feature.keys()

dict_keys(['katom_cdi', 'rewo_cdi', 'katom_rewo'])

### Read in the org Datasets

In [135]:
# ***CHANGE THE FOLLOWING VARIABLES IF STH CHANGED***
path_to_directory = '../../datasets/home/_archive/v2/'
pattern_of_filename = '(.{2,5})'
sep = ','
# ***********************************************

datasets = sup.readDataInDictionary(path_to_directory,pattern_of_filename,sep)

In [136]:
datasets.keys()

dict_keys(['katom', 'rewo', 'cdi'])

In [137]:
# getting the orignial attribute values for the candidate feature sets
candsets_with_org_attr = sup.getCandsetsWithOrgAttribute(candsets_feature,datasets)

#### Feature

In [85]:
# 1. Only with the spec. table attributes
spec_feature = ['style_cosine_tfidf_sim', 'style_lev_sim',
       'style_jac_q3_sim', 'style_jac_an_sim', 'style_rel_jac_an_sim',
       'style_containment_sim', 'style_exact_sim', 
       'product_type_cosine_tfidf_sim', 'product_type_lev_sim',
       'product_type_jac_q3_sim', 'product_type_jac_an_sim',
       'product_type_rel_jac_an_sim', 'product_type_containment_sim',
       'product_type_exact_sim', 'material_lev_sim', 'material_jac_q3_sim',
       'material_jac_an_sim', 'material_rel_jac_an_sim',
       'material_containment_sim', 'material_exact_sim',
       'capacity_lev_sim', 'capacity_jac_q3_sim',
       'capacity_jac_an_sim', 'capacity_rel_jac_an_sim',
       'capacity_containment_sim', 'capacity_exact_sim', 'shape_lev_sim',
       'shape_jac_q3_sim', 'shape_jac_an_sim', 'shape_rel_jac_an_sim',
       'shape_containment_sim', 'shape_exact_sim', 'category_lev_sim',
       'category_jac_q3_sim', 'category_jac_an_sim', 'category_rel_jac_an_sim',
       'category_containment_sim', 'category_exact_sim', 'finish_lev_sim',
       'finish_jac_q3_sim', 'finish_jac_an_sim', 'finish_rel_jac_an_sim',
       'finish_containment_sim', 'finish_exact_sim', 'color_lev_sim',
       'color_jac_q3_sim', 'color_jac_an_sim', 'color_rel_jac_an_sim',
       'color_containment_sim', 'color_exact_sim', 'base_cosine_tfidf_sim',
       'base_lev_sim', 'base_jac_q3_sim', 'base_jac_an_sim',
       'base_rel_jac_an_sim', 'base_containment_sim', 'base_exact_sim',
       'height_lev_sim', 'height_jac_q3_sim',
       'height_jac_an_sim', 'height_rel_jac_an_sim', 'height_containment_sim',
       'height_exact_sim']

In [86]:
#  2. Only with titles
title_feature = ['title_cosine_tfidf_sim', 'title_lev_sim', 'title_jac_q3_sim','title_jac_an_sim', 'title_rel_jac_an_sim', 'title_containment_sim','title_exact_sim']

In [87]:
#  2. Only with titles
desc_feature = ['description_cosine_tfidf_sim', 'description_lev_sim','description_jac_q3_sim', 'description_jac_an_sim',
                'description_rel_jac_an_sim', 'description_containment_sim','description_exact_sim']

In [88]:
# 3. Spec table attributes + titles
spec_title_feature = ['style_cosine_tfidf_sim', 'style_lev_sim',
       'style_jac_q3_sim', 'style_jac_an_sim', 'style_rel_jac_an_sim',
       'style_containment_sim', 'style_exact_sim', 
       'product_type_cosine_tfidf_sim', 'product_type_lev_sim',
       'product_type_jac_q3_sim', 'product_type_jac_an_sim',
       'product_type_rel_jac_an_sim', 'product_type_containment_sim',
       'product_type_exact_sim', 'material_lev_sim', 'material_jac_q3_sim',
       'material_jac_an_sim', 'material_rel_jac_an_sim',
       'material_containment_sim', 'material_exact_sim',
       'capacity_lev_sim', 'capacity_jac_q3_sim',
       'capacity_jac_an_sim', 'capacity_rel_jac_an_sim',
       'capacity_containment_sim', 'capacity_exact_sim', 'shape_lev_sim',
       'shape_jac_q3_sim', 'shape_jac_an_sim', 'shape_rel_jac_an_sim',
       'shape_containment_sim', 'shape_exact_sim', 'category_lev_sim',
       'category_jac_q3_sim', 'category_jac_an_sim', 'category_rel_jac_an_sim',
       'category_containment_sim', 'category_exact_sim', 'finish_lev_sim',
       'finish_jac_q3_sim', 'finish_jac_an_sim', 'finish_rel_jac_an_sim',
       'finish_containment_sim', 'finish_exact_sim', 'color_lev_sim',
       'color_jac_q3_sim', 'color_jac_an_sim', 'color_rel_jac_an_sim',
       'color_containment_sim', 'color_exact_sim', 'base_cosine_tfidf_sim',
       'base_lev_sim', 'base_jac_q3_sim', 'base_jac_an_sim',
       'base_rel_jac_an_sim', 'base_containment_sim', 'base_exact_sim',
       'height_lev_sim', 'height_jac_q3_sim',
       'height_jac_an_sim', 'height_rel_jac_an_sim', 'height_containment_sim',
       'height_exact_sim','title_cosine_tfidf_sim', 'title_lev_sim', 
       'title_jac_q3_sim','title_jac_an_sim', 'title_rel_jac_an_sim', 
       'title_containment_sim','title_exact_sim']

In [89]:
# 4. Spec table attributes+titles+description+brand+price
all_feature = ['capacity_lev_sim', 'capacity_jac_q3_sim',
       'capacity_jac_an_sim', 'capacity_rel_jac_an_sim',
       'capacity_containment_sim', 'capacity_exact_sim',
       'style_cosine_tfidf_sim', 'style_lev_sim', 'style_jac_q3_sim',
       'style_jac_an_sim', 'style_rel_jac_an_sim', 'style_containment_sim',
       'style_exact_sim', 'base_cosine_tfidf_sim', 'base_lev_sim',
       'base_jac_q3_sim', 'base_jac_an_sim', 'base_rel_jac_an_sim',
       'base_containment_sim', 'base_exact_sim', 'material_lev_sim',
       'material_jac_q3_sim', 'material_jac_an_sim', 'material_rel_jac_an_sim',
       'material_containment_sim', 'material_exact_sim', 'brand_lev_sim',
       'brand_jac_q3_sim', 'brand_jac_an_sim', 'brand_rel_jac_an_sim',
       'brand_containment_sim', 'brand_exact_sim', 'shape_lev_sim',
       'shape_jac_q3_sim', 'shape_jac_an_sim', 'shape_rel_jac_an_sim',
       'shape_containment_sim', 'shape_exact_sim', 'category_lev_sim',
       'category_jac_q3_sim', 'category_jac_an_sim', 'category_rel_jac_an_sim',
       'category_containment_sim', 'category_exact_sim',
       'title_cosine_tfidf_sim', 'title_lev_sim', 'title_jac_q3_sim',
       'title_jac_an_sim', 'title_rel_jac_an_sim', 'title_containment_sim',
       'title_exact_sim', 'finish_lev_sim', 'finish_jac_q3_sim',
       'finish_jac_an_sim', 'finish_rel_jac_an_sim', 'finish_containment_sim',
       'finish_exact_sim', 'product_type_cosine_tfidf_sim',
       'product_type_lev_sim', 'product_type_jac_q3_sim',
       'product_type_jac_an_sim', 'product_type_rel_jac_an_sim',
       'product_type_containment_sim', 'product_type_exact_sim',
       'color_lev_sim', 'color_jac_q3_sim', 'color_jac_an_sim',
       'color_rel_jac_an_sim', 'color_containment_sim', 'color_exact_sim',
       'height_lev_sim', 'height_jac_q3_sim', 'height_jac_an_sim',
       'height_rel_jac_an_sim', 'height_containment_sim', 'height_exact_sim',
       'description_cosine_tfidf_sim', 'description_lev_sim',
       'description_jac_q3_sim', 'description_jac_an_sim',
       'description_rel_jac_an_sim', 'description_containment_sim',
       'description_exact_sim']

In [90]:
spec_desc_brand_feature = ['capacity_lev_sim', 'capacity_jac_q3_sim',
       'capacity_jac_an_sim', 'capacity_rel_jac_an_sim',
       'capacity_containment_sim', 'capacity_exact_sim',
       'style_cosine_tfidf_sim', 'style_lev_sim', 'style_jac_q3_sim',
       'style_jac_an_sim', 'style_rel_jac_an_sim', 'style_containment_sim',
       'style_exact_sim', 'base_cosine_tfidf_sim', 'base_lev_sim',
       'base_jac_q3_sim', 'base_jac_an_sim', 'base_rel_jac_an_sim',
       'base_containment_sim', 'base_exact_sim', 'material_lev_sim',
       'material_jac_q3_sim', 'material_jac_an_sim', 'material_rel_jac_an_sim',
       'material_containment_sim', 'material_exact_sim', 'brand_lev_sim',
       'brand_jac_q3_sim', 'brand_jac_an_sim', 'brand_rel_jac_an_sim',
       'brand_containment_sim', 'brand_exact_sim', 'shape_lev_sim',
       'shape_jac_q3_sim', 'shape_jac_an_sim', 'shape_rel_jac_an_sim',
       'shape_containment_sim', 'shape_exact_sim', 'category_lev_sim',
       'category_jac_q3_sim', 'category_jac_an_sim', 'category_rel_jac_an_sim',
       'category_containment_sim', 'category_exact_sim',
       'finish_lev_sim', 'finish_jac_q3_sim',
       'finish_jac_an_sim', 'finish_rel_jac_an_sim', 'finish_containment_sim',
       'finish_exact_sim', 'product_type_cosine_tfidf_sim',
       'product_type_lev_sim', 'product_type_jac_q3_sim',
       'product_type_jac_an_sim', 'product_type_rel_jac_an_sim',
       'product_type_containment_sim', 'product_type_exact_sim',
       'color_lev_sim', 'color_jac_q3_sim', 'color_jac_an_sim',
       'color_rel_jac_an_sim', 'color_containment_sim', 'color_exact_sim',
       'height_lev_sim', 'height_jac_q3_sim', 'height_jac_an_sim',
       'height_rel_jac_an_sim', 'height_containment_sim', 'height_exact_sim',
       'description_cosine_tfidf_sim', 'description_lev_sim',
       'description_jac_q3_sim', 'description_jac_an_sim',
       'description_rel_jac_an_sim', 'description_containment_sim',
       'description_exact_sim']

In [91]:
spec_brand_feature = ['capacity_lev_sim', 'capacity_jac_q3_sim',
       'capacity_jac_an_sim', 'capacity_rel_jac_an_sim',
       'capacity_containment_sim', 'capacity_exact_sim',
       'style_cosine_tfidf_sim', 'style_lev_sim', 'style_jac_q3_sim',
       'style_jac_an_sim', 'style_rel_jac_an_sim', 'style_containment_sim',
       'style_exact_sim', 'base_cosine_tfidf_sim', 'base_lev_sim',
       'base_jac_q3_sim', 'base_jac_an_sim', 'base_rel_jac_an_sim',
       'base_containment_sim', 'base_exact_sim', 'material_lev_sim',
       'material_jac_q3_sim', 'material_jac_an_sim', 'material_rel_jac_an_sim',
       'material_containment_sim', 'material_exact_sim', 'brand_lev_sim',
       'brand_jac_q3_sim', 'brand_jac_an_sim', 'brand_rel_jac_an_sim',
       'brand_containment_sim', 'brand_exact_sim', 'shape_lev_sim',
       'shape_jac_q3_sim', 'shape_jac_an_sim', 'shape_rel_jac_an_sim',
       'shape_containment_sim', 'shape_exact_sim', 'category_lev_sim',
       'category_jac_q3_sim', 'category_jac_an_sim', 'category_rel_jac_an_sim',
       'category_containment_sim', 'category_exact_sim',
       'finish_lev_sim', 'finish_jac_q3_sim',
       'finish_jac_an_sim', 'finish_rel_jac_an_sim', 'finish_containment_sim',
       'finish_exact_sim', 'product_type_cosine_tfidf_sim',
       'product_type_lev_sim', 'product_type_jac_q3_sim',
       'product_type_jac_an_sim', 'product_type_rel_jac_an_sim',
       'product_type_containment_sim', 'product_type_exact_sim',
       'color_lev_sim', 'color_jac_q3_sim', 'color_jac_an_sim',
       'color_rel_jac_an_sim', 'color_containment_sim', 'color_exact_sim',
       'height_lev_sim', 'height_jac_q3_sim', 'height_jac_an_sim',
       'height_rel_jac_an_sim', 'height_containment_sim', 'height_exact_sim']

### Preprocessing to mitigate influence of title attribute

#### Katom_Cdi

In [92]:
# I picked the most important feature (title_containment_sim) that alone already achieved 88% F1
# and trained a LogRegCV(cv=5, solver='liblinear', max_iter=1000) model on the whole set
# I then did run predict_proba on that in oder to get the most confident cases.

features = title_feature
combo = 'katom_cdi'
random_state = 42
test_size = 0.33

estimator_title_containment = LogisticRegressionCV(random_state=random_state,cv=5, solver='liblinear', max_iter=1000)

X_title_containment = candsets_feature[combo][features].copy()
y_title_containment = candsets_feature[combo]['label'].copy()

#X_train_title_containment, X_test_title_containment, y_train_title_containment, y_test_title_containment = train_test_split(X_title_containment,y_title_containment,test_size=test_size, 
#                                                                                                                            random_state=random_state, stratify=y_title_containment)

estimator_title_containment.fit(X_title_containment,y_title_containment)
pred_title_containment = estimator_title_containment.predict(X_title_containment)
predict_proba_title_containment = estimator_title_containment.predict_proba(X_title_containment)

In [93]:
target_names = ['non-match', 'match']
print(classification_report(y_title_containment , pred_title_containment, target_names=target_names))

              precision    recall  f1-score   support

   non-match       0.96      0.87      0.91       672
       match       0.88      0.96      0.92       672

    accuracy                           0.92      1344
   macro avg       0.92      0.92      0.92      1344
weighted avg       0.92      0.92      0.92      1344



#### Katom_rewo

In [94]:
# I picked the most important feature (title_containment_sim) that alone already achieved 88% F1
# and trained a LogRegCV(cv=5, solver='liblinear', max_iter=1000) model on the whole set
# I then did run predict_proba on that in oder to get the most confident cases.

features = title_feature
combo = 'katom_rewo'
random_state = 42
test_size = 0.33

estimator_title_containment = LogisticRegressionCV(random_state=random_state,cv=5, solver='liblinear', max_iter=1000)

X_title_containment = candsets_feature[combo][features].copy()
y_title_containment = candsets_feature[combo]['label'].copy()

#X_train_title_containment, X_test_title_containment, y_train_title_containment, y_test_title_containment = train_test_split(X_title_containment,y_title_containment,test_size=test_size, 
#                                                                                                                            random_state=random_state, stratify=y_title_containment)

estimator_title_containment.fit(X_title_containment,y_title_containment)
pred_title_containment = estimator_title_containment.predict(X_title_containment)
predict_proba_title_containment = estimator_title_containment.predict_proba(X_title_containment)

In [95]:
target_names = ['non-match', 'match']
print(classification_report(y_title_containment , pred_title_containment, target_names=target_names))

              precision    recall  f1-score   support

   non-match       0.81      0.80      0.81      8549
       match       0.81      0.81      0.81      8549

    accuracy                           0.81     17098
   macro avg       0.81      0.81      0.81     17098
weighted avg       0.81      0.81      0.81     17098



#### Rewo_Cdi

In [96]:
# I picked the most important feature (title_containment_sim) that alone already achieved 88% F1
# and trained a LogRegCV(cv=5, solver='liblinear', max_iter=1000) model on the whole set
# I then did run predict_proba on that in oder to get the most confident cases.

features = title_feature
combo = 'rewo_cdi'
random_state = 42
test_size = 0.33

estimator_title_containment = LogisticRegressionCV(random_state=random_state,cv=5, solver='liblinear', max_iter=1000)

X_title_containment = candsets_feature[combo][features].copy()
y_title_containment = candsets_feature[combo]['label'].copy()

#X_train_title_containment, X_test_title_containment, y_train_title_containment, y_test_title_containment = train_test_split(X_title_containment,y_title_containment,test_size=test_size, 
#                                                                                                                            random_state=random_state, stratify=y_title_containment)

estimator_title_containment.fit(X_title_containment,y_title_containment)
pred_title_containment = estimator_title_containment.predict(X_title_containment)
predict_proba_title_containment = estimator_title_containment.predict_proba(X_title_containment)

In [97]:
target_names = ['non-match', 'match']
print(classification_report(y_title_containment , pred_title_containment, target_names=target_names))

              precision    recall  f1-score   support

   non-match       0.96      0.98      0.97      2664
       match       0.98      0.96      0.97      2664

    accuracy                           0.97      5328
   macro avg       0.97      0.97      0.97      5328
weighted avg       0.97      0.97      0.97      5328



### Additional Pre-processing

In [138]:
candsets_with_org_attr['katom_cdi'][['katom_title','katom_brand','cdi_title','cdi_brand','katom_height','cdi_height','label']]

Unnamed: 0,katom_title,katom_brand,cdi_title,cdi_brand,katom_height,cdi_height,label
0,"""Beverage Air BB48GSYF-1-B 48"" (2) Section Bar Refrigerator - Sliding Glass Doors, 115v""@en Doors 115v ""@en","""Beverage Air""@en",""" Continental Refrigerator BBC50S-GD Back Bar Cooler ""Continental | Culinary Depot""","""Continental Refrigerator""","'12 1 cu ft', '306.00'",'355.00 Pound(s)',0
1,"""Follett ABSPECLEG1 12"" Legs, Stainless Steel""@en Steel ""@en Legs","""Follett""@en",""" Red Goat 30-A-154 Cone Hopper ""Red | Culinary Depot""","""Red Goat""",,,0
2,"""Winco MXBT-2000Q 20-qt Mixing Bowl - Stainless""@en Stainless ""@en","""Winco""@en",""" Adcraft LAD-6PE 4.666 Oz. Portion Control Ladle - White ""Adcraft | Culinary Depot""","""Admiral Craft""",,,0
3,"""Accutemp ACEL-60 2203 Stationary Steam Kettle w/ 60-gal Capacity, Stainless, 220/3 V""@en V ""@en Stainless Capacity","""Accutemp""@en",""" Vulcan PLTRAIL-48 Plate Rail ""Vulcan | Culinary Depot""","""Vulcan""",,'10 5',0
4,"""Carlisle 1410FG012 Rectangular Cafeteria Tray - 13-3/4x10-5/8"" Sea Spray""@en Spray ""@en","""Carlisle""@en",""" Cambro 1000522 10"" Round Burgundy Wine Fiberglass Camtray ""Cambro | Culinary Depot""","""Cambro""",,,0
...,...,...,...,...,...,...,...
1339,"""Cambro UC1000401 10-1/2-gal Ultra Camtainer Beverage Carrier - Insulated, Slate Blue""@en Blue ""@en Insulated","""Cambro""@en",""" Cambro UC1000401 10 Gallon Slate Blue Ultra Camtainer Beverage Carrier - 25.75"" H x 16.25"" W 20.5"" D ""Cambro | Culinary Depot""","""Cambro""",'29.27',,1
1340,"""Jackson 05930-121-75-66 Cold Water Thermostat For AJ-66, AJ-80, AJX-66, AJX-80, AJ-66T, AJ-100""@en AJ-66T AJ-80 AJ-66 AJX-80 AJ-100 ""@en AJX-66","""Jackson""@en",""" Jackson 05930-121-75-66 Cold Water Thermostat (Aj-66 ""Jackson | Culinary Depot""","""Jackson""",,,1
1341,"""Accutemp E64403E120SGL Electric Floor Model Steamer w/ (6) Full Size Pan Capacity, 440v/3ph""@en Capacity 440v/3ph ""@en","""Accutemp""@en",""" AccuTemp E64403E120 SGL Connected Evolution Boilerless ""AccuTemp | Culinary Depot""","""AccuTemp""","'12 x 20', '281.00', '60'","'12 x 20', '281.00 Pound(s)'",1
1342,"""Advance Tabco 94-23-60-24RL 115"" 3-Compartment Sink w/ 20""L x 20""W Bowl, 14"" Deep""@en Bowl Deep ""@en","""Advance Tabco""@en",""" Advance Tabco 94-23-60-24RL Regaline Sink ""Advance | Culinary Depot""","""Advance Tabco""",'276.00',"'11""', '276.00 Pound(s)'",1


In [139]:
candsets_with_org_attr['katom_cdi']['cdi_height']

0                  '355.00 Pound(s)'
1                                NaN
2                                NaN
3                             '10 5'
4                                NaN
                    ...             
1339                             NaN
1340                             NaN
1341    '12 x 20', '281.00 Pound(s)'
1342        '11"', '276.00 Pound(s)'
1343               '512.00 Pound(s)'
Name: cdi_height, Length: 1344, dtype: object

In [140]:
candsets_with_org_attr['katom_cdi']['cdi_height'] = candsets_with_org_attr['katom_cdi']['cdi_height'].str.replace(r'Pound\(s\)','').str.strip()

In [141]:
candsets_with_org_attr['rewo_cdi']['cdi_height'] = candsets_with_org_attr['rewo_cdi']['cdi_height'].str.replace(r'Pound\(s\)','').str.strip()

In [142]:
candsets_with_org_attr['rewo_cdi']['cdi_height'].value_counts()

'1075.00 '             1039
'735.00 '               376
'660.00 '               305
'1320.00 '               30
'475.00 '                24
                       ... 
'no ss'                   1
'55.00 '                  1
'26 gauge', '2.94 '       1
'12 30', '731.00 '        1
'212.00 '                 1
Name: cdi_height, Length: 810, dtype: int64

In [143]:
attr_katom_cdi = ['katom_brand', 'katom_style', 'katom_product_type', 'katom_material', 'katom_color', 'katom_finish', 'katom_height', 'katom_capacity', 
        'katom_base', 'katom_shape', 'katom_category', 'katom_brand','cdi_brand', 'cdi_style', 'cdi_product_type', 'cdi_material', 'cdi_color', 
        'cdi_finish', 'cdi_height', 'cdi_capacity', 'cdi_base', 'cdi_shape', 'cdi_category', 'cdi_brand']

In [144]:
from collections import OrderedDict
candsets_with_org_attr['katom_cdi'].replace(np.nan,'',inplace=True)
candsets_with_org_attr['katom_cdi'][attr_katom_cdi] = candsets_with_org_attr['katom_cdi'][attr_katom_cdi].applymap(lambda x: x.lower().split(', ')).applymap(lambda x: OrderedDict.fromkeys(x).keys()).applymap(lambda x: ' '.join(x).strip())
candsets_with_org_attr['katom_cdi'].replace('',np.nan,inplace=True)

In [145]:
attr_rewo_cdi = ['rewo_brand', 'rewo_style', 'rewo_product_type', 'rewo_material', 'rewo_color', 'rewo_finish', 'rewo_height', 'rewo_capacity', 
        'rewo_base', 'rewo_shape', 'rewo_category', 'rewo_brand','cdi_brand', 'cdi_style', 'cdi_product_type', 'cdi_material', 'cdi_color', 
        'cdi_finish', 'cdi_height', 'cdi_capacity', 'cdi_base', 'cdi_shape', 'cdi_category', 'cdi_brand']

In [146]:
candsets_with_org_attr['rewo_cdi'].replace(np.nan,'',inplace=True)
candsets_with_org_attr['rewo_cdi'][attr_rewo_cdi] = candsets_with_org_attr['rewo_cdi'][attr_rewo_cdi].applymap(lambda x: x.lower().split(', ')).applymap(lambda x: OrderedDict.fromkeys(x).keys()).applymap(lambda x: ' '.join(x).strip())
candsets_with_org_attr['rewo_cdi'].replace('',np.nan,inplace=True)

In [147]:
attr_katom_brand = ['katom_brand', 'katom_style', 'katom_product_type', 'katom_material', 'katom_color', 'katom_finish', 'katom_height', 'katom_capacity', 
        'katom_base', 'katom_shape', 'katom_category', 'katom_brand','rewo_brand', 'rewo_style', 'rewo_product_type', 'rewo_material', 'rewo_color', 
        'rewo_finish', 'rewo_height', 'rewo_capacity', 'rewo_base', 'rewo_shape', 'rewo_category', 'rewo_brand']

In [148]:
candsets_with_org_attr['katom_rewo'].replace(np.nan,'',inplace=True)
candsets_with_org_attr['katom_rewo'][attr_katom_brand] = candsets_with_org_attr['katom_rewo'][attr_katom_brand].applymap(lambda x: x.lower().split(', ')).applymap(lambda x: OrderedDict.fromkeys(x).keys()).applymap(lambda x: ' '.join(x).strip())
candsets_with_org_attr['katom_rewo'].replace('',np.nan,inplace=True)

In [149]:
#for df in candsets_with_org_attr:
#    candsets_with_org_attr[df].to_csv('../../candsets/home/v5/candset_{}.csv'.format(df),index=False)

In [150]:
#candsets_with_org_attr['katom_cdi'][attr_katom_cdi]

In [151]:
candsets_with_org_attr['katom_cdi'][['katom_title','katom_brand','katom_material','cdi_title','cdi_brand','cdi_material']]

Unnamed: 0,katom_title,katom_brand,katom_material,cdi_title,cdi_brand,cdi_material
0,"""Beverage Air BB48GSYF-1-B 48"" (2) Section Bar Refrigerator - Sliding Glass Doors, 115v""@en Doors 115v ""@en","""beverage air""@en",'black' 'galvanized top' 'stainless steel',""" Continental Refrigerator BBC50S-GD Back Bar Cooler ""Continental | Culinary Depot""","""continental refrigerator""",
1,"""Follett ABSPECLEG1 12"" Legs, Stainless Steel""@en Steel ""@en Legs","""follett""@en",,""" Red Goat 30-A-154 Cone Hopper ""Red | Culinary Depot""","""red goat""",
2,"""Winco MXBT-2000Q 20-qt Mixing Bowl - Stainless""@en Stainless ""@en","""winco""@en",,""" Adcraft LAD-6PE 4.666 Oz. Portion Control Ladle - White ""Adcraft | Culinary Depot""","""admiral craft""",'stainless steel'
3,"""Accutemp ACEL-60 2203 Stationary Steam Kettle w/ 60-gal Capacity, Stainless, 220/3 V""@en V ""@en Stainless Capacity","""accutemp""@en",,""" Vulcan PLTRAIL-48 Plate Rail ""Vulcan | Culinary Depot""","""vulcan""",'stainless steel'
4,"""Carlisle 1410FG012 Rectangular Cafeteria Tray - 13-3/4x10-5/8"" Sea Spray""@en Spray ""@en","""carlisle""@en",'cafeteria' 'fiberglass',""" Cambro 1000522 10"" Round Burgundy Wine Fiberglass Camtray ""Cambro | Culinary Depot""","""cambro""",'fiberglass dishwasher safe'
...,...,...,...,...,...,...
1339,"""Cambro UC1000401 10-1/2-gal Ultra Camtainer Beverage Carrier - Insulated, Slate Blue""@en Blue ""@en Insulated","""cambro""@en",'insulated plastic',""" Cambro UC1000401 10 Gallon Slate Blue Ultra Camtainer Beverage Carrier - 25.75"" H x 16.25"" W 20.5"" D ""Cambro | Culinary Depot""","""cambro""",'insulated plastic'
1340,"""Jackson 05930-121-75-66 Cold Water Thermostat For AJ-66, AJ-80, AJX-66, AJX-80, AJ-66T, AJ-100""@en AJ-66T AJ-80 AJ-66 AJX-80 AJ-100 ""@en AJX-66","""jackson""@en",,""" Jackson 05930-121-75-66 Cold Water Thermostat (Aj-66 ""Jackson | Culinary Depot""","""jackson""",
1341,"""Accutemp E64403E120SGL Electric Floor Model Steamer w/ (6) Full Size Pan Capacity, 440v/3ph""@en Capacity 440v/3ph ""@en","""accutemp""@en",,""" AccuTemp E64403E120 SGL Connected Evolution Boilerless ""AccuTemp | Culinary Depot""","""accutemp""",
1342,"""Advance Tabco 94-23-60-24RL 115"" 3-Compartment Sink w/ 20""L x 20""W Bowl, 14"" Deep""@en Bowl Deep ""@en","""advance tabco""@en",'stainless steel overall' 'nsf',""" Advance Tabco 94-23-60-24RL Regaline Sink ""Advance | Culinary Depot""","""advance tabco""",'stainless steel overall' 'nsf'


#### Remove Brand from title if brand attribute provided

In [152]:
import re
# remove text within double quote from string
def keep_text_within_double_quote(string):
    # ensure input is string by converting it to string
    string = str(string)
    # first exclude text within double quotes and ensure no whitespace at the beginning or end
    string = re.sub(r'\"(.*)\"','\\1',string).strip()
    # before returning the string without parentheses also ensure no double whitespaces are in the string
    return re.sub(r'  ',' ',string).strip()

In [153]:
candsets_with_org_attr['katom_cdi']['katom_brand'] = candsets_with_org_attr['katom_cdi']['katom_brand'].apply(lambda s: keep_text_within_double_quote(s).rstrip('@en'))

In [154]:
candsets_with_org_attr['katom_cdi']['katom_title'] = candsets_with_org_attr['katom_cdi'].apply(lambda row: str(row['katom_title']).lower().replace(row['katom_brand'],'') if (row['katom_brand'] in str(row['katom_title']).lower()) else row['katom_title'],axis=1)

In [155]:
candsets_with_org_attr['katom_cdi']['cdi_brand'] = candsets_with_org_attr['katom_cdi']['cdi_brand'].apply(lambda s: keep_text_within_double_quote(s).strip())

In [156]:
candsets_with_org_attr['katom_cdi']['cdi_title'] = candsets_with_org_attr['katom_cdi'].apply(lambda row: str(row['cdi_title']).lower().replace(row['cdi_brand'],'') if (row['cdi_brand'] in str(row['cdi_title']).lower()) else row['cdi_title'],axis=1)

In [158]:
candsets_with_org_attr['katom_rewo']['katom_brand'] = candsets_with_org_attr['katom_rewo']['katom_brand'].apply(lambda s: keep_text_within_double_quote(s).rstrip('@en'))

In [159]:
candsets_with_org_attr['katom_rewo']['rewo_brand'] = candsets_with_org_attr['katom_rewo']['rewo_brand'].apply(lambda s: keep_text_within_double_quote(s).rstrip('@en'))

In [160]:
candsets_with_org_attr['katom_rewo']['katom_title'] = candsets_with_org_attr['katom_rewo'].apply(lambda row: str(row['katom_title']).lower().replace(row['katom_brand'],'') if (row['katom_brand'] in str(row['katom_title']).lower()) else row['katom_title'],axis=1)

In [161]:
candsets_with_org_attr['katom_rewo']['rewo_title'] = candsets_with_org_attr['katom_rewo'].apply(lambda row: str(row['rewo_title']).lower().replace(row['rewo_brand'],'') if (row['rewo_brand'] in str(row['rewo_title']).lower()) else row['rewo_title'],axis=1)

In [162]:
candsets_with_org_attr['rewo_cdi']['cdi_brand'] = candsets_with_org_attr['rewo_cdi']['cdi_brand'].apply(lambda s: keep_text_within_double_quote(s).strip())

In [163]:
candsets_with_org_attr['rewo_cdi']['rewo_brand'] = candsets_with_org_attr['rewo_cdi']['rewo_brand'].apply(lambda s: keep_text_within_double_quote(s).rstrip('@en'))

In [164]:
candsets_with_org_attr['rewo_cdi']['rewo_title'] = candsets_with_org_attr['rewo_cdi'].apply(lambda row: str(row['rewo_title']).lower().replace(row['rewo_brand'],'') if (row['rewo_brand'] in str(row['rewo_title']).lower()) else row['rewo_title'],axis=1)

In [165]:
candsets_with_org_attr['rewo_cdi']['cdi_title'] = candsets_with_org_attr['rewo_cdi'].apply(lambda row: str(row['cdi_title']).lower().replace(row['cdi_brand'],'') if (row['cdi_brand'] in str(row['cdi_title']).lower()) else row['cdi_title'],axis=1)

#### Creating some noise by removing first word (actually second but first real word because starts with ") from rewo_title

In [167]:
#candsets_with_org_attr['katom_cdi']['katom_title'] = candsets_with_org_attr['katom_cdi'].apply(lambda row: row['katom_title'][5:] if (row['label']==1) else row['katom_title'],axis=1)

In [168]:
#candsets_with_org_attr['rewo_cdi']['rewo_title'] = candsets_with_org_attr['rewo_cdi'].apply(lambda row: ' '.join(row['rewo_title'].split()[2:]),axis=1)

In [169]:
#candsets_with_org_attr['katom_rewo']['katom_title'] = candsets_with_org_attr['katom_rewo'].apply(lambda row: row['katom_title'][5:] if (row['label']==1) else row['katom_title'],axis=1)

In [170]:
#candsets_with_org_attr['katom_rewo']['rewo_title'] = candsets_with_org_attr['katom_rewo'].apply(lambda row: ' '.join(row['rewo_title'].split()[2:]),axis=1)

#### Creating some noise by removing first word from katom_title

In [173]:
candsets_with_org_attr['katom_cdi']['katom_title'] = candsets_with_org_attr['katom_cdi'].apply(lambda row: ' '.join(row['katom_title'].split()[2:]),axis=1)

In [176]:
candsets_with_org_attr['katom_rewo']['katom_title'] = candsets_with_org_attr['katom_rewo'].apply(lambda row: ' '.join(row['katom_title'].split()[2:]),axis=1)

#### Creating some noise by removing words from cdi_title

In [178]:
candsets_with_org_attr['katom_cdi']['cdi_title'] = candsets_with_org_attr['katom_cdi'].apply(lambda row: ' '.join(row['cdi_title'].split()[2:]),axis=1)

In [179]:
candsets_with_org_attr['rewo_cdi']['cdi_title'] = candsets_with_org_attr['rewo_cdi'].apply(lambda row: ' '.join(row['cdi_title'].split()[2:]),axis=1)

In [184]:
for df in candsets_with_org_attr:  
    if(any(candsets_with_org_attr[df].columns.str.endswith('blocking_key'))):
        candsets_with_org_attr[df].drop(columns=list(candsets_with_org_attr[df].columns[candsets_with_org_attr[df].columns.str.endswith('blocking_key')]),inplace=True)

In [186]:
for df in candsets_with_org_attr:
    if(any(candsets_with_org_attr[df].columns.str.endswith('description'))):
        candsets_with_org_attr[df].drop(columns=list(candsets_with_org_attr[df].columns[candsets_with_org_attr[df].columns.str.endswith('description')])).to_csv('../../candsets/home/candset_{}.csv'.format(df),index=False)