In [1]:
%load_ext autoreload
%autoreload

import warnings
import logging
from help_functions import *
import candidates_sampling as cs
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth', None)
import sys
sys.path.append('../')
import support_utils as sup

### Create Logger Handlers

In [2]:
# Configure the root logger so that records emitted by the helper modules are
# shown in the notebook output with timestamp, level, line number and function.
CONSOLE_HANDLER_NAME = 'notebook_console'

logger = logging.getLogger()
# The root logger lets everything through; each handler applies its own level.
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(lineno)d - %(funcName)s - %(message)s')

# Optional file logging (disabled). Uncomment to also write DEBUG records to a file.
#fh = logging.FileHandler('logging/CreatingFeatureVectorsOfPotCorr_kitchen.log')
#fh.setLevel(logging.DEBUG)
#fh.setFormatter(formatter)
#logger.addHandler(fh)

# Re-running this cell must not stack another console handler on the root
# logger (every extra handler would print each record one more time), so the
# handler installed by a previous run is removed first.
for stale_handler in [h for h in logger.handlers if h.name == CONSOLE_HANDLER_NAME]:
    logger.removeHandler(stale_handler)

ch = logging.StreamHandler()
ch.set_name(CONSOLE_HANDLER_NAME)
ch.setLevel(logging.INFO)  # console only shows INFO and above
ch.setFormatter(formatter)
logger.addHandler(ch)

### Read in all candidate sets and store them as Pandas DataFrames in a Dictionary

In [3]:
# Load every candidate set found in the dataset directory into candset_dict.
# All datasets are expected to follow the same structure.
path_to_datasets='../../candsets/kitchen/' # directory where the dataset files are stored
pattern_of_filename = 'candset_(.{4,5}_.{3,4})' # regex that the dataset file names are expected to follow
csv_separator = ',' # all datasets must be CSV files sharing this separator
#lst_of_ids = ['id','uri']  # names of the ID columns in the datasets (currently unused)
candset_dict = readDataInDictionary(path_to_datasets, pattern_of_filename, csv_separator)

2020-08-04 09:22:25,089 - INFO - 53 - readDataInDictionary - ../../candsets/kitchen/candset_katom_rewo.csv is read in and is stored in the dictionary with they key ['katom_rewo']
2020-08-04 09:22:25,117 - INFO - 53 - readDataInDictionary - ../../candsets/kitchen/candset_katom_cdi.csv is read in and is stored in the dictionary with they key ['katom_cdi']
2020-08-04 09:22:25,196 - INFO - 53 - readDataInDictionary - ../../candsets/kitchen/candset_rewo_cdi.csv is read in and is stored in the dictionary with they key ['rewo_cdi']


In [22]:
# Display the keys under which the candidate sets were stored.
candset_dict.keys()

dict_keys(['katom_rewo', 'katom_cdi', 'rewo_cdi'])

In [24]:
# Sanity check: shape of the 'katom_rewo' candidate set.
candset_dict['katom_rewo'].shape

(6457, 30)

### Calculate features for all datasets

In [25]:
# Derive one aligned data-type schema across all candidate sets; the names in
# lst_of_ids_to_be_removed are handed to the helper for exclusion.
# NOTE(review): this list uses 'id', whereas the feature-vector and rescaling
# cells further down pass 'ids' — confirm which spelling the helpers expect.
type_per_column = returnAlignedDataTypeSchema(candset_dict,lst_of_ids_to_be_removed=['id','label','cluster_id'])

2020-06-15 21:10:36,055 - INFO - 256 - returnAlignedDataTypeSchema - Start with katom_rewo
2020-06-15 21:10:36,058 - INFO - 84 - getDataTypes - Start detecting datatypes for all columns of dataframe:
2020-06-15 21:10:37,806 - INFO - 140 - getDataTypes - Datatype for Column katom_base detected: str
2020-06-15 21:10:39,937 - INFO - 140 - getDataTypes - Datatype for Column rewo_base detected: str
2020-06-15 21:10:43,517 - INFO - 140 - getDataTypes - Datatype for Column katom_brand detected: str
2020-06-15 21:10:46,614 - INFO - 140 - getDataTypes - Datatype for Column rewo_brand detected: str
2020-06-15 21:10:48,117 - INFO - 140 - getDataTypes - Datatype for Column katom_capacity detected: str
2020-06-15 21:10:49,657 - INFO - 140 - getDataTypes - Datatype for Column rewo_capacity detected: str
2020-06-15 21:10:50,516 - INFO - 140 - getDataTypes - Datatype for Column katom_category detected: str
2020-06-15 21:10:50,847 - INFO - 140 - getDataTypes - Datatype for Column rewo_category detected

In [26]:
# Display the detected schema returned by returnAlignedDataTypeSchema.
type_per_column

{'base': 'str',
 'brand': 'str',
 'capacity': 'str',
 'category': 'str',
 'color': 'str',
 'finish': 'str',
 'height': 'str',
 'material': 'str',
 'product_type': 'str',
 'shape': 'str',
 'style': 'long_str',
 'title': 'long_str'}

In [27]:
# Persist the schema through the support_utils helper.
# NOTE(review): the recorded output of this cell reports
# '../input/home_type_per_column.json', which does not match the path passed
# here — the stored output looks stale; re-run to confirm.
sup.saveResultsToJSON(type_per_column,'../input/kitchen_type_per_column')

Saved in ../input/home_type_per_column.json


In [28]:
# Build the labeled feature vectors for every candidate set using the schema
# in type_per_column.
# Columns to be ignored are all columns that are not related to any single attribute.
# NOTE(review): 'ids' is passed here while identifier='id' and the schema cell
# above uses 'id' — presumably intentional, but verify against the helper.
candset_feature_dict = returnLabeledFeatureVectorsForCandidateSet(candset_dict, type_per_column, columns_to_be_ignored=['ids','label','cluster_id'], identifier='id')

2020-06-15 21:12:52,839 - INFO - 961 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-15 21:12:53,379 - INFO - 984 - createLabeledFeatureVectorForCandidateSets - Common attributes identified!
2020-06-15 21:12:53,381 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - katom_finish
2020-06-15 21:12:53,384 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - rewo_finish
2020-06-15 21:12:55,222 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - katom_shape
2020-06-15 21:12:55,224 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - rewo_shape
2020-06-15 21:12:56,765 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - katom_color
2020-06-15 21:12:56,767 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - rewo_color
2020-06-15 21:12:58,643 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - katom_base
2020-06-15 21:12:58,646 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - rewo_base
2020-06-15 21:1

2020-06-15 21:13:50,937 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - rewo_material
2020-06-15 21:13:50,940 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - cdi_material
2020-06-15 21:13:52,534 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - rewo_brand
2020-06-15 21:13:52,537 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - cdi_brand
2020-06-15 21:13:54,034 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - rewo_product_type
2020-06-15 21:13:54,037 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - cdi_product_type
2020-06-15 21:13:55,616 - INFO - 1012 - createLabeledFeatureVectorForCandidateSets - 
Finished! Labeled Feature Vectors created for rewo and cdi
2020-06-15 21:13:55,618 - INFO - 939 - returnLabeledFeatureVectorsForCandidateSet - 
Finished! All labeled feature vectors are created for all dataset combinations


### Rescale the features

In [32]:
# Rescale the feature dataframes stored in candset_feature_dict, skipping the
# 'ids' and 'label' columns. No return value is captured; the recorded log
# states that the frames are rescaled in place.
# NOTE(review): because the log also mentions reversing features that end with
# 'diff', re-running this cell on already-rescaled frames may not be
# idempotent — TODO confirm before re-executing without rebuilding the dict.
rescaleFeatureVectorsInDict(candset_feature_dict,col_to_be_ignored=['ids', 'label'])

2020-06-15 21:16:39,006 - INFO - 1043 - rescaleFeatureVectorsInDict - Rescaling feature dataframes within the dictionary
2020-06-15 21:16:39,009 - INFO - 1023 - rescaleFeatureVectors - Rescaling features to be in range [0,1] and for features that end with diff we additionally reverse the score
2020-06-15 21:16:39,184 - INFO - 1033 - rescaleFeatureVectors - All features from the frature_vector dataframe are now rescaled inplace
2020-06-15 21:16:39,186 - INFO - 1023 - rescaleFeatureVectors - Rescaling features to be in range [0,1] and for features that end with diff we additionally reverse the score
2020-06-15 21:16:39,270 - INFO - 1033 - rescaleFeatureVectors - All features from the frature_vector dataframe are now rescaled inplace
2020-06-15 21:16:39,272 - INFO - 1023 - rescaleFeatureVectors - Rescaling features to be in range [0,1] and for features that end with diff we additionally reverse the score
2020-06-15 21:16:39,460 - INFO - 1033 - rescaleFeatureVectors - All features from the

In [34]:
# Write every feature dataframe to its own CSV file (without the index);
# `df` is the dictionary key and becomes part of the file name.
for df, feature_frame in candset_feature_dict.items():
    csv_path = '../../candsets/kitchen/candset_{}.csv'.format(df)
    feature_frame.to_csv(csv_path, index=False)

### Create the train/test split

In [35]:
# Stratified train/test split of every feature dataframe. Both parts are kept
# in dictionaries (keyed like candset_feature_dict) and written to CSV.
random_state = 42
test_size = 0.33
candset_dict_train = {}
candset_dict_test = {}
for df, feature_frame in candset_feature_dict.items():
    # Separate the target column from the features.
    y = feature_frame['label']
    X = feature_frame.drop(columns='label')
    X_train, X_test, y_train, y_test = train_test_split(
        X.copy(),
        y.copy(),
        test_size=test_size,
        random_state=random_state,
        stratify=y,  # keep the label distribution in both parts
    )

    # Re-attach the labels to each part.
    X_train['label'] = y_train.copy()
    X_test['label'] = y_test.copy()

    # Keep re-indexed copies in memory.
    candset_dict_train[df] = X_train.reset_index(drop=True)
    candset_dict_test[df] = X_test.reset_index(drop=True)

    # Persist both parts next to the candidate sets.
    X_train.to_csv(f'../../candsets/kitchen/candset_{df}_train.csv', index=False)
    X_test.to_csv(f'../../candsets/kitchen/candset_{df}_test.csv', index=False)

In [36]:
# Display the keys of the training-split dictionary.
candset_dict_train.keys()

dict_keys(['feature_katom_rewo', 'feature_katom_cdi', 'feature_rewo_cdi'])