In [1]:
%load_ext autoreload
%autoreload

import warnings
import logging
from help_functions import *
import candidates_sampling as cs
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth', None)
import sys
sys.path.append('../')
import support_utils as sup

#### Create Logger Handlers

In [2]:
# Configure the root logger for the notebook: the logger itself lets everything
# from DEBUG upwards through, the console handler below only prints INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(lineno)d - %(funcName)s - %(message)s')

# Optional file handler (disabled): enable to additionally write DEBUG output to a log file.
#fh = logging.FileHandler('logging/CreatingFeatureVectorsOfPotCorr_Books.log')
#fh.setLevel(logging.DEBUG)
#fh.setFormatter(formatter)
#logger.addHandler(fh)

# The root logger outlives a single cell execution. Without this clean-up every
# re-run of the cell would stack another console handler on it and each message
# would be printed once per handler. The handler is therefore named, and any
# earlier handler carrying that name is removed before the new one is attached.
CONSOLE_HANDLER_NAME = 'notebook_console'
for stale_handler in [h for h in logger.handlers if h.get_name() == CONSOLE_HANDLER_NAME]:
    logger.removeHandler(stale_handler)

ch = logging.StreamHandler()
ch.set_name(CONSOLE_HANDLER_NAME)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

### Read in all candidate sets and store them as Pandas DataFrames in a Dictionary

In [3]:
# Location and naming scheme of the candidate-set files; all of them are
# expected to share the same structure.
path_to_datasets = '../../candsets/books/'  # directory in which the datasets are stored
pattern_of_filename = 'candset_(.{2,4}_.{2,4})'  # regex that every dataset file name has to follow
csv_separator = ','  # every dataset must be a CSV file using this same separator
#lst_of_ids = ['id','uri']  # provide the names of the IDs that are in the datasets

# Read every matching file into a dictionary; judging by the log output, the part
# captured by the regex group (e.g. 'bx_wor') is used as the dictionary key.
candset_dict = readDataInDictionary(
    path_to_datasets,
    pattern_of_filename,
    csv_separator,
)

2020-08-04 09:22:48,124 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_bx_wor.csv is read in and is stored in the dictionary with they key ['bx_wor']
2020-08-04 09:22:48,141 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_bx_half.csv is read in and is stored in the dictionary with they key ['bx_half']
2020-08-04 09:22:48,156 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_ban_wor.csv is read in and is stored in the dictionary with they key ['ban_wor']
2020-08-04 09:22:48,168 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_ban_bx.csv is read in and is stored in the dictionary with they key ['ban_bx']
2020-08-04 09:22:48,186 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_wor_half.csv is read in and is stored in the dictionary with they key ['wor_half']
2020-08-04 09:22:48,200 - INFO - 53 - readDataInDictionary - ../../candsets/books/candset_ban_half.csv is read in and is stored in the dictionary w

### Calculate features for all datasets

In [33]:
# Detect one aligned data type per attribute across all candidate sets.
# The columns listed here are excluded from the schema (presumably because they
# are identifiers / the label rather than descriptive attributes - TODO confirm).
type_per_column = returnAlignedDataTypeSchema(
    candset_dict,
    lst_of_ids_to_be_removed=['id', 'label', 'isbn'],
)

2020-06-15 15:09:39,497 - INFO - 256 - returnAlignedDataTypeSchema - Start with bx_wor
2020-06-15 15:09:39,499 - INFO - 84 - getDataTypes - Start detecting datatypes for all columns of dataframe:
2020-06-15 15:09:49,157 - INFO - 140 - getDataTypes - Datatype for Column bx_author detected: str
2020-06-15 15:09:58,967 - INFO - 140 - getDataTypes - Datatype for Column wor_author detected: str
2020-06-15 15:09:58,971 - INFO - 109 - getDataTypes - Datatype for Column bx_binding detected: custom
2020-06-15 15:09:58,983 - INFO - 109 - getDataTypes - Datatype for Column wor_binding detected: custom
2020-06-15 15:10:08,763 - INFO - 140 - getDataTypes - Datatype for Column bx_pages detected: num
2020-06-15 15:10:18,461 - INFO - 140 - getDataTypes - Datatype for Column wor_pages detected: num
2020-06-15 15:10:22,732 - INFO - 140 - getDataTypes - Datatype for Column bx_pubdate detected: date
2020-06-15 15:10:23,042 - INFO - 140 - getDataTypes - Datatype for Column wor_pubdate detected: date
2020-0

2020-06-15 15:11:37,294 - INFO - 140 - getDataTypes - Datatype for Column ban_title detected: str
2020-06-15 15:11:38,283 - INFO - 140 - getDataTypes - Datatype for Column half_title detected: str


In [34]:
# Display the aligned schema: a mapping from attribute name to its detected data type.
type_per_column

{'author': 'str',
 'binding': 'custom',
 'pages': 'num',
 'pubdate': 'date',
 'publisher': 'str',
 'title': 'str'}

In [35]:
# Persist the detected schema; according to the printed confirmation the helper
# appends the '.json' extension to the given path itself.
sup.saveResultsToJSON(
    type_per_column,
    '../input/books_type_per_column',
)

Saved in ../input/books_type_per_column.json


In [36]:
# Build the labeled feature vectors for every candidate set in the dictionary.
# Ignored columns are all columns that are not related to any single attribute.
# NOTE(review): the ignore list uses 'ids' while the identifier is 'id' and the
# schema-detection cell removes 'id' - confirm that 'ids' is the intended column name.
candset_feature_dict = returnLabeledFeatureVectorsForCandidateSet(
    candset_dict,
    type_per_column,
    columns_to_be_ignored=['ids', 'label', 'isbn'],
    identifier='id',
)

2020-06-15 15:25:03,678 - INFO - 961 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-15 15:25:05,096 - INFO - 984 - createLabeledFeatureVectorForCandidateSets - Common attributes identified!
2020-06-15 15:25:05,097 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - bx_title
2020-06-15 15:25:05,099 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - wor_title
2020-06-15 15:25:12,291 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - bx_author
2020-06-15 15:25:12,295 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - wor_author
2020-06-15 15:25:18,555 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - bx_pages
2020-06-15 15:25:18,558 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - wor_pages
2020-06-15 15:25:19,285 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - bx_pubdate
2020-06-15 15:25:19,289 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - wor_pubdate
2020-06-15 15:25:29,920 -

2020-06-15 15:25:47,813 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - half_pages
2020-06-15 15:25:47,932 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - wor_pubdate
2020-06-15 15:25:47,935 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - half_pubdate
2020-06-15 15:25:49,249 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - wor_binding
2020-06-15 15:25:49,252 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - half_binding
2020-06-15 15:25:49,363 - INFO - 1007 - createLabeledFeatureVectorForCandidateSets - wor_publisher
2020-06-15 15:25:49,365 - INFO - 1009 - createLabeledFeatureVectorForCandidateSets - half_publisher
2020-06-15 15:25:50,533 - INFO - 1012 - createLabeledFeatureVectorForCandidateSets - 
Finished! Labeled Feature Vectors created for wor and half
2020-06-15 15:25:50,537 - INFO - 961 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-15 15:25:50,677 - INFO - 984 - createLabeledFeatureVectorForCan

### Rescale the features

In [39]:
# Rescale the feature columns of every frame in the dictionary, leaving the
# 'ids' and 'label' columns out of the rescaling.
# NOTE(review): according to the log output of this cell the frames are rescaled
# in place to [0,1] and columns ending in 'diff' are additionally reversed
# (1 - rescaled score). Re-running this cell without re-creating
# `candset_feature_dict` first would presumably reverse those columns a second
# time - TODO confirm and run it exactly once per feature-creation run.
rescaleFeatureVectorsInDict(candset_feature_dict,col_to_be_ignored=['ids', 'label'])

2020-06-15 15:27:51,849 - INFO - 1043 - rescaleFeatureVectorsInDict - Rescaling feature dataframes within the dictionary
2020-06-15 15:27:51,852 - INFO - 1023 - rescaleFeatureVectors - Rescaling features to be in range [0,1] and for features that end with diff we additionally reverse the score
2020-06-15 15:27:51,896 - INFO - 1029 - rescaleFeatureVectors - Column pages_num_abs_diff will additionally be reversed so 1 - rescaled score
2020-06-15 15:27:51,908 - INFO - 1029 - rescaleFeatureVectors - Column pubdate_days_diff will additionally be reversed so 1 - rescaled score
2020-06-15 15:27:51,915 - INFO - 1029 - rescaleFeatureVectors - Column pubdate_months_diff will additionally be reversed so 1 - rescaled score
2020-06-15 15:27:51,921 - INFO - 1029 - rescaleFeatureVectors - Column pubdate_years_diff will additionally be reversed so 1 - rescaled score
2020-06-15 15:27:51,941 - INFO - 1033 - rescaleFeatureVectors - All features from the frature_vector dataframe are now rescaled inplace
2

In [41]:
# Write every feature frame to its own CSV file (row index not included).
# NOTE(review): the target directory and file-name scheme are the same ones the
# loading cell reads from ('../../candsets/books/', 'candset_<key>.csv'), so this
# looks like it replaces the files that were read in - confirm that is intended.
for pair_key, feature_frame in candset_feature_dict.items():
    feature_frame.to_csv(f'../../candsets/books/candset_{pair_key}.csv',index=False)

### Create the train/test split

In [42]:
# Split every labeled feature frame into a stratified train and test part,
# keep both parts in dictionaries keyed like `candset_feature_dict`, and write
# each part to its own CSV file.
random_state = 42  # fixed seed so the split is reproducible
test_size = 0.33  # share of each candidate set that goes into the test part
candset_dict_train, candset_dict_test = {}, {}

for pair_key, labeled_features in candset_feature_dict.items():
    features = labeled_features.drop(columns='label')
    labels = labeled_features['label']
    X_train, X_test, y_train, y_test = train_test_split(
        features.copy(),
        labels.copy(),
        test_size=test_size,
        random_state=random_state,
        stratify=labels,  # keep the label distribution equal in both parts
    )

    # Re-attach the labels as the last column; alignment happens on the row
    # index, which still carries the positions from the unsplit frame.
    train_frame = X_train.assign(label=y_train)
    test_frame = X_test.assign(label=y_test)

    # The dictionaries hold the frames with a fresh 0..n-1 index.
    candset_dict_train[pair_key] = train_frame.reset_index(drop=True)
    candset_dict_test[pair_key] = test_frame.reset_index(drop=True)

    # The CSV files are written without the index column.
    train_frame.to_csv(f'../../candsets/books/candset_{pair_key}_train.csv',index=False)
    test_frame.to_csv(f'../../candsets/books/candset_{pair_key}_test.csv',index=False)