In [2]:
%load_ext autoreload
%autoreload

import logging
import os
import warnings

from help_functions import *

#### Create Logger Handlers

In [2]:
# Configure the root logger: everything from DEBUG upwards goes to a log file,
# everything from INFO upwards is additionally echoed to the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(lineno)d - %(funcName)s - %(message)s')

# Handlers stay attached to the root logger for the lifetime of the kernel.
# Re-running this cell would therefore stack a second (third, ...) pair of
# handlers and every record would be emitted multiple times. Remove the
# handlers installed by a previous run of this cell (identified by name)
# before attaching fresh ones; handlers owned by anyone else are left alone.
_handler_names = ('notebook_file_handler', 'notebook_console_handler')
for _old_handler in list(logger.handlers):
    if _old_handler.get_name() in _handler_names:
        logger.removeHandler(_old_handler)
        _old_handler.close()

# FileHandler does not create missing directories, so make sure the target
# directory exists instead of failing on a fresh checkout.
log_file = 'logging/CreatingFeatureVectorsOfPotCorr_Authors.log'
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# File handler: full DEBUG-level log.
fh = logging.FileHandler(log_file)
fh.set_name(_handler_names[0])
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Console handler: INFO and above only, to keep the cell output readable.
ch = logging.StreamHandler()
ch.set_name(_handler_names[1])
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

### Read in all candidate sets and store them as Pandas DataFrames in a Dictionary

In [3]:
# Configuration for loading the training candidate sets. All dataset files are
# expected to share the same structure, file-name pattern and separator.

# Names of the ID attributes that occur in the datasets.
lst_of_ids = ['id', 'uri']

# Every dataset has to be a CSV file that uses this separator.
csv_separator = ','

# Pattern the dataset file names have to follow (one capture group).
pattern_of_filename = '(candset_.{3}_.{3,4})_train'

# Directory in which the datasets are stored.
path_to_datasets = '../../candsets/authors/'

# Read the matching files into a dictionary of candidate sets.
candset_dict = readDataInDictionary(
    path_to_datasets,
    pattern_of_filename,
    csv_separator,
)

In [4]:
# Sanity check: display the column names of the 'candset_dbp_dnb' training
# candidate set to verify which attributes were loaded.
candset_dict['candset_dbp_dnb'].columns

Index(['dnb_id', 'dnb_uri', 'dnb_birthdate', 'dnb_deathdate', 'dnb_gender',
       'dnb_name', 'dnb_work', 'dbp_id', 'dbp_uri', 'dbp_birthdate',
       'dbp_deathdate', 'dbp_gender', 'dbp_name', 'dbp_work', 'dbp_link_db',
       'label'],
      dtype='object')

In [5]:
# Derive one data-type schema across all candidate sets. Identifier, link and
# label columns are passed as columns to be removed from the schema.
type_per_column = returnAlignedDataTypeSchema(
    candset_dict,
    lst_of_ids_to_be_removed=['id', 'uri', 'link_db', 'label'],
)

2020-06-12 15:39:32,252 - INFO - 235 - returnAlignedDataTypeSchema - Start with candset_dbp_viaf
2020-06-12 15:39:32,254 - INFO - 84 - getDataTypes - Start detecting datatypes for all columns of dataframe:
2020-06-12 15:39:32,678 - INFO - 138 - getDataTypes - Datatype for Column viaf_birthdate detected: date
2020-06-12 15:39:32,883 - INFO - 138 - getDataTypes - Datatype for Column viaf_deathdate detected: date
2020-06-12 15:39:32,906 - INFO - 107 - getDataTypes - Datatype for Column viaf_gender detected: custom
2020-06-12 15:39:40,778 - INFO - 138 - getDataTypes - Datatype for Column viaf_name detected: str
2020-06-12 15:39:40,829 - INFO - 112 - getDataTypes - Datatype for Column viaf_work detected: long_str with avg_length 15.060646303674192
2020-06-12 15:39:41,079 - INFO - 138 - getDataTypes - Datatype for Column dbp_birthdate detected: date
2020-06-12 15:39:41,190 - INFO - 138 - getDataTypes - Datatype for Column dbp_deathdate detected: date
2020-06-12 15:39:41,201 - INFO - 107 - ge

In [21]:
# Display the schema returned by returnAlignedDataTypeSchema
# (attribute name -> detected data type).
type_per_column

{'name': 'str',
 'birthdate': 'date',
 'work': 'long_str',
 'deathdate': 'date',
 'gender': 'custom'}

In [7]:
# Build the labeled feature vectors for every training candidate set, using
# the aligned data-type schema. 'uri' and 'link_db' columns are ignored and
# 'id' is passed as the identifier.
candset_feature_dict = returnLabeledFeatureVectorsForCandidateSet(
    candset_dict,
    type_per_column,
    columns_to_be_ignored=['uri', 'link_db'],
    identifier='id',
)

2020-06-12 15:41:16,715 - INFO - 982 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-12 15:41:17,975 - INFO - 1005 - createLabeledFeatureVectorForCandidateSets - Common attributes identified!
2020-06-12 15:41:17,976 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_deathdate
2020-06-12 15:41:17,979 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - viaf_deathdate
2020-06-12 15:41:27,191 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_gender
2020-06-12 15:41:27,195 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - viaf_gender
2020-06-12 15:41:27,938 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_birthdate
2020-06-12 15:41:27,941 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - viaf_birthdate
2020-06-12 15:41:35,452 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_name
2020-06-12 15:41:35,454 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - viaf_name
2020-0

2020-06-12 15:42:12,049 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_name
2020-06-12 15:42:12,052 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - dnb_name
2020-06-12 15:42:16,511 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_work
2020-06-12 15:42:16,514 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - dnb_work
2020-06-12 15:42:24,473 - INFO - 1033 - createLabeledFeatureVectorForCandidateSets - 
Finished! Labeled Feature Vectors created for dbp and dnb
2020-06-12 15:42:24,475 - INFO - 982 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-12 15:42:25,593 - INFO - 1005 - createLabeledFeatureVectorForCandidateSets - Common attributes identified!
2020-06-12 15:42:25,595 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_deathdate
2020-06-12 15:42:25,598 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - wiki_deathdate
2020-06-12 15:42:38,985 - INFO - 1028 - createLabeledFeatureVectorF

In [9]:
# Rescale the feature columns whose name ends with 'diff' in every training
# feature DataFrame; 'ids' and 'label' are passed as columns to be dropped.
# NOTE(review): the return value is not used, so this presumably modifies
# candset_feature_dict in place - confirm against help_functions.
rescaleFeatureVectorsInDict(
    candset_feature_dict,
    col_to_be_dropped=['ids', 'label'],
    col_to_be_rescaled_endswith='diff',
)

2020-06-12 15:43:39,068 - INFO - 701 - rescaleFeatureVectorsInDict - Rescaling feature dataframes within the dictionary that end with diff
2020-06-12 15:43:39,090 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-12 15:43:39,107 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_diff. Hence will be rescaled!
2020-06-12 15:43:39,136 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-12 15:43:39,148 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_diff. Hence will be rescaled!
2020-06-12 15:43:39,173 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-12 15:43:39,184 - INFO - 708 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_dif

In [12]:
# Persist every training feature DataFrame as CSV (without the index column).
# The dictionary key becomes part of the file name.
for feature_key, feature_frame in candset_feature_dict.items():
    train_csv_path = '../../candsets/authors/candset_{}_train.csv'.format(feature_key)
    feature_frame.to_csv(train_csv_path, index=False)

### Test Set

In [11]:
# Configuration for loading the test candidate sets. As for the training data,
# all files must share the same structure, file-name pattern and separator.
# Note: this re-assigns the configuration variables used for the training set.

# Names of the ID attributes that occur in the datasets.
lst_of_ids = ['id', 'uri']

# Every dataset has to be a CSV file that uses this separator.
csv_separator = ','

# Pattern the test-set file names have to follow (one capture group).
pattern_of_filename = '(candset_.{3}_.{3,4})_test'

# Directory in which the datasets are stored.
path_to_datasets = '../../candsets/authors/'

# Read the matching files into a dictionary of test candidate sets.
candset_dict_test = readDataInDictionary(
    path_to_datasets,
    pattern_of_filename,
    csv_separator,
)

2020-06-12 16:18:43,035 - INFO - 65 - readDataInDictionary - ../../candsets/authors/candset_dbp_wiki_test.csv is read in and is stored in the dictionary with they key ['candset_dbp_wiki']
2020-06-12 16:18:43,072 - INFO - 65 - readDataInDictionary - ../../candsets/authors/candset_dbp_dnb_test.csv is read in and is stored in the dictionary with they key ['candset_dbp_dnb']
2020-06-12 16:18:43,132 - INFO - 65 - readDataInDictionary - ../../candsets/authors/candset_dbp_viaf_test.csv is read in and is stored in the dictionary with they key ['candset_dbp_viaf']


In [12]:
# Sanity check: display the column names of the 'candset_dbp_dnb' test
# candidate set to verify which attributes were loaded.
candset_dict_test['candset_dbp_dnb'].columns

Index(['dnb_id', 'dnb_uri', 'dnb_birthdate', 'dnb_deathdate', 'dnb_gender',
       'dnb_name', 'dnb_work', 'dbp_id', 'dbp_uri', 'dbp_birthdate',
       'dbp_deathdate', 'dbp_gender', 'dbp_name', 'dbp_work', 'dbp_link_db',
       'label'],
      dtype='object')

In [13]:
# Build the labeled feature vectors for every test candidate set, reusing the
# data-type schema that was derived from the training candidate sets.
# NOTE(review): no_prefix=True is only passed here and not in the
# corresponding training call - confirm that this difference is intended.
candset_feature_dict_test = returnLabeledFeatureVectorsForCandidateSet(
    candset_dict_test,
    type_per_column,
    columns_to_be_ignored=['uri', 'link_db'],
    identifier='id',
    no_prefix=True,
)

2020-06-12 16:18:57,552 - INFO - 982 - createLabeledFeatureVectorForCandidateSets - Start Function
2020-06-12 16:18:57,897 - INFO - 1005 - createLabeledFeatureVectorForCandidateSets - Common attributes identified!
2020-06-12 16:18:57,899 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_deathdate
2020-06-12 16:18:57,901 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - wiki_deathdate
2020-06-12 16:19:01,319 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_gender
2020-06-12 16:19:01,324 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - wiki_gender
2020-06-12 16:19:01,574 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_birthdate
2020-06-12 16:19:01,578 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - wiki_birthdate
2020-06-12 16:19:04,254 - INFO - 1028 - createLabeledFeatureVectorForCandidateSets - dbp_name
2020-06-12 16:19:04,257 - INFO - 1030 - createLabeledFeatureVectorForCandidateSets - wiki_name
2020-0

In [17]:
# Display the feature columns that were generated for the 'feature_dbp_dnb'
# test feature DataFrame.
candset_feature_dict_test['feature_dbp_dnb'].columns

Index(['ids', 'label', 'deathdate_days_sim', 'deathdate_years_sim',
       'deathdate_days_diff', 'gender_lev_sim', 'gender_jac_q3_sim',
       'gender_jac_an_sim', 'gender_rel_jac_an_sim', 'gender_containment_sim',
       'gender_exact_sim', 'name_lev_sim', 'name_jac_q3_sim',
       'name_jac_an_sim', 'name_rel_jac_an_sim', 'name_containment_sim',
       'name_exact_sim', 'work_cosine_tfidf_sim', 'work_lev_sim',
       'work_jac_q3_sim', 'work_jac_an_sim', 'work_rel_jac_an_sim',
       'work_containment_sim', 'work_exact_sim', 'birthdate_days_sim',
       'birthdate_years_sim', 'birthdate_days_diff'],
      dtype='object')

In [19]:
# Rescale the feature columns whose name ends with 'diff' in every test
# feature DataFrame; 'ids' and 'label' are passed as columns to be dropped.
# NOTE(review): the return value is not used, so this presumably modifies
# candset_feature_dict_test in place - confirm against help_functions.
rescaleFeatureVectorsInDict(
    candset_feature_dict_test,
    col_to_be_dropped=['ids', 'label'],
    col_to_be_rescaled_endswith='diff',
)

2020-06-09 15:36:07,117 - INFO - 685 - rescaleFeatureVectorsInDict - Rescaling feature dataframes within the dictionary that end with diff
2020-06-09 15:36:07,132 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-09 15:36:07,146 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_diff. Hence will be rescaled!
2020-06-09 15:36:07,173 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-09 15:36:07,185 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_diff. Hence will be rescaled!
2020-06-09 15:36:07,207 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: deathdate_days_diff. Hence will be rescaled!
2020-06-09 15:36:07,217 - INFO - 692 - rescaleFeatureVectorsInDict - Column that ends with diff found: birthdate_days_dif

In [20]:
# Persist every test feature DataFrame as CSV (without the index column).
# Iterate over the test dictionary itself: looping over the keys of the
# training dictionary (candset_feature_dict) only works while the training
# cells have been executed and both dictionaries happen to share the same
# keys; otherwise it raises a NameError/KeyError or silently skips test sets.
for feature_key, feature_frame in candset_feature_dict_test.items():
    feature_frame.to_csv('../../candsets/authors/candset_{}_test.csv'.format(feature_key), index=False)