## Entity Matching Example

Using FEBRL synthetic data

### Load Dependencies

In [None]:
import os
import math
import logging
from datetime import datetime
from typing import Optional, List, Dict

import numpy as np
import pandas as pd

from pyent.datasets import generate_febrl_data, remove_nan
from pyent.datasets import train_test_validate_stratified_split as ttvs
from pyent.features import generate_textual_features
from pyent.train import train_txt_baseline

# Lift pandas' display limits so DataFrames are rendered in full
# (None = no cap on the number of columns / rows shown).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# IPython magics (notebook only, not plain Python):
#   - draw matplotlib figures inline in the notebook output
#   - load the autoreload extension and, with mode 2, re-import modules
#     before each cell executes so edits to imported code are picked up
#     without restarting the kernel.
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Generate Synthetic Data 

In [None]:
# Generate the synthetic FEBRL data with a fixed seed (init_seed=2), then
# pass it through remove_nan (presumably drops records with missing
# values -- confirm against pyent.datasets).
febrl_raw = generate_febrl_data(init_seed=2)
master_df = remove_nan(febrl_raw)

In [None]:
# Show how many records fall under each value of the "labels" column.
master_df["labels"].value_counts()

### Split Data into Development and Test Sets

In [None]:
# Features are every column except "labels"; the target is "labels" itself.
is_feature_column = master_df.columns != "labels"
X = master_df.loc[:, is_feature_column]
y = master_df["labels"]

# Split into train / test / validation partitions. test_size and
# validate_size are passed as 0.1 and 0.2 (presumably fractions of the
# data, stratified on the targets -- confirm against pyent.datasets).
X_train, X_test, X_val, y_train, y_test, y_val = ttvs(
    features=X,
    targets=y,
    test_size=0.1,
    validate_size=0.2,
)


### Generate Textual Features

In [None]:
# Derive the textual features for each partition independently.
X_train_txt = generate_textual_features(X_train)
X_test_txt = generate_textual_features(X_test)
X_val_txt = generate_textual_features(X_val)

# Sanity check: each feature set should line up with the length of its
# target vector. Adjacent f-string literals are concatenated into a single
# message, so the output is still three lines.
print(
    f"Train feature set shape: {X_train_txt.shape} and Train target shape {len(y_train)}\n"
    f"Test feature set shape: {X_test_txt.shape} and Test target shape {len(y_test)}\n"
    f"Validation feature set shape: {X_val_txt.shape} and Validation target shape {len(y_val)}"
)

### Develop Transformer based Siamese Neural Network Model as Baseline Model

To start, for this model we can just look at the `sentence_l` and `sentence_r` _"textual"_ features we generated as shown above.

<!-- 
![example_siamese](../docs/example_siamese.png)
<h6>Image Obtained from Quora Blog Post: https://quoraengineering.quora.com/</h6>  
 -->
  
1. DistilRoBERTa base model from Hugging Face
2. for negative pairs (i.e. target variables with negative class labels) the margin = 0.5
3. as distance metric we use cosine distance (1-cosine_similarity)


In [None]:
# Train the text-only baseline, handing over the train, test and validation
# partitions as (features, targets) pairs in that order.
train_txt_baseline(
    X_train_txt, y_train,
    X_test_txt, y_test,
    X_val_txt, y_val,
)

----

## Acknowledgements

```bibtex 
@inproceedings{reimers-2019-sentence-bert,
    title     = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author    = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month     = "11",
    year      = "2019",
    publisher = "Association for Computational Linguistics",
    url       = "https://arxiv.org/abs/1908.10084",
}
```
  
```bibtex  
@software{de_bruin_j_2019_3559043,
  author       = "De Bruin, J",
  title        = "Python Record Linkage Toolkit: A toolkit for record linkage and duplicate detection in Python",
  month        = "12",
  year         = "2019",
  publisher    = "Zenodo",
  version      = "v0.14",
  doi          = "10.5281/zenodo.3559043",
  url          = "https://doi.org/10.5281/zenodo.3559043"
}
```




In [None]:
# ## df_in is a pandas DataFrame with all the required columns

# block_vars = ['area', 'rooms', 'bathrooms', 'garages', 'stratum', 'type']
# compare_vars = [
#             String('description', 'description', method='lcs',
#                    label='description', threshold=0.95),
#             Numeric('originPrice', 'originPrice', method='gauss',
#                     label='originPrice', offset=0.2, scale=0.2),
#             Geographic('latitude', 'longitude', 'latitude', 'longitude',
#                        method='gauss', offset=0.2, label='location')
#             ]
# indexer = rl.index.Block(block_vars)
# candidate_links = indexer.index(df_in)
# njobs = 8

# ## This is the part that takes hours
# comparer = rl.Compare(compare_vars, n_jobs=njobs)
# compare_vectors = comparer.compute(pairs=candidate_links, x=df_in)

# ## Model training doesn't take too long
# ecm = rl.ECMClassifier(binarize=0.5)
# ecm.fit(compare_vectors)
# pairs_ecm = ecm.predict(compare_vectors)

In [None]:
# from torch.utils.data import DataLoader
# from sentence_transformers import losses
# from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
# from sentence_transformers.readers import InputExample


# ############################################################
# logging.basicConfig(format='%(asctime)s - %(message)s',
#                     datefmt='%Y-%m-%d %H:%M:%S',
#                     level=logging.INFO,
#                     handlers=[LoggingHandler()])
# logger = logging.getLogger(__name__)
# ############################################################

# # prepare data splits for algorithm
# X_train_txt['target'] = np.where(y_train == "match", 1, 0)
# X_test_txt['target'] = np.where(y_test == "match", 1, 0)
# X_val_txt['target'] = np.where(y_val == "match", 1, 0)


# # parameters and configs for training
# model_name = 'bert-base-uncased'
# num_epochs = 1
# train_batch_size = 64
# margin = 0.5
# model_save_path = '../output/models/{}-bsz-{}-ep-{}-{}'.format(
#     model_name, 
#     train_batch_size,
#     num_epochs,
#     datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# )
# os.makedirs(model_save_path, exist_ok=True)
# distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
# model = SentenceTransformer(model_name)


# # create train and test sample
# train_samples = []
# for row in  X_train_txt.iterrows():
#     if row[1]['target'] == 1:
#         train_samples.append(
#             InputExample(
#                 texts=[
#                     row[1]['sentence_l'], 
#                     row[1]['sentence_r']
#                 ], 
#                 label=int(row[1]['target'])
#             )
#         )
#         train_samples.append(
#             InputExample(
#                 texts=[
#                     row[1]['sentence_r'], 
#                     row[1]['sentence_l']
#                 ], 
#                 label=int(row[1]['target'])
#             )
#         )
#     else:
#         train_samples.append(
#             InputExample(
#                 texts=[
#                     row[1]['sentence_l'], 
#                     row[1]['sentence_r']
#                 ], 
#                 label=int(row[1]['target'])
#             )
#         )

# # initialize data loader and loss definition
# train_dataloader = DataLoader(
#     train_samples, 
#     shuffle=True, 
#     batch_size=train_batch_size
# )

# train_loss = losses.OnlineContrastiveLoss(
#     model=model, 
#     distance_metric=distance_metric, 
#     margin=margin
# )

# evaluators = []

# dev_sentences1 = []
# dev_sentences2 = []
# dev_labels = []
# for row in X_val_txt.iterrows():
#     dev_sentences1.append(row[1]['sentence_l'])
#     dev_sentences2.append(row[1]['sentence_r'])
#     dev_labels.append(int(row[1]['target']))

# binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
#     sentences1=dev_sentences1, 
#     sentences2=dev_sentences2, 
#     labels=dev_labels
# )
# evaluators.append(binary_acc_evaluator)

# # This SequentialEvaluator runs all other evaluators if/when added 
# seq_evaluator = evaluation.SequentialEvaluator(
#     evaluators=evaluators, 
#     main_score_function=lambda scores: scores[-1]
# )

# logger.info("Evaluate model without training")
# seq_evaluator(
#     model=model, 
#     epoch=0, 
#     steps=0, 
#     output_path=model_save_path
# )

# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     evaluator=seq_evaluator,
#     epochs=num_epochs,
#     use_amp=True,
#     warmup_steps=500,
#     output_path=model_save_path,
#     show_progress_bar=True
# )

# bi_encoder = SentenceTransformer(model_save_path)

# test_sentence_l = X_test_txt.sentence_l.tolist()
# test_sentence_r = X_test_txt.sentence_r.tolist()
# test_target = X_test_txt.target.tolist()

# test_eval = evaluation.BinaryClassificationEvaluator(
#     sentences1=test_sentence_l,
#     sentences2=test_sentence_r,
#     labels=test_target,
#     name=f"test_evaluator_{os.path.basename(model_save_path)}",
#     batch_size=32,
#     write_csv=True,
#     show_progress_bar=True
# )

# test_pref_metrics = test_eval.compute_metrices(bi_encoder)
# acc, acc_threshold = test_eval(bi_encoder).find_best_acc_and_threshold()
# f1, precision, recall, f1_threshold = test_eval(bi_encoder).find_best_f1_and_threshold()