## INSTALL DEPENDENCIES

### FOR MAC

In [None]:
%pip install torch

### FOR CUDA (GPU)

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

### GENERAL

In [127]:
%pip install presidio_analyzer
%pip install presidio_anonymizer
%pip install transformers
%pip install pandas
%pip install spacy
%pip install spacy-transformers
%pip install tabulate
%pip install multiprocess

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting multiprocess
  Using cached multiprocess-0.70.14-py39-none-any.whl (132 kB)
Collecting dill>=0.3.6
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Installing collected packages: dill, multiprocess
Successfully installed dill-0.3.6 multiprocess-0.70.14
Note: you may need to restart the kernel to use updated packages.


### INSTALL SIMPLE SPACY MODEL

In [None]:
!python -m spacy download en_core_web_sm

### INSTALL COMPLEX SPACY MODEL (ONLY IF YOU USE THIS INSTEAD OF BERT)

In [None]:
!python -m spacy download en_core_web_lg

## IMPORTS

In [33]:
from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.nlp_engine import NlpArtifacts
from presidio_anonymizer.entities import OperatorConfig
import pandas as pd
from transformers_rec import (
    TransformersRecognizer,
    BERT_DEID_CONFIGURATION,
)
from typing import List, Iterator, Tuple
import spacy
from spacy import displacy
import csv
import json
from tqdm import tqdm
import re
import warnings
from tabulate import tabulate
import os 
import time
from datetime import datetime
import psutil
import multiprocess  as mp


## TESTING CUDA FOR GPU 

In [None]:
import torch
torch.cuda.is_available()

## CREATE ANALYZER AND ANONYMIZE FUNCTION

In [11]:
def analyzer_engine(model_path):
    """Build a Presidio ``AnalyzerEngine`` for the requested NER model.

    :param model_path: Which model to use for NER:
        "obi/deid_roberta_i2b2" (Hugging Face model wrapped in a
        ``TransformersRecognizer``), or
        "en_core_web_lg" (spaCy only, no transformers recognizer).
        Any other value is treated as a Hugging Face model path.
    :return: an ``AnalyzerEngine`` whose registry holds the predefined
        recognizers plus, for non-spaCy models, the transformers recognizer.
    """
    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers()

    spacy_only = model_path == "en_core_web_lg"
    if not spacy_only:
        # The transformers model is plugged in as a recognizer, not as the
        # NlpEngine, so the small spaCy model below is enough alongside it.
        hf_recognizer = TransformersRecognizer(model_path=model_path)
        # NOTE(review): only the obi model gets load_transformer(); confirm
        # other model paths do not need an explicit configuration as well.
        if model_path == "obi/deid_roberta_i2b2":
            hf_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
        recognizers.add_recognizer(hf_recognizer)

    spacy_model = "en_core_web_lg" if spacy_only else "en_core_web_sm"
    engine_config = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model}],
    }
    provider = NlpEngineProvider(nlp_configuration=engine_config)
    return AnalyzerEngine(nlp_engine=provider.create_engine(), registry=recognizers)

In [12]:
def analyze(analyzer, **kwargs):
    """Run ``analyzer.analyze`` with the given keyword arguments.

    The ``entities`` argument is normalised first: when it is missing,
    explicitly ``None``, or contains the sentinel ``"All"``, it is forwarded
    as ``None``.

    :param analyzer: object exposing an ``analyze(**kwargs)`` method.
    :param kwargs: forwarded verbatim (apart from ``entities``) to
        ``analyzer.analyze``.
    :return: whatever ``analyzer.analyze`` returns.
    """
    requested = kwargs.get("entities")
    # Guard against an explicit entities=None, which previously crashed on
    # the `"All" in None` membership test.
    if requested is None or "All" in requested:
        kwargs["entities"] = None
    return analyzer.analyze(**kwargs)

In [None]:
def anonymize(text: str, analyze_results: List[RecognizerResult]):
    """Pass analyzer results through the Presidio Anonymizer without altering the text.

    Every entity is handled by a "custom" operator whose lambda returns its
    input unchanged. Callers in this notebook use the returned object's
    ``.items`` as the overlap-resolved list of entities.

    :param text: Full text
    :param analyze_results: list of results from presidio analyzer engine
    :return: the result object produced by ``AnonymizerEngine.anonymize``
    """
    engine = AnonymizerEngine()
    keep_as_is = OperatorConfig("custom", {"lambda": lambda x: x})
    return engine.anonymize(
        text,
        analyze_results,
        operators={"DEFAULT": keep_as_is},
    )

## INITIAL CONFIG FOR THE ANALYZER AND MODEL

In [13]:
# Module-level analyzer shared by the processing functions defined further down
# (process_review / process_review2 read this global directly).
analyzer = analyzer_engine("obi/deid_roberta_i2b2") # "en_core_web_lg" or "obi/deid_roberta_i2b2"

In [6]:
# Passed as `score_threshold` to the analyzer calls below.
threshold = 0.40
# Entity types requested from the analyzer.
entities = ["PERSON", "LOCATION", "PHONE_NUMBER", "EMAIL_ADDRESS","CREDIT_CARD", "US_SSN"]
# Column names applied to the input CSV when it is read (names=columns).
columns = ["SITE_URL","PVID","REVIEW"]
# Positional index of the review-text column inside each row.
number_column_review = 2
# When True, analyzer results are routed through anonymize() before being stored.
check_overlaps=False

In [14]:
def test(x): 
  return x*x

In [35]:
def f(x):
    """Square ``x``; used as the worker function for the pool smoke test."""
    product = x * x
    return product

# Smoke test: confirm that a 5-worker `multiprocess` pool can map a function
# defined in the notebook over a small list.
with mp.Pool(5) as pool:
    print(pool.map(f, [1, 2, 3, 4, 5]))

[1, 4, 9, 16, 25]


In [45]:
def create_obj(an_r, text):
    """Convert analyzer results into a list of plain entity dicts.

    :param an_r: iterable of results exposing ``to_dict()`` with the keys
        ``start``, ``end``, ``score`` and ``entity_type``.
    :param text: the analysed text; sliced with each result's offsets.
    :return: list of dicts with keys ``start``, ``end``, ``confidence``,
        ``entity`` and ``text``, in the same order as ``an_r``.
    """
    def _as_entity(found):
        fields = found.to_dict()
        begin, stop = fields["start"], fields["end"]
        return {
            "start": begin,
            "end": stop,
            "confidence": fields["score"],
            "entity": fields["entity_type"],
            "text": text[begin:stop],
        }

    return [_as_entity(found) for found in an_r]


def model_results(input_path, output_path, entities, threshold, analyzer,columns, number_column_review, check_overlaps=False):
    """Analyse every review of a CSV file in one batch and dump the entities to JSON.

    :param input_path: CSV file to read (decoded as ISO-8859-1; the first row
        is treated as a header and replaced by ``columns``).
    :param output_path: JSON file to write.
    :param entities: entity types requested from the analyzer.
    :param threshold: passed to the analyzer as ``score_threshold``.
    :param analyzer: Presidio ``AnalyzerEngine`` to run.
    :param columns: column names applied to the CSV.
    :param number_column_review: positional index of the review-text column.
    :param check_overlaps: when True, results go through ``anonymize`` and the
        sorted ``.items`` of its result are stored; otherwise the raw analyzer
        results are converted with ``create_obj``.
    :return: None. The output file holds a list of dicts with the keys
        ``SITE_URL`` (row column 0), ``PVID`` (row column 1), ``TEXT`` and
        ``ENTITIES``.
    """
    df = pd.read_csv(input_path, encoding="ISO-8859-1", header=0, names=columns)
    all_values = df.values.tolist()
    # Extract the review column
    data_list = df.iloc[:, number_column_review].tolist()

    print("getting results for the ner model...")
    start_time = time.time()
    # The previous `batch_analyze(...)` helper is not defined or imported in
    # this notebook; BatchAnalyzerEngine (already imported) provides the batch
    # entry point and returns one list of results per input text.
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
    results = batch_analyzer.analyze_iterator(
        texts=data_list, language="en", entities=entities, score_threshold=threshold
    )
    print(f'end of run: {time.time()-start_time}')
    print(u'Used Memory：%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024))

    print("Anonymizing results (resolve overlaps)...")
    final_result = []
    for index, result in tqdm(enumerate(results), total=len(results)):
        text = data_list[index]
        if check_overlaps:
            resolved = sorted(anonymize(text, result).items, key=lambda item: item.start)
            found = [
                {"start": item.start, "end": item.end, "entity": item.entity_type, "text": item.text}
                for item in resolved
            ]
        else:
            found = create_obj(result, text)
        final_result.append({
            "SITE_URL": all_values[index][0],
            "PVID": all_values[index][1],
            "TEXT": text,
            "ENTITIES": found,
        })

    print("Saving results to json file...")
    # Context manager so the file is flushed and closed even if dumping fails.
    with open(output_path, 'w', encoding="ISO-8859-1") as fp:
        json.dump(final_result, fp)
    print("Done!")

def process_review(all_values):
    """Analyse a list of rows in one NLP batch and return their entities.

    Reads the module-level ``analyzer``, ``entities``, ``threshold``,
    ``check_overlaps`` and ``number_column_review``.

    :param all_values: list of rows; column 0 is stored as ``SITE_URL``,
        column 1 as ``PVID`` and column ``number_column_review`` is the text.
    :return: list with one dict per row, holding the keys ``SITE_URL``,
        ``PVID``, ``TEXT`` and ``ENTITIES``.
    """
    data_list = [row[number_column_review] for row in all_values]
    nlp_artifacts_batch: Iterator[
          Tuple[str, NlpArtifacts]
      ] = analyzer.nlp_engine.process_batch(
          texts=data_list, language="en"
      )
    final_result = []
    for idx, (text, nlp_artifacts) in tqdm(enumerate(nlp_artifacts_batch), total=len(data_list)):
        text = str(text)
        results = analyzer.analyze(
            text=text, nlp_artifacts=nlp_artifacts, language="en", entities=entities, score_threshold=threshold
        )
        # Use the analyzer output (`results`). The previous code passed a
        # freshly created empty list, so ENTITIES was always empty.
        if check_overlaps:
            resolved = sorted(anonymize(text, results).items, key=lambda item: item.start)
            # A comprehension keeps the row index `idx` from being overwritten
            # by an inner loop variable.
            found = [
                {"start": item.start, "end": item.end, "entity": item.entity_type, "text": item.text}
                for item in resolved
            ]
        else:
            found = create_obj(results, text)
        final_result.append({
            "SITE_URL": all_values[idx][0],
            "PVID": all_values[idx][1],
            "TEXT": data_list[idx],
            "ENTITIES": found,
        })
    return final_result


def process_review2(value):
    """Analyse a single row and return its entities (worker for the pool).

    Reads the module-level ``analyzer``, ``entities``, ``threshold``,
    ``check_overlaps`` and ``number_column_review``.

    :param value: one row; column 0 is stored as ``SITE_URL``, column 1 as
        ``PVID`` and column ``number_column_review`` is the review text.
    :return: a one-element list containing a dict with the keys ``SITE_URL``,
        ``PVID``, ``TEXT`` and ``ENTITIES``.
    """
    review = value[number_column_review]
    text = str(review)
    results = analyzer.analyze(
        text=text, language="en", entities=entities, score_threshold=threshold
    )
    # Use the analyzer output (`results`). The previous code passed a freshly
    # created empty list, so ENTITIES was always empty.
    if check_overlaps:
        resolved = sorted(anonymize(text, results).items, key=lambda item: item.start)
        found = [
            {"start": item.start, "end": item.end, "entity": item.entity_type, "text": item.text}
            for item in resolved
        ]
    else:
        found = create_obj(results, text)
    return [{"SITE_URL": value[0], "PVID": value[1], "TEXT": review, "ENTITIES": found}]

def model_results_parallel(input_path, output_path):
    """Analyse every row of a CSV file with a pool of workers and dump the results to JSON.

    Each row is handed to ``process_review2``; rows are read with the
    module-level ``columns`` as column names.

    :param input_path: CSV file to read (decoded as ISO-8859-1; the first row
        is treated as a header).
    :param output_path: JSON file to write. It holds one entry per input row,
        in input order, each entry being the list returned by
        ``process_review2``.
    :return: None
    """
    # Read input data
    df = pd.read_csv(input_path, encoding="ISO-8859-1", header=0, names=columns)
    all_values = df.values.tolist()

    # Leave one core free, but never ask for zero workers: Pool(0) raises
    # ValueError on a single-core machine.
    num_workers = max(1, mp.cpu_count() - 1)

    print(f"Getting results for the NER model and Presidio Analyzer using {num_workers} workers...")
    # NOTE(review): workers depend on the module-level `analyzer`; the recorded
    # run of this cell stalled in forked workers. Confirm the model can be
    # used safely from worker processes on the target platform.
    with mp.Pool(num_workers) as pool:
        # imap yields results in input order as they complete, so the progress
        # bar advances per row instead of once after everything has finished.
        results = list(tqdm(pool.imap(process_review2, all_values), total=len(all_values)))

    print("Saving results to json file...")
    with open(output_path, 'w', encoding="ISO-8859-1") as fp:
        json.dump(results, fp)
    print("Done!")

## TEST SIMPLE DATA

In [None]:
def annotate(text: str, analyze_results: List[RecognizerResult]):
    """
    Build the entity annotations rendered on top of the text.
    :param text: full text
    :param analyze_results: list of analyzer results.
    :return: a one-element list ``[{"text": text, "ents": [...]}]`` where each
        entry of ``ents`` has the keys ``start``, ``end``, ``label``, ``text``.
    """
    # The anonymizer is used to resolve overlapping results.
    resolved = anonymize(text, analyze_results)
    ordered_items = sorted(resolved.items, key=lambda item: item.start)
    ents = [
        {"start": item.start, "end": item.end, "label": item.entity_type, "text": item.text}
        for item in ordered_items
    ]
    return [{"text": text, "ents": ents}]

In [None]:
def show_results(an_r, text, return_analyzer_results=False):
    """Summarise analyzer results as a table plus annotation data.

    :param an_r: list of analyzer results exposing ``to_dict()`` with the keys
        ``entity_type``, ``start``, ``end`` and ``score``.
    :param text: the analysed text.
    :param return_analyzer_results: accepted for backward compatibility. It
        does not change the returned values (the explanation frame it used to
        build was never returned).
    :return: tuple ``(frame, annotations)``. ``frame`` is a DataFrame with the
        columns "Entity type", "Text", "Start", "End" and "Confidence";
        ``annotations`` is the structure produced by ``annotate(text, an_r)``.
    """
    column_names = ["Entity type", "Text", "Start", "End", "Confidence"]
    rows = []
    for res in an_r:
        info = res.to_dict()
        rows.append({
            "Entity type": info["entity_type"],
            "Text": text[info["start"]: info["end"]],
            "Start": info["start"],
            "End": info["end"],
            "Confidence": info["score"],
        })
    # Passing the column names explicitly keeps the frame well-formed when no
    # entity was found. Selecting columns from an empty frame used to raise
    # KeyError.
    df_subset = pd.DataFrame(rows, columns=column_names)
    result = annotate(text, an_r)
    return df_subset.reset_index(drop=True), result
  

In [None]:
# Sample sentence containing one example of each entity type listed in `entities`.
text="My name is Gonzalo Zelinka, I'm from Argentina and I live in Barcelona. My phone number is +34 666 666 666 and my email is gonzalozelinka@gmail.com. I work at Microsoft and my credit card number is 1234 5678 9012 3456. My SSN is 123-45-6789."

In [None]:
# analyze() takes the engine as its first positional argument and forwards the
# remaining keywords to analyzer.analyze(), which is called with `text=`
# elsewhere in this notebook.
analyze_results = analyze(
    analyzer,
    text=text,
    entities=entities,
    language="en",
    score_threshold=threshold,
)

In [None]:
# Build the summary table and the annotation data for the sample text.
frame, sentence = show_results(analyze_results, text)
# print(sentence)
# manual=True: `sentence` is pre-built annotation data rather than a spaCy Doc.
displacy.render(sentence, style="ent", manual=True)
display(frame)

## GENERATE RESULTS FROM COMPLEX DATA

### RUN MODEL

In [43]:
import os
# The next cell runs the analyzer inside worker processes. Hugging Face
# tokenizers warn that keeping their internal parallelism enabled in forked
# processes can deadlock, so it is switched off here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [46]:
# model_results_parallel(input_path, output_path); entities, threshold, columns,
# etc. are taken from the module-level configuration.
model_results_parallel("testing-data/sentences_from_db copy.csv", "testing-data/testing.json")

Getting results for the NER model and Presidio Analyzer using 9 workers...






[A[A[A[AProcess ForkPoolWorker-129:
Process ForkPoolWorker-128:
Process ForkPoolWorker-126:
Process ForkPoolWorker-127:
Process ForkPoolWorker-124:
Process ForkPoolWorker-130:
Process ForkPoolWorker-125:


KeyboardInterrupt: 

## EVALUATE MODEL
The dataset should follow this format:
- [Example Dataset](https://www.kaggle.com/datasets/namanj27/ner-dataset)

Required columns and their names:
- Sentence # --> Review #
- Word --> Word
- POS --> Delete this.
- Tag --> Tag.

### EXTRACT THE GROUND TRUTH

In [None]:
def get_span_indx(
    labels: List[str],
    words: List[str],
    sentence: str
) -> List[dict]:
    """Compute character spans of the labelled entities inside ``sentence``.

    Consecutive words whose label is not ``'O'`` form one entity, which takes
    the upper-cased label of its first word.

    Each word is located in ``sentence`` by scanning left to right, so every
    labelled occurrence yields exactly one span at its own position. The
    earlier whole-sentence search duplicated spans for repeated text, tagged
    unlabelled occurrences, and missed entities whose punctuation spacing
    differs between the joined words and ``sentence``.

    :param labels: one label per word; ``'O'`` marks words outside any entity.
    :param words: the words of the sentence, in order.
    :param sentence: text built from ``words`` in which the words appear in
        the same order.
    :return: list of dicts with the keys ``start``, ``end``, ``entity`` and
        ``text``, ordered by position. Words that cannot be found in
        ``sentence`` are ignored.
    """
    # Character offsets of every word, found sequentially.
    offsets = []
    cursor = 0
    for word in words:
        token = str(word).strip()
        pos = sentence.find(token, cursor) if token else -1
        if pos < 0:
            offsets.append(None)
            continue
        cursor = pos + len(token)
        offsets.append((pos, cursor))

    spans = []
    current = []          # offsets of the located words of the entity being built
    current_label = None  # label of the first word of that entity
    # A trailing 'O' sentinel flushes an entity that ends the sentence.
    tagged = list(zip(labels, offsets)) + [('O', None)]
    for label, offset in tagged:
        if label != 'O':
            if current_label is None:
                current_label = label.upper()
            if offset is not None:
                current.append(offset)
            continue
        if current:
            start, end = current[0][0], current[-1][1]
            spans.append({"start": start, "end": end, "entity": current_label, "text": sentence[start:end]})
        current, current_label = [], None

    return spans

In [None]:
def transform_csv_annotated_to_json(input_path):
    """Turn a word-per-row annotated CSV into a list of sentence/entity records.

    The CSV needs the columns ``Review #``, ``Word`` and ``Tag``. Missing
    cells are forward-filled, so the review id only has to appear on the first
    row of each review.

    :param input_path: path of the annotated CSV (decoded as ISO-8859-1).
    :return: list of ``{"TEXT": sentence, "ENTITIES": spans}`` dicts, one per
        ``Review #`` group, in the order ``groupby`` yields them.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = pd.read_csv(input_path, encoding='ISO-8859-1').ffill()
    DATA = []
    # NOTE(review): groupby yields groups in sorted key order by default, which
    # may differ from file order when the ids are strings - confirm downstream
    # code relies on the same ordering.
    for _, sent_info in data.groupby('Review #'):
        words = list(sent_info["Word"])
        # convert words to sentence and get rid of spaces between punctuation characters
        sentence = re.sub(r'\s([?.!"](?:\s|$))', r'\1', " ".join(words))
        labels = list(sent_info['Tag'])
        # identify entity span starts, ends and categories
        span_ents = get_span_indx(labels, words, sentence)
        DATA.append({"TEXT": sentence, "ENTITIES": span_ents})
    return DATA


In [None]:
# Build the ground-truth records from the annotated CSV and keep a JSON copy on disk.
ground_truth = transform_csv_annotated_to_json("testing-data/product_reviews9.csv")
with open('testing-data/true_data.json', 'w') as fp:
    json.dump(ground_truth, fp)
# Spot check of one record; assumes the dataset has at least 2000 reviews.
print(ground_truth[1999])

### EXTRACT ONLY SENTENCES TO SEND TO THE MODEL

In [None]:
def extract_sentences_to_eval(input_file, output_file):
    """Write one reconstructed sentence per review to a single-column CSV.

    :param input_file: word-per-row annotated CSV with the columns
        ``Review #`` and ``Word`` (decoded as ISO-8859-1). Missing cells are
        forward-filled.
    :param output_file: CSV to write; the header row is ``SENTENCES`` and each
        following row holds one sentence.
    :return: None
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = pd.read_csv(input_file, encoding='ISO-8859-1').ffill()
    # newline='' is what the csv module requires for files it writes;
    # without it, extra blank rows appear on platforms that translate newlines.
    with open(output_file, 'w', encoding='ISO-8859-1', newline='') as fo:
        writer = csv.writer(fo)
        writer.writerow(['SENTENCES'])
        for _, sent_info in data.groupby('Review #'):
            words = list(sent_info["Word"])
            # Join the words and drop the space that precedes ? . ! or ".
            sentence = re.sub(r'\s([?.!"](?:\s|$))', r'\1', " ".join(words))
            writer.writerow([sentence])

In [None]:
# Write the reconstructed evaluation sentences to a CSV file.
extract_sentences_to_eval("testing-data/product_reviews9.csv", "testing-data/sentences_evaluate.csv")

In [None]:
def extract_sentences_from_db(input_file, output_file):
    """Flatten a CSV with one JSON list of reviews per row into one review per row.

    :param input_file: CSV with the columns ``SITE_URL``, ``PVID`` and
        ``REVIEWS`` (decoded as ISO-8859-1); ``REVIEWS`` holds a JSON array.
        Missing cells are forward-filled before processing.
    :param output_file: CSV to write with the columns ``SITE_URL``, ``PVID``
        and ``REVIEW`` (no index column).
    :return: None
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = pd.read_csv(input_file, encoding='ISO-8859-1').ffill()
    reviews = [
        [row["SITE_URL"], row["PVID"], review]
        for _, row in data.iterrows()
        for review in json.loads(row["REVIEWS"])
    ]
    new_data = pd.DataFrame(reviews, columns=["SITE_URL", "PVID", "REVIEW"])
    # NOTE(review): written with pandas' default encoding while other cells
    # read CSVs as ISO-8859-1 - confirm this is intended for non-ASCII reviews.
    new_data.to_csv(output_file, index=False)

In [None]:
# Flatten the exported review data into one review per CSV row.
extract_sentences_from_db("testing-data/Flamingo_Reviews_Extracted_Data_2023_04_12 (1).csv", "testing-data/sentences_from_db.csv")

### RUN THE MODEL WITH THE EXTRACTED SENTENCES

In [None]:
# number_column_review is a required parameter of model_results(); it was
# missing from this call, which therefore raised TypeError.
model_results(input_path="testing-data/input-path.csv", output_path="testing-data/output-path.json",
entities=entities, threshold=threshold, analyzer=analyzer, columns=["SITE_URL", "PVID", "REVIEW"],
number_column_review=number_column_review, check_overlaps=True)

#### REMOVE UNWANTED ENTITIES FROM THE MODEL RESULTS AND JOIN ENTITIES SUBDIVIDED INTO ONE
- Sometimes the model splits a single entity into several pieces, so we need to join them back together before evaluating the results. For example, if the model reports the entity "686 E Broadway" as "6", "86 E" and "Broadway", these pieces are merged into one entity.

In [None]:
def _merge_adjacent_entities(entities):
    """Merge same-type entities separated by at most one character (input sorted by start)."""
    merged = []
    for ent in entities:
        if merged:
            prev = merged[-1]
            gap = int(ent["start"]) - int(prev["end"])
            if ent["entity"] == prev["entity"] and gap <= 1:
                # A one-character gap is rebuilt as a space. Contiguous pieces
                # are concatenated directly ("6" + "86 E" -> "686 E"). For an
                # overlap, only the part of the text past the merged end is
                # appended.
                separator = " " if gap == 1 else ""
                prev["text"] = prev["text"] + separator + ent["text"][max(0, -gap):]
                prev["end"] = max(int(prev["end"]), int(ent["end"]))
                continue
        # Copy so the caller's dicts are never mutated through an alias.
        merged.append(dict(ent))
    return merged


def fix_entities_to_eval(input_path, output_path):
    """Clean the predicted entities of a results JSON file before evaluation.

    For every record, entities whose type is ``"O"`` are dropped and runs of
    same-type entities lying at most one character apart are merged into a
    single entity. The merged entity keeps the other fields (e.g. the
    confidence) of the first piece.

    :param input_path: JSON file holding a list of records, each with an
        ``ENTITIES`` list of dicts (keys ``start``, ``end``, ``entity``,
        ``text``), read as ISO-8859-1.
    :param output_path: JSON file to write with the cleaned records.
    :return: None
    """
    with open(input_path, "r", encoding="ISO-8859-1") as f:
        data_to_check = json.load(f)

    for js in data_to_check:
        # Merging is defined on position order, so sort explicitly instead of
        # relying on the order in which the entities were stored.
        kept = sorted(
            (ent for ent in js["ENTITIES"] if ent["entity"] != "O"),
            key=lambda ent: int(ent["start"]),
        )
        js["ENTITIES"] = _merge_adjacent_entities(kept)

    with open(output_path, 'w', encoding="ISO-8859-1") as fp:
        json.dump(data_to_check, fp)

In [None]:
# Clean the model output so that it can be compared with the ground truth.
fix_entities_to_eval("testing-data/output_lg.json", "testing-data/output_lg_fixed.json")

### EVALUATING

In [None]:
# Load the cleaned predictions written by fix_entities_to_eval().
with open("testing-data/output_lg_fixed.json", "r", encoding="ISO-8859-1") as f:
  json_data = f.read()
prediction_data = json.loads(json_data)

In [None]:
# Peek at the second prediction record as a sanity check.
prediction_data[1]

In [None]:
def calculate_general_scores(entity_scores):
    """Average the per-entity precision, recall and F1 (unweighted mean).

    :param entity_scores: mapping ``entity -> {"precision", "recall", "f1"}``.
    :return: tuple ``(precision, recall, f1)`` with the mean of each metric
        over all entities, or ``(0.0, 0.0, 0.0)`` when the mapping is empty.
    """
    num_entities = len(entity_scores)
    # Nothing to average: avoid dividing by zero.
    if num_entities == 0:
        return 0.0, 0.0, 0.0

    general_precision = sum(scores['precision'] for scores in entity_scores.values()) / num_entities
    general_recall = sum(scores['recall'] for scores in entity_scores.values()) / num_entities
    general_f1 = sum(scores['f1'] for scores in entity_scores.values()) / num_entities

    return general_precision, general_recall, general_f1

#### USE CASE: EXACT
- Identifies the exact words associated with all PII entities in the input text.
- This use case is applicable if the client wants to know which exact words correspond to the PII information. For example to apply masks over the PII entities detected in the input text.

In [None]:
def calculate_precision_recall_for_entities(ground_truth, output_model, unique_entities, output_file):
    """Score span-level predictions per entity type and write a text report.

    Records are compared position by position (``ground_truth[i]`` against
    ``output_model[i]``). Texts are compared with spaces removed.

    * A ground-truth entity is a true positive when some predicted entity has
      the same text and the same ``start``; otherwise it is a false negative.
    * A predicted entity is a false positive when its text matches no
      ground-truth text of the same record.

    NOTE(review): the true-positive test ignores the predicted entity type and
    the false-positive test ignores the position - confirm this leniency is
    intended for the "exact" use case.

    :param ground_truth: list of records with an ``ENTITIES`` list.
    :param output_model: list of prediction records, aligned with
        ``ground_truth``.
    :param unique_entities: entity types to score; other types are skipped.
    :param output_file: path of the text report to write (ISO-8859-1). Its
        parent directory is created when missing.
    :return: dict ``entity -> {"precision", "recall", "f1"}`` with values
        rounded to two decimals.
    """
    def _squash(value):
        return value.replace(" ", "")

    # Initialize counters for true positives, false positives, and false negatives
    tp = {entity: 0 for entity in unique_entities}
    fp = {entity: 0 for entity in unique_entities}
    fn = {entity: 0 for entity in unique_entities}

    for example_idx in range(len(ground_truth)):
        ground_truth_entities = ground_truth[example_idx]['ENTITIES']
        output_entities = output_model[example_idx]['ENTITIES']
        # Ground-truth texts, for the false-positive lookup.
        ground_truth_texts = {_squash(ent['text']) for ent in ground_truth_entities}
        # Predicted (text, start) pairs, for the true-positive lookup.
        predicted_keys = {(_squash(ent['text']), ent['start']) for ent in output_entities}

        for ground_truth_ent in ground_truth_entities:
            ent_type = ground_truth_ent['entity']
            if ent_type not in unique_entities:
                continue
            if (_squash(ground_truth_ent['text']), ground_truth_ent['start']) in predicted_keys:
                tp[ent_type] += 1
            else:
                fn[ent_type] += 1
        for output_ent in output_entities:
            ent_type = output_ent['entity']
            if ent_type not in unique_entities:
                continue
            if _squash(output_ent['text']) not in ground_truth_texts:
                fp[ent_type] += 1

    # Calculate precision, recall, and F1 score for each entity
    scores = {}
    table = []
    headers = ['Entity', 'Precision', 'Recall', 'F1']
    for entity in unique_entities:
        predicted = tp[entity] + fp[entity]
        actual = tp[entity] + fn[entity]
        precision = tp[entity] / predicted if predicted > 0 else 0
        recall = tp[entity] / actual if actual > 0 else 0
        # F1 is derived from the unrounded values; rounding first skews it.
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        p, r, f1 = round(precision, 2), round(recall, 2), round(f1, 2)
        scores[entity] = {'precision': p, 'recall': r, 'f1': f1}
        table.append([entity, p, r, f1])

    general_precision, general_recall, general_f1 = calculate_general_scores(scores)
    report = f"\nGeneral Precision: {general_precision}\nGeneral Recall: {general_recall}\nGeneral F1: {general_f1}\n{tabulate(table, headers)}"
    report_dir = os.path.dirname(output_file)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(output_file, 'w', encoding='ISO-8859-1') as f:
        f.write(report)
    return scores

In [None]:
# Span-level ("exact") evaluation of the cleaned predictions against the ground truth.
calculate_precision_recall_for_entities(ground_truth, prediction_data,set(["PERSON", "EMAIL_ADDRESS", "LOCATION", "US_SSN", "CREDIT_CARD", "PHONE_NUMBER"]) , "eval/REPORT_LG2.txt")

#### USE CASE: BINARY
- Given an input text, it indicates whether each entity (person, location, credit_card, phone_number, us_ssn) is present at least once or not.
- The use case is applicable for filtering out reviews with sensitive information without needing to know which part of the text has the sensitive information.


In [None]:
def evaluate_entities(ground_truth, output_model, unique_entities, output_file):
    """Score document-level presence of each entity type and write a text report.

    For every prediction record ``output_model[i]`` the set of entity types it
    contains is compared with the set found in ``ground_truth[i]``. A type in
    both sets is a true positive, only in the ground truth a false negative,
    and only in the prediction a false positive.

    :param ground_truth: list of records with an ``ENTITIES`` list.
    :param output_model: list of prediction records, aligned with
        ``ground_truth``.
    :param unique_entities: entity types to score; other types are ignored.
    :param output_file: path of the text report to write (ISO-8859-1). Its
        parent directory is created when missing.
    :return: dict ``entity -> {"precision", "recall", "f1"}`` with values
        rounded to two decimals.
    """
    tracked = set(unique_entities)
    # Initialize counters for true positives, false positives, and false negatives
    tp = {entity: 0 for entity in unique_entities}
    fp = {entity: 0 for entity in unique_entities}
    fn = {entity: 0 for entity in unique_entities}

    for i, doc in enumerate(output_model):
        # Restrict both sides to the tracked types. An untracked type used to
        # raise KeyError when its counter was incremented.
        model_entity_set = {entity['entity'] for entity in doc['ENTITIES']} & tracked
        ground_truth_set = {entity['entity'] for entity in ground_truth[i]['ENTITIES']} & tracked

        for entity in ground_truth_set:
            if entity in model_entity_set:
                # Present in both ground truth and output model
                tp[entity] += 1
            else:
                # Present in ground truth but missed by the model
                fn[entity] += 1

        # Reported by the model but absent from the ground truth
        for entity in model_entity_set - ground_truth_set:
            fp[entity] += 1

    # Calculate precision, recall, and F1 score for each entity
    scores = {}
    table = []
    headers = ['Entity', 'Precision', 'Recall', 'F1']
    for entity in unique_entities:
        predicted = tp[entity] + fp[entity]
        actual = tp[entity] + fn[entity]
        precision = tp[entity] / predicted if predicted > 0 else 0
        recall = tp[entity] / actual if actual > 0 else 0
        # F1 is derived from the unrounded values; rounding first skews it.
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        p, r, f1 = round(precision, 2), round(recall, 2), round(f1, 2)
        scores[entity] = {'precision': p, 'recall': r, 'f1': f1}
        table.append([entity, p, r, f1])

    general_precision, general_recall, general_f1 = calculate_general_scores(scores)
    report = f"\nGeneral Precision: {general_precision}\nGeneral Recall: {general_recall}\nGeneral F1: {general_f1}\n{tabulate(table, headers)}"
    report_dir = os.path.dirname(output_file)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(output_file, 'w', encoding='ISO-8859-1') as f:
        f.write(report)
    return scores

In [None]:
# a1 = [{'TEXT': 'a1', "ENTITIES": [{'start': 21, 'end': 33, 'entity': 'PERSON', 'text': 'Zoey Edwards'}, {'start': 129, 'end': 151, 'entity': 'EMAIL_ADDRESS', 'text': 'edwards-zoey@gmail.com'}, {'start': 190, 'end': 201, 'entity': 'LOCATION', 'text': '900 F St NW'}, {'start': 213, 'end': 222, 'entity': 'US_SSN', 'text': '367245504'}, {'start': 252, 'end': 271, 'entity': 'CREDIT_CARD', 'text': '2259-8740-7030-1462'}]}]
# a2 = [{"TEXT": "a2", "ENTITIES": [{'start': 21, 'end': 33, 'entity': 'PERSON', 'text': 'Zoey Edwards'}, {'start': 129, 'end': 151, 'entity': 'EMAIL_ADDRESS', 'text': 'edwards-zoey@gmail.com'}, {'start': 190, 'end': 195, 'entity': 'LOCATION', 'text': '900 F'}, {'start': 196, 'end': 201, 'entity': 'LOCATION', 'text': 'St NW'}, {'start': 213, 'end': 222, 'entity': 'US_SSN', 'text': '367245504'}, {'start': 252, 'end': 255, 'entity': 'PHONE_NUMBER', 'text': '225'}, {'start': 255, 'end': 271, 'entity': 'PHONE_NUMBER', 'text': '9-8740-7030-1462'}]}]
# Document-level ("binary") evaluation: is each entity type present in a review or not.
evaluate_entities(ground_truth, prediction_data, set(["PERSON", "EMAIL_ADDRESS", "LOCATION", "US_SSN", "CREDIT_CARD", "PHONE_NUMBER"]), "eval/REPORT_SENTENCE_LG.txt")