## INSTALL DEPENDENCIES

### FOR MAC

In [None]:
%pip install torch

### FOR CUDA (GPU)

In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

### GENERAL

In [None]:
%pip install presidio_analyzer
%pip install presidio_anonymizer
%pip install transformers
%pip install pandas
%pip install spacy
%pip install spacy-transformers
%pip install tabulate
%pip install multiprocess

### INSTALL SIMPLE SPACY MODEL

In [None]:
!python -m spacy download en_core_web_sm

### INSTALL COMPLEX SPACY MODEL (ONLY IF YOU USE THIS INSTEAD OF BERT)

In [None]:
!python -m spacy download en_core_web_lg

## IMPORTS

In [None]:
import pandas as pd
from transformers_rec import (
    TransformersRecognizer,
    BERT_DEID_CONFIGURATION,
    Analyzer,
    Anonymizer,
)
from typing import List, Iterator, Tuple
import spacy
from spacy import displacy
import csv
import json
from tqdm import tqdm
import re
import warnings
from tabulate import tabulate
import os 
import time
from datetime import datetime
import psutil
from Config import Config
from utils import (
    transform_csv_annotated_to_json,
    model_results,
    fix_entities_to_eval,
)
from presidio_analyzer import RecognizerResult

## TESTING CUDA FOR GPU 

In [None]:
import torch
torch.cuda.is_available()

## INITIAL CONFIG FOR THE ANALYZER AND MODEL

In [None]:
# Build the analyzer once and reuse it in the cells below.
# The argument is presumably the model identifier; the trailing comment lists the two options the author used.
analyzer = Analyzer("obi/deid_roberta_i2b2") # "en_core_web_lg" or "obi/deid_roberta_i2b2"

In [None]:
# Copy the run settings from the project-level Config object into notebook globals.
# NOTE(review): the exact meaning of each setting is defined in Config.py (not visible here) — confirm there.
threshold = Config.threshold  # used as score_threshold in the analyzer.analyze call below
entities = Config.entities  # used as the entities argument in the analyzer.analyze call below
columns = Config.columns
number_column_review = Config.number_column_review
check_overlaps= Config.check_overlaps

## TEST SIMPLE DATA

In [None]:
def annotate(text: str, analyze_results: List[RecognizerResult]):
    """
    Build the displaCy "manual" payload that highlights each detected entity.

    :param text: full text that was analyzed
    :param analyze_results: list of analyzer results for ``text``
    :return: a one-element list holding a dict with the original ``text`` and
        an ``ents`` list of ``{"start", "end", "label", "text"}`` dicts,
        ordered by start offset
    """
    # The anonymizer is used here to resolve overlapping results.
    anonymized = Anonymizer().anonymize(text, analyze_results)
    ordered_items = sorted(anonymized.items, key=lambda item: item.start)
    ents = [
        {
            "start": item.start,
            "end": item.end,
            "label": item.entity_type,
            "text": item.text,
        }
        for item in ordered_items
    ]
    return [{"text": text, "ents": ents}]

In [None]:
def show_results(an_r, text, return_analyzer_results=False):
    """
    Summarise the results of analyze() as a dataframe plus a displaCy payload.

    :param an_r: list of analyzer results; each must expose ``to_dict()``,
        ``start`` and ``end``
    :param text: the text that was analyzed (used to slice out each entity)
    :param return_analyzer_results: accepted for backward compatibility only.
        The original implementation built an explanation dataframe when this
        was True but never returned it, so the flag has no effect on the result.
    :return: ``(frame, payload)`` where ``frame`` has the columns
        "Entity type", "Text", "Start", "End", "Confidence" and ``payload`` is
        the value returned by ``annotate(text, an_r)``
    """
    output_columns = ["Entity type", "Text", "Start", "End", "Confidence"]

    if len(an_r) == 0:
        # With no results the records frame has no columns, so selecting them
        # below would raise KeyError. Return an empty, correctly shaped table.
        df_subset = pd.DataFrame(columns=output_columns)
    else:
        df = pd.DataFrame.from_records([r.to_dict() for r in an_r])
        # Slice the matched span out of the original text for each result.
        df["text"] = [text[res.start: res.end] for res in an_r]
        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
            {
                "entity_type": "Entity type",
                "text": "Text",
                "start": "Start",
                "end": "End",
                "score": "Confidence",
            },
            axis=1,
        )

    result = annotate(text, an_r)
    return df_subset.reset_index(drop=True), result
  

In [None]:
text="I have ordered 3 beautiful garments from this shop and have nothing but great things to say about the products, the customer service and the shipping. Diarrablu is definitely my new favorite brand. Love from the Netherlands, Tasha"

In [None]:
# Run the analyzer on the sample text, using the entity list and the
# score threshold loaded from Config.
analyze_results = analyzer.analyze(
    text=text,
    entities= entities,
    language="en",
    score_threshold=threshold,
)

In [None]:
# Tabulate the detections and build the displaCy "manual" payload for the same text.
frame, sentence = show_results(analyze_results, text)
# Highlight the detected spans inline, then show the per-entity table.
displacy.render(sentence, style="ent", manual=True)
display(frame)

## RUN MODEL

In [None]:
# Parameters of utils.model_results, as noted by the original author:
#   input_path, output_path, entities, threshold, analyzer, columns,
#   number_column_review, check_overlaps=False
# NOTE(review): only the two paths are passed here. If the remaining parameters
# have no defaults this call raises TypeError — confirm against utils.model_results
# and pass the values loaded from Config above if they are required.
model_results(input_path="testing-data/input-path.csv", output_path="testing-data/output-path.json")

## EVALUATE MODEL
The dataset must follow this format:
- [Example Dataset](https://www.kaggle.com/datasets/namanj27/ner-dataset)

Required columns and how to rename them:
- Sentence # --> Review #
- Word --> Word
- POS --> Delete this.
- Tag --> Tag.

### EXTRACT THE GROUND TRUTH

In [None]:
# Convert the annotated CSV into the ground-truth structure used by the
# evaluation cells (indexed by position; each item exposes an 'ENTITIES' list).
ground_truth = transform_csv_annotated_to_json("testing-data/product_reviews9.csv")
# Spot-check one record.
# NOTE(review): index 1999 is hard-coded; assuming the result is a list, this
# raises IndexError when the dataset has fewer than 2000 reviews.
print(ground_truth[1999])

### EXTRACT ONLY SENTENCES TO SEND TO THE MODEL

In [None]:
def extract_sentences_to_eval(input_file, output_file):
    """
    Rebuild one sentence per review from a token-level CSV and write them out.

    :param input_file: path to an ISO-8859-1 CSV that has at least the columns
        "Review #" and "Word" (one row per token)
    :param output_file: path of the CSV to write; it gets a single "SENTENCES"
        header followed by one row per review
    """
    # Forward-fill missing cells so every token row carries a review id.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = pd.read_csv(input_file, encoding='ISO-8859-1').ffill()

    # newline='' lets the csv module control line endings; without it the
    # output contains blank rows on platforms that translate '\n'.
    with open(output_file, 'w', encoding='ISO-8859-1', newline='') as fo:
        writer = csv.writer(fo)
        writer.writerow(['SENTENCES'])
        # groupby iterates reviews in sorted "Review #" order, not file order.
        for _, sent_info in data.groupby('Review #'):
            words = list(sent_info["Word"])
            # Join tokens with spaces, then drop the space before ? . ! "
            # when that mark is followed by whitespace or the end of the text.
            sentence = re.sub(r'\s([?.!"](?:\s|$))', r'\1', " ".join(words))
            writer.writerow([sentence])

In [None]:
# Write the reconstructed evaluation sentences next to the annotated dataset.
extract_sentences_to_eval("testing-data/product_reviews9.csv", "testing-data/sentences_evaluate.csv")

In [8]:
def extract_sentences_from_db(input_file, output_file):
    """
    Flatten a CSV export with one JSON list of reviews per row into one row per review.

    :param input_file: path to an ISO-8859-1 CSV with the columns "SITE_URL",
        "PVID" and "REVIEWS", where "REVIEWS" holds a JSON-encoded list
    :param output_file: path of the CSV to write, with the columns
        "SITE_URL", "PVID" and "REVIEW"
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = pd.read_csv(input_file, encoding='ISO-8859-1').ffill()

    reviews = []
    for _, row in tqdm(data.iterrows(), total=len(data)):
        # Each decoded review becomes its own output row, keyed by site and PVID.
        for review in json.loads(row["REVIEWS"]):
            reviews.append([row["SITE_URL"], row["PVID"], review])

    new_data = pd.DataFrame(reviews, columns=["SITE_URL", "PVID", "REVIEW"])
    new_data.to_csv(output_file, index=False)

In [9]:
# Flatten the database export into one review per row.
extract_sentences_from_db("testing-data/reviews.csv", "testing-data/sentences_from_db2.csv")

100%|██████████| 68049/68049 [00:07<00:00, 9409.85it/s] 


### RUN THE MODEL WITH THE EXTRACTED SENTENCES

In [None]:
# NOTE(review): this section is meant to run the model on the extracted sentences,
# but the input path is "testing-data/input-path.csv" rather than the
# "testing-data/sentences_evaluate.csv" written by extract_sentences_to_eval —
# confirm the intended input file.
model_results(input_path="testing-data/input-path.csv", output_path="testing-data/output-path.json")

#### REMOVE UNWANTED ENTITIES FROM THE MODEL RESULTS AND JOIN ENTITIES SUBDIVIDED INTO ONE
- Sometimes the model splits a single entity into several smaller entities, so we need to join them before evaluating the results. For example, if the model detects the entity "686 E Broadway" as "6", "86 E" and "Broadway", those pieces must be merged back into one entity.

In [None]:
# NOTE(review): these paths use underscores ("input_path.json" / "output_path.json"),
# while the model_results call above writes "testing-data/output-path.json" (hyphen) —
# confirm which file this step should read.
# The arguments are presumably (input file, output file); verify against utils.fix_entities_to_eval.
fix_entities_to_eval("testing-data/input_path.json", "testing-data/output_path.json")

### EVALUATING

In [None]:
# Read the post-processed model output from disk and decode it from JSON.
with open("testing-data/output_path.json", "r", encoding="ISO-8859-1") as f:
  json_data = f.read()
prediction_data = json.loads(json_data)

In [None]:
# Sanity check: display the prediction record at index 1 (hard-coded).
prediction_data[1]

In [None]:
def calculate_general_scores(entity_scores):
    """
    Average the per-entity precision, recall and F1 (unweighted mean).

    :param entity_scores: dict mapping each entity name to a dict with the
        keys 'precision', 'recall' and 'f1'
    :return: tuple ``(general_precision, general_recall, general_f1)``;
        all zeros when ``entity_scores`` is empty
    """
    num_entities = len(entity_scores)
    if num_entities == 0:
        # No entities to average: avoid a ZeroDivisionError.
        return 0.0, 0.0, 0.0

    per_entity = list(entity_scores.values())
    general_precision = sum(scores['precision'] for scores in per_entity) / num_entities
    general_recall = sum(scores['recall'] for scores in per_entity) / num_entities
    general_f1 = sum(scores['f1'] for scores in per_entity) / num_entities

    return general_precision, general_recall, general_f1

#### USE CASE: EXACT
- Identifies the exact words associated with all PII entities in the input text.
- This use case is applicable if the client wants to know which exact words correspond to the PII information. For example to apply masks over the PII entities detected in the input text.

In [None]:
def calculate_precision_recall_for_entities(ground_truth, output_model, unique_entities, output_file):
    """
    Exact-match evaluation: score predicted entity spans against the ground truth.

    Two entities are considered the same span when their text (with spaces
    removed) and their start offset are both equal. The entity type is not
    part of the match: a true positive or false negative is counted under the
    ground-truth entity's type, a false positive under the predicted type.

    :param ground_truth: sequence of examples, each with an 'ENTITIES' list of
        dicts holding 'entity', 'text' and 'start'
    :param output_model: predictions in the same format, aligned by index with
        ``ground_truth``
    :param unique_entities: entity types to score; other types are ignored
    :param output_file: path of the text report to write (ISO-8859-1); the
        parent directory is created if it does not exist
    """
    def _span_key(ent):
        # Identity of a span for matching: whitespace-insensitive text + start offset.
        return ent['text'].replace(" ", ""), ent['start']

    # Keep a caller-supplied order; sort anything unordered (e.g. a set) so
    # the report rows come out in the same order on every run.
    if isinstance(unique_entities, (list, tuple)):
        ordered_entities = list(unique_entities)
    else:
        ordered_entities = sorted(unique_entities)

    # Initialize counters for true positives, false positives, and false negatives
    tp = {entity: 0 for entity in ordered_entities}
    fp = {entity: 0 for entity in ordered_entities}
    fn = {entity: 0 for entity in ordered_entities}

    for example_idx in range(len(ground_truth)):
        ground_truth_entities = ground_truth[example_idx]['ENTITIES']
        output_entities = output_model[example_idx]['ENTITIES']
        # Span keys of both sides for quick lookup.
        ground_truth_keys = {_span_key(ent) for ent in ground_truth_entities}
        output_keys = {_span_key(ent) for ent in output_entities}

        for ground_truth_ent in ground_truth_entities:
            ent_type = ground_truth_ent['entity']
            if ent_type not in tp:
                continue
            if _span_key(ground_truth_ent) in output_keys:
                tp[ent_type] += 1
            else:
                fn[ent_type] += 1

        for output_ent in output_entities:
            ent_type = output_ent['entity']
            if ent_type not in fp:
                continue
            # Use the same criterion as the true-positive check. Previously only
            # the text was compared here, so a prediction with the right text at
            # the wrong offset produced a false negative but no false positive.
            if _span_key(output_ent) not in ground_truth_keys:
                fp[ent_type] += 1

    # Calculate precision, recall, and F1 score for each entity
    scores = {}
    table = []
    headers = ['Entity', 'Precision', 'Recall', 'F1']
    for entity in ordered_entities:
        p = round(tp[entity] / (tp[entity] + fp[entity]), 2) if tp[entity] + fp[entity] > 0 else 0
        r = round(tp[entity] / (tp[entity] + fn[entity]), 2) if tp[entity] + fn[entity] > 0 else 0
        f1 = round(2 * p * r / (p + r), 2) if p + r > 0 else 0
        scores[entity] = {'precision': p, 'recall': r, 'f1': f1}
        table.append([entity, p, r, f1])

    general_precision, general_recall, general_f1 = calculate_general_scores(scores)
    report = f"\nGeneral Precision: {general_precision}\nGeneral Recall: {general_recall}\nGeneral F1: {general_f1}\n{tabulate(table, headers)}"

    # Make sure the report directory exists before writing.
    report_dir = os.path.dirname(output_file)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(output_file, 'w', encoding='ISO-8859-1') as f:
        f.write(report)

In [None]:
# Exact-match evaluation over the six PII entity types; the report is written to disk.
calculate_precision_recall_for_entities(
    ground_truth,
    prediction_data,
    {"PERSON", "EMAIL_ADDRESS", "LOCATION", "US_SSN", "CREDIT_CARD", "PHONE_NUMBER"},
    "eval/REPORT_LG2.txt",
)

#### USE CASE: BINARY
- Given an input text, it indicates whether each entity (person, email_address, location, credit_card, phone_number, us_ssn) is present at least once or not.
- The use case is applicable for filtering out reviews with sensitive information without needing to know which part of the text has the sensitive information.


In [None]:
def evaluate_entities(ground_truth, output_model, unique_entities, output_file):
    """
    Binary (presence) evaluation: per document, check which entity types occur.

    For every document, each scored entity type counts once as a true positive
    (present in both ground truth and prediction), a false negative (only in
    the ground truth) or a false positive (only in the prediction).

    :param ground_truth: sequence of examples, each with an 'ENTITIES' list of
        dicts holding an 'entity' type
    :param output_model: predictions in the same format, aligned by index with
        ``ground_truth``
    :param unique_entities: entity types to score; other types are ignored
    :param output_file: path of the text report to write (ISO-8859-1); the
        parent directory is created if it does not exist
    """
    # Keep a caller-supplied order; sort anything unordered (e.g. a set) so
    # the report rows come out in the same order on every run.
    if isinstance(unique_entities, (list, tuple)):
        ordered_entities = list(unique_entities)
    else:
        ordered_entities = sorted(unique_entities)
    scored_types = set(ordered_entities)

    # Initialize counters for true positives, false positives, and false negatives
    tp = {entity: 0 for entity in ordered_entities}
    fp = {entity: 0 for entity in ordered_entities}
    fn = {entity: 0 for entity in ordered_entities}

    # Loop over the documents in the output model
    for i, doc in enumerate(output_model):
        # Entity types present on each side, restricted to the scored types.
        # Without this filter, any other type raised KeyError on the counters.
        model_entity_set = {entity['entity'] for entity in doc['ENTITIES']} & scored_types
        ground_truth_set = {entity['entity'] for entity in ground_truth[i]['ENTITIES']} & scored_types

        for entity in ground_truth_set:
            if entity in model_entity_set:
                # Entity is present in both ground truth and output model
                tp[entity] += 1
            else:
                # Entity is present in ground truth but not in output model
                fn[entity] += 1

        # Types predicted for this document that the ground truth does not contain.
        for entity in model_entity_set - ground_truth_set:
            fp[entity] += 1

    # Calculate precision, recall, and F1 score for each entity
    scores = {}
    table = []
    headers = ['Entity', 'Precision', 'Recall', 'F1']
    for entity in ordered_entities:
        p = round(tp[entity] / (tp[entity] + fp[entity]), 2) if tp[entity] + fp[entity] > 0 else 0
        r = round(tp[entity] / (tp[entity] + fn[entity]), 2) if tp[entity] + fn[entity] > 0 else 0
        f1 = round(2 * p * r / (p + r), 2) if p + r > 0 else 0
        scores[entity] = {'precision': p, 'recall': r, 'f1': f1}
        table.append([entity, p, r, f1])

    general_precision, general_recall, general_f1 = calculate_general_scores(scores)
    report = f"\nGeneral Precision: {general_precision}\nGeneral Recall: {general_recall}\nGeneral F1: {general_f1}\n{tabulate(table, headers)}"

    # Make sure the report directory exists before writing.
    report_dir = os.path.dirname(output_file)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(output_file, 'w', encoding='ISO-8859-1') as f:
        f.write(report)

In [None]:
# a1 = [{'TEXT': 'a1', "ENTITIES": [{'start': 21, 'end': 33, 'entity': 'PERSON', 'text': 'Zoey Edwards'}, {'start': 129, 'end': 151, 'entity': 'EMAIL_ADDRESS', 'text': 'edwards-zoey@gmail.com'}, {'start': 190, 'end': 201, 'entity': 'LOCATION', 'text': '900 F St NW'}, {'start': 213, 'end': 222, 'entity': 'US_SSN', 'text': '367245504'}, {'start': 252, 'end': 271, 'entity': 'CREDIT_CARD', 'text': '2259-8740-7030-1462'}]}]
# a2 = [{"TEXT": "a2", "ENTITIES": [{'start': 21, 'end': 33, 'entity': 'PERSON', 'text': 'Zoey Edwards'}, {'start': 129, 'end': 151, 'entity': 'EMAIL_ADDRESS', 'text': 'edwards-zoey@gmail.com'}, {'start': 190, 'end': 195, 'entity': 'LOCATION', 'text': '900 F'}, {'start': 196, 'end': 201, 'entity': 'LOCATION', 'text': 'St NW'}, {'start': 213, 'end': 222, 'entity': 'US_SSN', 'text': '367245504'}, {'start': 252, 'end': 255, 'entity': 'PHONE_NUMBER', 'text': '225'}, {'start': 255, 'end': 271, 'entity': 'PHONE_NUMBER', 'text': '9-8740-7030-1462'}]}]
# Presence-based evaluation over the six PII entity types; the report is written to disk.
evaluate_entities(
    ground_truth,
    prediction_data,
    {"PERSON", "EMAIL_ADDRESS", "LOCATION", "US_SSN", "CREDIT_CARD", "PHONE_NUMBER"},
    "eval/REPORT_SENTENCE_LG.txt",
)