### regex functions

In [1]:
import logging
import re
import pandas as pd
import spacy
nlp = spacy.load("/opt/conda/lib/python3.10/site-packages/en_core_web_lg/en_core_web_lg-3.8.0")

In [2]:
def newline(text:str):
    """replace 'ZZZZZZ' with '\n'"""
    try:
        newlineRegex = re.compile(r'ZZZZZZ', re.VERBOSE)
        outputText = newlineRegex.sub('\\n', text)
    except:
        logging.debug(f'newline function error')
    return outputText

In [3]:
def appointment(text:str):
    """replace 'Appointment Reference: FF or SS followed by 10 digits' with '[APPOINTMENT]'"""
    try:
        appointmentRegex = re.compile(r'Appointment\sReference:\s(SS|FF)(\d){10}', re.IGNORECASE|re.VERBOSE)
        outputText = appointmentRegex.sub('[APPOINTMENT]', text)
    except:
        logging.debug(f'appointment function error')
    return outputText

In [4]:
def contact(text:str):
    """replace 'Contact Name: name' with '[CONTACT]'"""
    try:
        contactRegex = re.compile(r'''Contact\sName:\s\w*''', re.IGNORECASE|re.VERBOSE)
        outputText = contactRegex.sub('[CONTACT]', text)  
    except:
        logging.debug(f'appointment function error')
    return outputText

In [5]:
def phone(text:str):
    """replace 'phone numbers (with or without UK country code)' with '<PHONE-NUMBER>'"""
    try:
        phoneRegex = re.compile(r'''(
            (44|\+44)?                    # optional country code
            (\s|-|\.)?                    # optional separator
            (\d{3,5}|\(\d{3,5}\))         # first 3 to 5 digits (or 3 to 5 digits in brackets)
            (\s|-|\.)?                    # optional separator
            \d{3,4}                       # 3 to 4 digits
            (\s|-|\.)?                    # optional separator
            \d{3,5}                       # last 3 to 5 digits
            (\s*(ext|x|ext.)\s*\d{2,5})?  # optional extension
            )''',  re.IGNORECASE|re.VERBOSE)
        outputText = phoneRegex.sub('[PHONE]',text)
    except:
        logging.debug(f'phone function error')
    return outputText

In [6]:
def email(text:str):
    """replace 'email address' with '[EMAIL]'"""
    try:
        emailRegex = re.compile(r'''(
            [a-zA-Z0-9._%+-]+      # username
            @                      # @ symbol
            [a-zA-Z0-9.-]+         # domain name
            (\.[a-zA-Z]{2,4})      # dot-something
            )''', re.VERBOSE)
        outputText = emailRegex.sub('[EMAIL]',text)
    except:
        logging.debug(f'email function error')
    return outputText

In [7]:
def http(text:str):
    """replace 'http or http address' with '<HTTP-ADDRESS>'"""
    try:
        httpRegex = re.compile(r'''http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+''', re.IGNORECASE|re.VERBOSE)
        outputText = httpRegex.sub('[HTTP]', text)
    except:
        logging.debug(f'http function error')
    return outputText

In [8]:
def cp_name(text:str):
    """replace 'Communications Provider Name:' with '<CP-NAME>'"""
    try:
        cpRegex = re.compile(r'''(
            Communications\sProvider\sName:\s
            (([A-Z]{3,20})|(\s+))* # between 3 & 20 uppercase letters, or, any number of spaces
            )''', re.VERBOSE)
        outputText = cpRegex.sub('[CP] ', text)
    except:
        logging.debug(f'cp function error')
    return outputText

In [20]:
def scrub_pii_with_nlp(text:str):
    """ use nlp to replace people names with [PERSON]"""
    try:
        nlp_doc = nlp(text)
        final_text = text

        for name in nlp_doc.ents:
            if name.label_ == "PERSON":
                final_text = re.sub(re.escape(name.text), "[PERSON]", final_text)
        outputText = final_text
    except:
        logging.debug(f'cp function error')
    return outputText

In [21]:
def preprocess(inputText:str):
    text1 = scrub_pii_with_nlp(inputText)
    text2 = newline(text1)
    text3 = appointment(text2)
    text4 = contact(text3)
    text5 = phone(text4)
    text6 = email(text5)
    text7 = cp_name(text6)
    text8 = http(text7)
    #print(f"""Input text:\n{inputText}\n\nOutput text:\n{text8}""")
    return text8

#### test

In [22]:
df = pd.read_csv("/home/jupyter/projects/deemed-consent/input/unified_dc_notes_with_project_notes_context_7days.csv")

In [23]:
inputText= df.llm_context[861]

In [24]:
print(preprocess(inputText))

2023-10-31 - General
ONEA74953151 A L/WEL A END: 31/10/2023 Arrived Onsite at Approx. 10:30hrs at KING EDWARD VII HOSPITAL, 54 BEAUMONT STREET, LONDON, W1G 6DW.ONEA74953151 B L/WEL B-END: 31/10/2023 Arrived Onsite at Approx. 10:30hrs at KING EDWARD VII HOSPITAL, 5 10 BEAUMONT STREET, LONDON, W1G 6AA.Onsite Contact: [PERSON][PHONE] was onsite and in attendance for the SURVEY.SITE NOT READY. Onsite Contact: [PERSON][PHONE] has requested OPENREACH put both A END and B END orders on HOLD until their 2 new RESILENT RO2 Circuits: ONEA73955859 and ONEA[PHONE] have been installed and presented in the SECOND LEVEL BASEMENT CO[PERSON]S ROOM.2 new FIBRE TRAYS have been installed in the SECOND LEVEL BASEMENT CO[PERSON]S ROOM RACK and 2 new BFT4 CABLES have been run from the FIBRE TRAYS to 2 new FDP (FIBRE DISTRIBUTION POINTS) CSSP on the wall in the SECOND LEVEL BASEMENT CO[PERSON]S ROOM.The CUSTOMER has no INTAKE DUCT at present and are waiting for the OPENREACH CIVILS BUILT TEAM to lay a new INT