### deemed consent - preprocess project notes

In [1]:
# regex that cleans notes from pii: newline, appointment reference, contact name, phone numbers, email address, http addresses.

#### import packages

In [1]:
import logging
from pathlib import Path

In [2]:
logging.basicConfig(filename='log.txt',level=logging.DEBUG,format=' %asctime)s - %(levelname)s - %(message)s')
logging.disable()
#logging.info(f'Log created')
#logging.debug('start of program')

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth',300) 
import re
#!python -V

In [4]:
import spacy

In [5]:
# load local version of spacy model
nlp = spacy.load("/opt/conda/lib/python3.10/site-packages/en_core_web_lg/en_core_web_lg-3.8.0")

In [7]:
#!pip list

#### notes file

In [8]:
base_path = Path("/home/jupyter/deemed_consent")
data_dir = Path("data")
input_notes_path = base_path / data_dir / "unified_dc_notes_with_project_notes_context_7days.csv"
abbreviation_dictionary_path = base_path / data_dir / "DC_abbreviations.csv"
output_notes_path = base_path / data_dir / "clean_notes_context_7days.csv"

#### load data

sql_query: - https://gitlab.agile.nat.bt.com/DCU/llm_projects/openreach/deemed_consent/-/blob/main/sql/unified_dc_notes_with_project_notes_7day_context.sql

In [9]:
df = pd.read_csv(input_notes_path)

In [10]:
# Notes:
# 1) llm_context - order notes from last 7 days. the last note includes the delay_notes (which is the referral submission).
# 2) reason_code - the reason code
# 3) reason_text - reason text
# 4) delay_status - 'Draft' (remove blank,and,Delay Ref which is a new referral raised by the DC team).

In [11]:
len(df)

1610

#### filter to delay_status = "Draft"

In [12]:
# N.b. "Delay Draft" are new referrals create by the DC team.

In [13]:
draft_df = df[df.delay_status == 'Draft'].reset_index(drop=True)
len(draft_df)

1409

#### remove delay_notes from llm_context

In [14]:
result = []
for index, row in draft_df.iterrows():
    try:
        delayRegex = re.compile(r"""{row['delay_notes']}""", re.VERBOSE)
        textString = delayRegex.sub('', row['llm_context'])
        result.append(textString)
    except:
        logging.debug(f'appointment function error')     
draft_df['context'] = result

#### expand terms

In [15]:
dc_abbreviations = pd.read_csv(abbreviation_dictionary_path)

In [16]:
dc_abbreviations['abbreviation_space'] = " " + dc_abbreviations['abbreviation'] + " "
dc_abbreviations['term_space'] = " " + dc_abbreviations['term'] + " "

In [17]:
dc_dict = dc_abbreviations.set_index('abbreviation_space')['term_space'].to_dict()

In [18]:
scenario = draft_df.context
expanded_llm_context = []
draft_df['expanded_llm_context'] = ""
for k in scenario:
    for o_word, n_word in dc_dict.items():
        k = k.replace(o_word, n_word)
    expanded_llm_context.append(k)
draft_df['expanded_llm_context'] = expanded_llm_context

#### class to remove PPI

In [19]:
class Preprocess:
    def __init__(self, textString):
        self.textString=textString
        
    def __repr__(self):
        return f"{self.textString}"
    
    def set_string(self,textString):
        self.textString = textString
    
    def newline(self):
        """replace 'ZZZZZZ' with '\n'"""
        try:
            newlineRegex = re.compile(r'ZZZZZZ', re.VERBOSE)
            self.textString = newlineRegex.sub('\\n', self.textString)
        except:
            logging.debug(f'newline function error')

    def appointment(self):
        """replace 'Appointment Reference: FF or SS followed by 10 digits' with '<APPOINTMENT-REFERENCE>'"""
        try:
            appointmentRegex = re.compile(r'Appointment\sReference:\s(SS|FF)(\d){10}', re.IGNORECASE|re.VERBOSE)
            self.textString = appointmentRegex.sub('[APPOINTMENT]', self.textString)
        except:
            logging.debug(f'appointment function error')

    def contact(self):
        """replace 'Contact Name: name' with '<CONTACT>'"""
        try:
            contactRegex = re.compile(r'''Contact\sName:\s\w*''', re.IGNORECASE|re.VERBOSE)
            self.textString = contactRegex.sub('[CONTACT]', self.textString)
        except:
            logging.debug(f'appointment function error')

    def phone(self):
        """replace 'phone numbers (with or without UK country code)' with '<PHONE-NUMBER>'"""
        try:
            phoneRegex = re.compile(r'''(
                (44|\+44)?                    # optional country code
                (\s|-|\.)?                    # optional separator
                (\d{3,5}|\(\d{3,5}\))         # first 3 to 5 digits (or 3 to 5 digits in brackets)
                (\s|-|\.)?                    # optional separator
                \d{3,4}                       # 3 to 4 digits
                (\s|-|\.)?                    # optional separator
                \d{3,5}                       # last 3 to 5 digits
                (\s*(ext|x|ext.)\s*\d{2,5})?  # optional extension
                )''',  re.IGNORECASE|re.VERBOSE)
            self.textString = phoneRegex.sub('[PHONE]',self.textString)
        except:
            logging.debug(f'phone function error')

    def email(self):
        """replace 'email address' with '[EMAIL]'"""
        try:
            emailRegex = re.compile(r'''(
                [a-zA-Z0-9._%+-]+      # username
                @                      # @ symbol
                [a-zA-Z0-9.-]+         # domain name
                (\.[a-zA-Z]{2,4})      # dot-something
                )''', re.VERBOSE)
            self.textString = emailRegex.sub('<EMAIL>',self.textString)
        except:
            logging.debug(f'email function error')

    def http(self):
        """replace 'http or http address' with '[HTTP]'"""
        try:
            httpRegex = re.compile(r'''http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+''', re.IGNORECASE|re.VERBOSE)
            self.textString = httpRegex.sub('[HTTP]', self.textString)
        except:
            logging.debug(f'http function error')

    def cp_name(self):
        """replace 'Communications Provider Name:' with '[CP]'"""
        try:
            cpRegex = re.compile(r'''(
                Communications\sProvider\sName:\s
                (([A-Z]{3,20})|(\s+))* # between 3 & 20 uppercase letters, or, any number of spaces
                )''', re.VERBOSE)
            self.textString = cpRegex.sub('[CP] ', self.textString)
        except:
            logging.debug(f'cp function error')
                  
    def scrub_pii_with_nlp(self):
        """ use nlp to replace people names with [PERSON]"""
        try:
            nlp_doc = nlp(self.textString)
            final_text = self.textString

            for name in nlp_doc.ents:
                if name.label_ == "PERSON":
                    final_text = re.sub(re.escape(name.text), "[PERSON]", final_text)
            self.textString = final_text
        except:
            logging.debug(f'cp function error')

In [21]:
# Create an object from the class - and then apply class methods
p = Preprocess(draft_df.expanded_llm_context[20])
p.cp_name()
p.newline()
p.appointment()
p.contact()
p.phone()
p.email()
p.http()
p.scrub_pii_with_nlp()
print(p.textString)

2023-11-30 - General
**FND SERVICE DESK**SI Ref: C74924223 Case resolved CCT ID: ONEA74912017 - [CP] UK LIMITED Case Update Good afternoon [PERSON], I hope you are well. Thank you for your enquiry regarding your appointment.I have looked into your order and I can now confirm the OSC has been forwarded and the appointment is confirmed for the 01/12/2023 at 9am. The appointment is expected to last a day. Please allow 3-5 working days for an update after the scheduled appointment date.Therefore I am pleased to say that this case is now resolved.  Your case will now be closed. For further updates please check the Openreach customer portal and follow the Business as usual process.Kind Regards,[PERSON]

2023-11-30 - Engineering Notes
2023-11-29 13:57:32 Task progressed to Activity Sub Status ACT

2023-11-30 - Engineering Notes
2023-11-30 17:37:07 Task progressed to Activity Sub Status AWI

2023-12-01 - Engineering Notes
2023-11-30 17:37:10 Task progressed to Activity Sub Status ISS

2023-12-

#### process input data

In [22]:
results = []
for i in draft_df.expanded_llm_context:
    p = Preprocess(i)
    p.cp_name()
    p.newline()
    p.appointment()
    p.contact()
    p.phone()
    p.email()
    p.http()
    p.scrub_pii_with_nlp()
    results.append(p.textString)
draft_df['clean_llm_context'] = results

In [23]:
draft_df.head(5)

Unnamed: 0,project_id,siebel_order_number,service_id,delay_id,reason_code,reason_text,event_type,delay_status,delay_notes,actual_delay_start_date,...,event_author,event_timestamp,order_type,order_sub_type,project_start_time,project_end_time,llm_context,context,expanded_llm_context,clean_llm_context
0,J5QVG,OR014-1218039862013,ONEA10475959,1656431,2002,Insufficient Information,Delay Draft,Draft,"Reasonable assistance or information is required from the CP or end-user, including permits to work/dig Assistance or information required : land lords details for wayleave\nAction Owner: building managing agent \nAdditional information: n/a\nWhen do we expect resolution/next update: (Date) \nTh...",,...,701219060,2023-10-19 07:39:47.000000 UTC,Provide,Full Infill,2023-10-16 11:08:15.000000 UTC,2024-08-13 18:56:34.000000 UTC,2023-10-12 - HOLDING FACTOR\nHolding factor 1623903 allocated to EMP_DELAY_CLOSURES\n\n2023-10-12 - Jeopardy\nComplete Site Survey - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nDelay 1623903 SI case C74689480 26/09/23 advisi...,2023-10-12 - HOLDING FACTOR\nHolding factor 1623903 allocated to EMP_DELAY_CLOSURES\n\n2023-10-12 - Jeopardy\nComplete Site Survey - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nDelay 1623903 SI case C74689480 26/09/23 advisi...,2023-10-12 - HOLDING FACTOR\nHolding factor 1623903 allocated to EMP_DELAY_CLOSURES\n\n2023-10-12 - Jeopardy\nComplete Site Survey - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nDelay 1623903 SI case C74689480 26/09/23 advisi...,2023-10-12 - HOLDING FACTOR\nHolding factor 1623903 allocated to EMP_DELAY_CLOSURES\n\n2023-10-12 - Jeopardy\nComplete Site Survey - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nDelay 1623903 SI case C74689480 26/09/23 advisi...
1,J5QYL,OR014-1218323050767,ONEA10775923,1654112,2002,Insufficient Information,Delay Draft,Draft,"Reasonable assistance or information is required from the CP or end-user, including permits to work/dig Assistance or information required :Order delayed as the sister circuit ONEA10776555 is awaiting for the CP to AMEND for the reach option. \nAction Owner: CP \n\nAdditional information: Order ...",,...,612010220,2023-10-17 08:15:13.000000 UTC,Provide,New Provide,2023-10-16 11:16:07.000000 UTC,2023-11-03 17:20:01.000000 UTC,"2023-10-13 - Amend\nAmend Request received for:<b>CP Contact Details</b> - From CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +448005426754, CONTACT EMAILID: b2bfulfilment.ttt@talktalkplc.com, NOTES: to CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +4480...","2023-10-13 - Amend\nAmend Request received for:<b>CP Contact Details</b> - From CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +448005426754, CONTACT EMAILID: b2bfulfilment.ttt@talktalkplc.com, NOTES: to CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +4480...","2023-10-13 - Amend\nAmend Request received for:<b>CP Contact Details</b> - From CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +448005426754, CONTACT EMAILID: b2bfulfilment.ttt@talktalkplc.com, NOTES: to CONTACT NAME: B2B FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: +4480...","2023-10-13 - Amend\nAmend Request received for:<b>CP Contact Details</b> - From [CONTACT] FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: [PHONE], CONTACT EMAILID: <EMAIL>, NOTES: to [CONTACT] FULFILMENT TEAM B2B FULFILMENT TEAM, CONTACT NO: [PHONE], CONTACT EMAILID: <EMAIL>, NOTES: , <b>Approv..."
2,J5QYW,OR013-1220587805901,ONEA74787595,1653777,2002,Insufficient Information,Delay Draft,Draft,"Reasonable assistance or information is required from the CP or end-user, including permits to work/dig Assistance or information required : Permission to work form needs to be signed and returned for Internal cabling ,duct and jointing work at customer site.\nAction Owner: End user/CP\nAddition...",,...,614400647,2023-10-16 18:18:42.000000 UTC,Provide,Full Infill,2023-10-16 11:17:07.000000 UTC,2024-01-30 10:14:56.000000 UTC,2023-10-13 - Jeopardy\nPre-Network Review - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nPre-Network Review - A end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - General\n...,2023-10-13 - Jeopardy\nPre-Network Review - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nPre-Network Review - A end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - General\n...,2023-10-13 - Jeopardy\nPre-Network Review - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nPre-Network Review - A end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - General\n...,2023-10-13 - Jeopardy\nPre-Network Review - B end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - Jeopardy\nPre-Network Review - A end has been set to priority 1 since it has missed its task milestone required by date.\n\n2023-10-13 - General\n...
3,J5QZZ,OR014-1218822248144,ONEA11271137,1669339,2002,Insufficient Information,Delay Draft,Draft,"Reasonable assistance or information is required from the CP or end-user, including permits to work/dig Assistance or information required : Permission to work for duct onsite sent to amit.hargun@dwrcymru.com\nAction Owner: CP\nAdditional information: Please provide signed permission to work for...",,...,602707925,2023-11-03 10:56:30.000000 UTC,Provide,Full Infill,2023-10-16 11:21:08.000000 UTC,2024-03-27 14:54:01.000000 UTC,"2023-10-27 - General\n*** email sent to Wayleave team: Mitchell,J,Jack,BNJC13 R regarding CP ask ***\n\n2023-10-27 - General\nSI Ref: C74797824 Case Acceptance CCT ID: ONEA11271137 - CP Name: Vodafone Limited We have reviewed your case and have detailed the next steps below. Case Update Current...","2023-10-27 - General\n*** email sent to Wayleave team: Mitchell,J,Jack,BNJC13 R regarding CP ask ***\n\n2023-10-27 - General\nSI Ref: C74797824 Case Acceptance CCT ID: ONEA11271137 - CP Name: Vodafone Limited We have reviewed your case and have detailed the next steps below. Case Update Current...","2023-10-27 - General\n*** email sent to Wayleave team: Mitchell,J,Jack,BNJC13 R regarding Communications Provider ask ***\n\n2023-10-27 - General\nSI Ref: C74797824 Case Acceptance CCT ID: ONEA11271137 - Communications Provider Name: Vodafone Limited We have reviewed your case and have detailed...","2023-10-27 - General\n*** email sent to Wayleave team: [PERSON],J,[PERSON],BNJC13 R regarding Communications Provider ask ***\n\n2023-10-27 - General\nSI Ref: C74797824 Case Acceptance CCT ID: ONEA11271137 - [CP] Vodafone Limited We have reviewed your case and have detailed the next steps below..."
4,J5R0Z,OR014-1218580552834,ONEA11045549,1677101,2002,Insufficient Information,Delay Draft,Draft,"Reasonable assistance or information is required from the CP or end-user, including permits to work/dig Assistance or information required : PO number required from CP to cover 500pounds access charge to Cellnex.\nAction Owner: CP\nAdditional information: PO number required to be able to book ac...",,...,601198243,2023-11-13 15:05:19.000000 UTC,Provide,Full Infill,2023-10-16 11:23:29.000000 UTC,2024-03-12 11:35:44.000000 UTC,"2023-11-07 - General\nThe hoist work has been issued as infill on 07/11/23.\n\n2023-11-07 - General\nHoist platform not fixed yet but if its fixed between this afternoon and 10th, this hoist work will be attended.\n\n2023-11-07 - General\nSI Ref: C74755088 Case update CCT ID: ONEA11045549 - CP N...","2023-11-07 - General\nThe hoist work has been issued as infill on 07/11/23.\n\n2023-11-07 - General\nHoist platform not fixed yet but if its fixed between this afternoon and 10th, this hoist work will be attended.\n\n2023-11-07 - General\nSI Ref: C74755088 Case update CCT ID: ONEA11045549 - CP N...","2023-11-07 - General\nThe hoist work has been issued as infill on 07/11/23.\n\n2023-11-07 - General\nHoist platform not fixed yet but if its fixed between this afternoon and 10th, this hoist work will be attended.\n\n2023-11-07 - General\nSI Ref: C74755088 Case update CCT ID: ONEA11045549 - Comm...","2023-11-07 - General\nThe hoist work has been issued as infill on 07/11/23.\n\n2023-11-07 - General\nHoist platform not fixed yet but if its fixed between this afternoon and 10th, this hoist work will be attended.\n\n2023-11-07 - General\nSI Ref: C74755088 Case update CCT ID: ONEA11045549 - [CP]..."


#### save the file

In [24]:
draft_df.to_csv(output_notes_path,index=False)