# Extract Entities from Blue Reports with transformer model

### setup & import packages

In [1]:
# export PIP_INDEX_URL="https://agile.nat.bt.com/nexus/repository/pypi-proxy/simple"

In [2]:
# example using pretrained models:-
# https://gitlab.agile.nat.bt.com/DCU/experimental/agent_comments/-/blob/main/notebooks/Pre-trained%20GCP%20Models.ipynb

In [3]:
from pathlib import Path
import pandas as pd
import time
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing
import threading
import json
import re
from datetime import datetime

%matplotlib inline
pd.set_option('display.max_colwidth', None)

In [4]:
# vertexai transformer models
import vertexai
from vertexai.language_models import TextGenerationModel

In [5]:
# document processing
import docx2txt

In [6]:
# paht of cwd
Path.cwd()

PosixPath('/home/jupyter/projects/cable-theft/notebooks')

#### import llm model

In [7]:
import vertexai
from vertexai.language_models import TextGenerationModel

vertexai.init(project="or-fttp-prope-aif-exp-prod", location="europe-west2")
parameters = {
    "candidate_count": 1,
    "max_output_tokens": 512,
    "temperature": 0,
    "top_p": 1
}
model = TextGenerationModel.from_pretrained("text-bison")

#### import data

In [8]:
# original sharepoint reference for blue reports: https://btgroupcloud.sharepoint.com/teams/ORSecDPA_1641471791611/Shared%20Documents/Forms/AllItems.aspx?e=5%3A51d3fe2077064a81b3e6842c84a3e083&sharingv2=true&fromShare=true&at=9&CID=3c2826b3%2De871%2D4e0a%2D8cb6%2D412db0c2331d&FolderCTID=0x012000372F5DB8C0922341B8571EFD66FD9C86&id=%2Fteams%2FORSecDPA%5F1641471791611%2FShared%20Documents%2FCable%20Thefts%2FBlue%20Reports

In [9]:
# Create a list of files to process
INPUT_PATH = '../data/input/blue_reports/'
file_list = sorted(Path(INPUT_PATH).glob('*.docx'))

In [10]:
# check name exists
file_list[96].name

'GRAVESEND 05.12.23.docx'

#### read a file & convert to text

In [11]:
INPUT_FILE = file_list[96]
text = docx2txt.process(INPUT_FILE)
print(text)

INCIDENT  05/12/23

CAUSE

ACTIONS

FINDINGS/COMMENTS



Thong Lane
Gravesend
Kent
DA12 



Kent Police

Ref: 
DP-57924-23-4646-01



Sim: 23008405





OM: BOS561

	Darrel Speirs

	07484 049678



SAM B0S5 

	Tony Williams

	074368 32609  

	







1 x LEAD 600/0.5
cut and attempt to steal





Box 1 (cut):

///rise.rushed.insect



Box 2 (cut and attempt):

///backup.accent.lived



PCP(s) 33 & 34 affected 

















Cable had previously been cut before theft attempt.

Door to door enquires carried out.

Lid alarm ready to deploy once works have completed.

Route to be snared. 



































                

           

1 cut cable in 2 boxes

Cable cut the night before (05/12/23) and thieves returned on 06/12/23 @ 01:20 to remove cable

Lots of noise and banging woke a number of residents opposite the drag box

1 resident filmed the suspects and this footage has been obtained 

No other footage available 

Leaflets left with remaining neighbours who di

In [12]:
text

'INCIDENT  05/12/23\n\nCAUSE\n\nACTIONS\n\nFINDINGS/COMMENTS\n\n\n\nThong Lane\nGravesend\nKent\nDA12 \n\n\n\nKent Police\n\nRef: \nDP-57924-23-4646-01\n\n\n\nSim: 23008405\n\n\n\n\n\nOM: BOS561\n\n\tDarrel Speirs\n\n\t07484\xa0049678\n\n\n\nSAM B0S5 \n\n\tTony Williams\n\n\t074368\xa032609  \n\n\t\n\n\n\n\n\n\n\n1 x LEAD 600/0.5\ncut and attempt to steal\n\n\n\n\n\nBox 1 (cut):\n\n///rise.rushed.insect\n\n\n\nBox 2 (cut and attempt):\n\n///backup.accent.lived\n\n\n\nPCP(s) 33 & 34 affected \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCable had previously been cut before theft attempt.\n\nDoor to door enquires carried out.\n\nLid alarm ready to deploy once works have completed.\n\nRoute to be snared. \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                \n\n           \n\n1 cut cable in 2 boxes\n\nCable cut the night before (05/12/23) and thieves returned on 06/12/23 @ 01:20 to remove cable\n\nLots of noise and banging woke a number of residents opposite the dr

#### create prompt to process blue reports in a consistent way

In [13]:
prompt = """
        You are an expert extraction and summarisation algorithm. Return all the entities if they are in the text. If an entity is not present in the text return "None".
        Your task is to create a valid json formatted file of the entities from the text below using double quotes "".
        The .json file should contain all of the following list of entities: "Filename", "Incident date", "Incident time", "Postcode", "SIM (or SIR)", "Cable theft", "Police ref", "Captured on CCTV", "Captured on RABit", "Evidence".
        Do not include **, or, " in the entities.
        """

#### use llm to extract entities from the text into a DataFrame

In [14]:
response = model.predict(f"{prompt}\n[Filename: {INPUT_FILE.name}\n{text}]\n", **parameters)
entity_txt = response.text.strip().lstrip(""" ```json\n""").rstrip("""\n```""").replace('\n', '')
str(entity_txt)

'{  "Filename": "GRAVESEND 05.12.23.docx",  "Incident date": "05/12/23",  "Incident time": "01:20",  "Postcode": "DA12",  "SIM (or SIR)": "23008405",  "Cable theft": "Yes",  "Police ref": "DP-57924-23-4646-01",  "Captured on CCTV": "Yes",  "Captured on RABit": "No",  "Evidence": "Footage from one resident"}'

In [15]:
json_object = json.loads(entity_txt)
json_object['Filename']

'GRAVESEND 05.12.23.docx'

In [16]:
def find_entities(model, parameters, prompt, file, text):
    response = model.predict(f"""{prompt}\n[Filename: {file.name}\n{text}]\n""", **parameters)
    entity_txt = response.text.strip().lstrip(""" ```json\n""").rstrip("""\n```""").replace('\n', '')
    json_object = json.loads(entity_txt)
    return json_object

In [None]:
%%time
results = []
for file in file_list[150:170:
    report_text = docx2txt.process(file)
    json_object = find_entities(model, parameters, prompt, file, report_text)
    results.append(json_object)

In [None]:
print(file)

In [None]:
len(results)

In [None]:
#results

In [None]:
results_df = pd.DataFrame(results)

In [None]:
results_df[54:64]

#### clean filename

In [None]:
results_df.Filename = results_df.Filename.str.lower().str.strip().str.replace(r'\n','').str.replace("[/,',(,),#]", "").str.replace('  ', ' ').str.replace(' ', '_').str.replace('__', '_')
results_df.Filename = results_df.Filename.str.lower().str.strip().str.replace(r'(','')
results_df.Filename = results_df.Filename.str.lower().str.strip().str.replace(r')','')

#### clean column names

In [None]:
results_df.columns = results_df.columns.str.lower().str.strip().str.replace(r'\n','').str.replace("[/,',(,),#]", "").str.replace('  ', ' ').str.replace(' ', '_').str.replace('__', '_')
results_df.columns = results_df.columns.str.lower().str.strip().str.replace(r'(','')
results_df.columns = results_df.columns.str.lower().str.strip().str.replace(r')','')

In [None]:
results_df.columns

#### save results

In [None]:
results_df.to_csv('..//data/output/results_df.csv',index=False)
results_df = pd.read_csv('..//data/output/results_df.csv')

#### clean datetime

In [89]:
dates = results_df.incident_date
reobj = re.compile(
    r"""^\s*  # optional whitespace
    (\d{1,2})    # Day
    [-/]     # separator
    (\d{1,2})    # Month
    [-/]     # separator
    (20)?  # century (optional)
    (\d{2})    # years (YY)
    \s*      # optional whitespace""", 
    re.VERBOSE)
fdates = []
i=0
for date in dates:
    found_date=None
    formatted_date=None
    create_date=None
    i=i+1
    try:
        first_date = date.lstrip().split()[0].strip()
        #adjusted_date = reobj.sub(r"\1/\2/20\3", first_date)
        found_date = reobj.findall(first_date)
        if found_date:
            if len(found_date[0])==4:
                create_date = found_date[0][0] + '/' + found_date[0][1] + "/20" + found_date[0][3]  
                formatted_date = datetime.strftime(datetime.strptime(create_date,"%d/%m/%Y"), "%d/%m/%Y")
                fdates.append(formatted_date)
            else:
                # length not 4
                fdates.append(None)
        else:
            fdates.append(None)
    except:
        # date formatting failed
        fdates.append(None)
    print(f"{i} {found_date} {create_date} {formatted_date}")

1 [('18', '06', '20', '23')] 18/06/2023 18/06/2023
2 [('25', '04', '20', '24')] 25/04/2024 25/04/2024
3 [('03', '07', '', '24')] 03/07/2024 03/07/2024
4 [('02', '08', '20', '22')] 02/08/2022 02/08/2022
5 [('01', '08', '20', '22')] 01/08/2022 01/08/2022
6 [('29', '10', '20', '22')] 29/10/2022 29/10/2022
7 [('26', '03', '20', '24')] 26/03/2024 26/03/2024
8 [('08', '11', '20', '23')] 08/11/2023 08/11/2023
9 [('13', '03', '', '24')] 13/03/2024 13/03/2024
10 [('30', '05', '20', '22')] 30/05/2022 30/05/2022
11 [('30', '10', '', '23')] 30/10/2023 30/10/2023
12 [('24', '10', '', '23')] 24/10/2023 24/10/2023
13 [('19', '7', '', '23')] 19/7/2023 19/07/2023
14 [('01', '03', '20', '23')] 01/03/2023 01/03/2023
15 [('19', '5', '', '23')] 19/5/2023 19/05/2023
16 [('29', '06', '20', '24')] 29/06/2024 29/06/2024
17 [('27', '02', '20', '24')] 27/02/2024 27/02/2024
18 [('15', '08', '20', '23')] 15/08/2023 15/08/2023
19 [('02', '08', '20', '23')] 02/08/2023 02/08/2023
20 [('23', '07', '20', '24')] 23/07/2

In [90]:
# recreate as date
results_df['clean_incident_date'] = fdates

In [91]:
results_df['clean_incident_date'] = pd.to_datetime(results_df['clean_incident_date'],dayfirst=True)

#### save results

In [92]:
results_df.to_csv('..//data/output/results_df.csv',index=False)

#### read results

In [95]:
results_df = pd.read_csv('..//data/output/results_df.csv')

In [96]:
results_df.dtypes

filename               object
incident_date          object
incident_time          object
postcode               object
sim_or_sir             object
cable_theft            object
police_ref             object
captured_on_cctv       object
captured_on_rabit      object
evidence               object
clean_incident_date    object
dtype: object

In [97]:
results_df['clean_incident_date'] = pd.to_datetime(results_df['clean_incident_date'],dayfirst=True)

  results_df['clean_incident_date'] = pd.to_datetime(results_df['clean_incident_date'],dayfirst=True)


In [98]:
results_df.dtypes

filename                       object
incident_date                  object
incident_time                  object
postcode                       object
sim_or_sir                     object
cable_theft                    object
police_ref                     object
captured_on_cctv               object
captured_on_rabit              object
evidence                       object
clean_incident_date    datetime64[ns]
dtype: object