# regex to process manual referrals

Reference for the email pattern used below: https://www.regular-expressions.info/email.html

In [1]:
import re
import pandas as pd

In [2]:
# Regex can pick out email addresses and phone numbers, and look for words such as accept, pend, decline.

In [3]:
from pathlib import Path

# Per-file accumulators (one entry per processed file). They stay at module
# level under their original names because a later cell prints them directly.
file = []
text = []
email = []
phone = []
bk = []
vr = []
sira = []
cifas = []
outcome = []

# Compile every pattern once, up front: none of them depends on the file
# being read, so there is no reason to rebuild them on each iteration.
emailRegex = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', re.IGNORECASE)
# Matches phone numbers entered with delimiters (spaces, dots, brackets, etc.).
# NOTE(review): the pattern also accepts bare digit runs, so it picks up
# reference numbers as "phone numbers"; treat the phone column as unreliable.
# The leading \b also means a leading "+" is never part of the match.
phoneRegex = re.compile(r"\b\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b")
bkRegex = re.compile(r"""\b(bk|bk's|bks)\b""", re.IGNORECASE)
vrRegex = re.compile(r'\b(VR)\b', re.IGNORECASE)
cifasRegex = re.compile(r'\b(cifas|cifa)\b', re.IGNORECASE)
siraRegex = re.compile(r'\b(sira)\b', re.IGNORECASE)
outcomeRegex = re.compile(
    r'\b(accept|accepted|pend|pended|pending|decline|declined|reject|rejected)\b',
    re.IGNORECASE,
)


def _first_match(regex, haystack):
    """Return the text of the first match of ``regex`` in ``haystack``, or None."""
    found = regex.search(haystack)
    return found.group(0) if found is not None else None


for p in Path('./data').glob('*.txt'):
    # Read the whole file, then release the handle before doing any matching.
    with p.open() as f:
        file_text = f.read()

    file.append(p.name)
    text.append(file_text)

    # Single-valued fields: first hit only, None when absent.
    email.append(_first_match(emailRegex, file_text))
    phone.append(_first_match(phoneRegex, file_text))

    # Keyword fields: every occurrence, in order of appearance.
    bk.append(bkRegex.findall(file_text))
    vr.append(vrRegex.findall(file_text))
    cifas.append(cifasRegex.findall(file_text))
    sira.append(siraRegex.findall(file_text))
    outcome.append(outcomeRegex.findall(file_text))

# Build the table once, after the loop. Previously it was rebuilt for every
# file and was left undefined when ./data contained no .txt files.
df = pd.DataFrame({
    'file': file,
    'text': text,
    'email': email,
    'phone': phone,
    'bk': bk,
    'vr': vr,
    'cifas': cifas,
    'sira': sira,
    'outcome': outcome,
})
        

In [4]:
# Display the table assembled from the referral files.
df

Unnamed: 0,file,text,email,phone,bk,vr,cifas,sira,outcome
0,211857920.txt,CA05\nSIMO\nIND CAT 6 AT CURRENT 03/20 088A346...,,208061851.0,[],[VR],[],[],[DECLINE]
1,211857897.txt,211857897\nNumber of Disconnect Rules Hit\nvr ...,,211857897.0,[BKs],[vr],[],"[sira, sira]",[pended]
2,211858038.txt,MISS XXXXX XXXXXX \n211858010\nXXXXXXXXXXXXXXX...,XXXXXXXXXXXXXXX@icloud.com,211858010.0,"[BK, BK]",[Vr],[],"[Sira, Sira]","[decline, Decline]"
3,211857895.txt,211857895\nR047 Ind CIFAS \nXXXXXXXX@yahoo.com...,XXXXXXXX@yahoo.com,211857895.0,[BK],"[VR, VR]",[CIFAS],[],[Declined]
4,211858010.txt,MISS XXXXX XXXXXX \n211858010\nXXXXXXXXXXXXXXX...,XXXXXXXXXXXXXXX@icloud.com,211858010.0,"[BK, BK]",[Vr],[],"[Sira, Sira]","[decline, Decline]"
5,211858093.txt,R07 Address not found at bureau\n211858093\ncu...,,211858093.0,[bk],"[vr, vr]",[],[],"[accepted, accepted, declined]"
6,211857988.txt,R47 Indirect CIFAS Hit \n211857988\ncifas ok \...,,211857988.0,[bk],[vr],"[CIFAS, cifas]",[],[declined]
7,211858131.txt,211858131\nR046 Direct CIFAS \nR053 SIRA\nXXXX...,XXXXXXXXXXXXXXXXXX@outlook.com,211858131.0,[BK],"[VR, VR]",[CIFAS],"[SIRA, SIRA]",[Declined]
8,211858016.txt,211858016\nR057 TAA\nTAA on app 20yrs \nTAA on...,,211858016.0,[],"[VR, VR, VR]",[],[],[]
9,211858065.txt,R47 Indirect CIFAS Hit\nprev address same as a...,,,[],[vr],"[CIFAS, cifas]",[],[]


In [5]:
# Dump each raw accumulator list on its own line, in the same order as before.
for column_values in (email, file, phone, bk, vr, sira, cifas, outcome):
    print(column_values)

[None, None, 'XXXXXXXXXXXXXXX@icloud.com', 'XXXXXXXX@yahoo.com', 'XXXXXXXXXXXXXXX@icloud.com', None, None, 'XXXXXXXXXXXXXXXXXX@outlook.com', None, None]
['211857920.txt', '211857897.txt', '211858038.txt', '211857895.txt', '211858010.txt', '211858093.txt', '211857988.txt', '211858131.txt', '211858016.txt', '211858065.txt']
['208061851', '211857897', '211858010', '211857895', '211858010', '211858093', '211857988', '211858131', '211858016', None]
[[], ['BKs'], ['BK', 'BK'], ['BK'], ['BK', 'BK'], ['bk'], ['bk'], ['BK'], [], []]
[['VR'], ['vr'], ['Vr'], ['VR', 'VR'], ['Vr'], ['vr', 'vr'], ['vr'], ['VR', 'VR'], ['VR', 'VR', 'VR'], ['vr']]
[[], ['sira', 'sira'], ['Sira', 'Sira'], [], ['Sira', 'Sira'], [], [], ['SIRA', 'SIRA'], [], []]
[[], [], [], ['CIFAS'], [], [], ['CIFAS', 'cifas'], ['CIFAS'], [], ['CIFAS', 'cifas']]
[['DECLINE'], ['pended'], ['decline', 'Decline'], ['Declined'], ['decline', 'Decline'], ['accepted', 'accepted', 'declined'], ['declined'], ['Declined'], [], []]


#### Policy

In [6]:
# Sample referral note: the template headings plus a filled-in 8-digit account
# number, used as input for the id regex in the next cell.
multipleLines = """
summary of CAIS (Experian + Equifax) – 
VR – account 98098789
98098789
Email address - 
A Summary of Fraud checks (Post Code, Bank & Card) If required -
Details of SEC call (if applicable) and questions asked - 
Outcome (accept/pending/decline) -
Summary of why you have taken this decision – 
If pended, what is the reason and what should the caller ask?
Any inconsistencies? I.E insight within TOA at another address. VR within TOA. Etc. 
"""

In [7]:
# Pull every standalone 8-digit number (e.g. the account number) out of the sample text.
# \b on both sides keeps longer digit runs from matching; re.I has no effect on a digits-only pattern.
idRegex = re.compile(r'\b(\d{8})\b', re.I)
match = idRegex.findall(multipleLines)
match

['98098789', '98098789']

#### summary of CAIS

In [8]:
# Blank referral template (headings only), used as input for the summary regex in the next cell.
multipleLines = """
summary of CAIS (Experian + Equifax) – 
VR – 
Email address - 
A Summary of Fraud checks (Post Code, Bank & Card) If required -
Details of SEC call (if applicable) and questions asked - 
Outcome (accept/pending/decline) -
Summary of why you have taken this decision – 
If pended, what is the reason and what should the caller ask?
Any inconsistencies? I.E insight within TOA at another address. VR within TOA. Etc. 
"""

In [9]:
# From each "summary"/"CAIS" keyword, capture the rest of that line up to its last word boundary.
# Gotchas:
#   * under re.VERBOSE, whitespace inside the pattern is ignored, so the literal
#     space after the alternation is NOT required by the match;
#   * the pattern has two capture groups, so findall() returns
#     (whole match, keyword) tuples rather than plain strings;
#   * the trailing \b makes the greedy .* back off to the last word character,
#     which drops trailing punctuation such as ")" or " –".
summaryRegex = re.compile(r'''(
    \b(summary|CAIS) .*\b
    )''', re.IGNORECASE |re.VERBOSE)
match = summaryRegex.findall(multipleLines)
match

[('summary of CAIS (Experian + Equifax', 'summary'),
 ('Summary of Fraud checks (Post Code, Bank & Card) If required', 'Summary'),
 ('Summary of why you have taken this decision', 'Summary')]

#### email addresses - simple & more complicated

In [10]:
# Sample sentence containing four e-mail addresses, used by the two e-mail regex cells below.
line = "should we use regex more often? let me know at  jdsk@bob.com.lol or popop@coco.com mark_holland@gmail.com mark_.+-holland@gmail.com"

In [11]:
# \w matches a single word character (letter, digit or underscore).
# Simple pattern: a local part of word characters plus . + -, an "@",
# a host label, a dot, then the rest of the dotted domain.
match = re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', line)
match

['jdsk@bob.com.lol',
 'popop@coco.com',
 'mark_holland@gmail.com',
 'mark_.+-holland@gmail.com']

In [12]:
# \b asserts a word boundary (the start or end of a word).
# Stricter pattern: explicit character classes and a final label of at least
# two letters; IGNORECASE lets the A-Z ranges cover lowercase as well.
emailRegex = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b',re.IGNORECASE)
match = emailRegex.findall(line)
match

['jdsk@bob.com.lol',
 'popop@coco.com',
 'mark_holland@gmail.com',
 'mark_.+-holland@gmail.com']

#### Contact number

In [13]:
# Earlier sample input, kept for reference:
#line = "should we use regex more often? let me know at Check DOB with  077589330309"
# Two sample numbers: an international form with a leading "+" and a space-delimited one.
line = "+12223334444 , 0775 8933 030"
# Whole-string validator: optional "+", then 8-15 digits with a non-zero first digit.
# NOTE(review): not referenced by any of the visible cells.
validate_phone_number_pattern = "^\\+?[1-9][0-9]{7,14}$"

In [14]:
# This regular expression will match phone numbers entered with delimiters (spaces, dots, brackets, etc.)
# NOTE: the leading \b needs a word character on one side, so a leading "+" is
# never included in the match (see the recorded output: "+12223334444" is
# returned as "12223334444"). Every separator is optional, so plain digit runs match too.
phoneRegex = re.compile("\\b\\+?\\d{1,4}?[-.\\s]?\\(?\\d{1,3}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,9}\\b")

In [15]:
# Collect every phone-like match in the sample line, in order of appearance.
match = phoneRegex.findall(line)
match

['12223334444', '0775 8933 030']

#### Outcome (accept / pend / decline)

In [16]:
# Referral template with outcome words ("accept", "decline", "pended", "pend")
# scattered through it, used as input for the outcome regex in the next cell.
multipleLines = """accept
summary of CAIS (Experian + Equifax) – 
VR – 
Email address - 
A Summary of Fraud checks (Post Code, Bank & Card) If required -
Details of SEC call (if applicable) and questions asked - 
Outcome decline
Summary of why you have taken this decision – 
If pended, what is the reason and what should the caller ask?
Any inconsistencies? I.E insight within TOA at another address. VR within TOA. Etc. pend
"""

In [17]:
# \b asserts a word boundary, so only whole words from the list match.
# Collect every outcome keyword, in any letter case, in order of appearance.
outcomeRegex = re.compile(r'\b(accept|accepted|pend|pended|pending|decline|declined|reject|rejected)\b',re.IGNORECASE)
match = outcomeRegex.findall(multipleLines)
match

['accept', 'decline', 'pended', 'pend']

## Tokeniser

In [18]:
from typing import NamedTuple
import re

class Token(NamedTuple):
    """One lexical token yielded by ``tokenize``."""
    # Token category ('NUMBER', 'ASSIGN', 'END', 'ID', 'OP'), or the keyword itself for reserved words.
    type: str
    # Matched text. NOTE: tokenize() converts NUMBER tokens to int or float, so this is not always a str.
    value: str
    # 1-based number of the line the token starts on.
    line: int
    # 0-based offset of the token from the start of its line.
    column: int

def tokenize(code):
    """Lazily split ``code`` into ``Token`` objects.

    Whitespace and newlines produce no tokens; newlines only advance the line
    counter used for positions. Numeric literals are converted to ``int`` or
    ``float`` (when they contain a '.'), and identifiers that are reserved
    words are emitted with the word itself as their type.

    Raises:
        RuntimeError: on the first character that fits no token rule.
    """
    reserved_words = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    # Order matters: alternatives are tried left to right, MISMATCH is the catch-all.
    lexical_rules = (
        ('NUMBER',   r'\d+(\.\d*)?'),  # integer or decimal literal
        ('ASSIGN',   r':='),           # assignment
        ('END',      r';'),            # statement terminator
        ('ID',       r'[A-Za-z]+'),    # identifier or keyword
        ('OP',       r'[+\-*/]'),      # arithmetic operator
        ('NEWLINE',  r'\n'),           # line break
        ('SKIP',     r'[ \t]+'),       # blanks and tabs
        ('MISMATCH', r'.'),            # anything else
    )
    master_pattern = '|'.join(
        f'(?P<{rule_name}>{rule_regex})' for rule_name, rule_regex in lexical_rules
    )

    current_line = 1
    line_offset = 0  # absolute index at which the current line begins
    for hit in re.finditer(master_pattern, code):
        category = hit.lastgroup
        lexeme = hit.group()

        # Non-token matches first: nothing is yielded for these.
        if category == 'SKIP':
            continue
        if category == 'NEWLINE':
            line_offset = hit.end()
            current_line += 1
            continue
        if category == 'MISMATCH':
            raise RuntimeError(f'{lexeme!r} unexpected on line {current_line}')

        if category == 'NUMBER':
            lexeme = float(lexeme) if '.' in lexeme else int(lexeme)
        elif category == 'ID' and lexeme in reserved_words:
            category = lexeme

        yield Token(category, lexeme, current_line, hit.start() - line_offset)

# Small sample program exercising keywords, identifiers, operators and a decimal literal.
# The string starts with a newline, so the first statement is reported on line 2.
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

# Print each token as it is produced by the generator.
for token in tokenize(statements):
    print(token)

Token(type='IF', value='IF', line=2, column=4)
Token(type='ID', value='quantity', line=2, column=7)
Token(type='THEN', value='THEN', line=2, column=16)
Token(type='ID', value='total', line=3, column=8)
Token(type='ASSIGN', value=':=', line=3, column=14)
Token(type='ID', value='total', line=3, column=17)
Token(type='OP', value='+', line=3, column=23)
Token(type='ID', value='price', line=3, column=25)
Token(type='OP', value='*', line=3, column=31)
Token(type='ID', value='quantity', line=3, column=33)
Token(type='END', value=';', line=3, column=41)
Token(type='ID', value='tax', line=4, column=8)
Token(type='ASSIGN', value=':=', line=4, column=12)
Token(type='ID', value='price', line=4, column=15)
Token(type='OP', value='*', line=4, column=21)
Token(type='NUMBER', value=0.05, line=4, column=23)
Token(type='END', value=';', line=4, column=27)
Token(type='ENDIF', value='ENDIF', line=5, column=4)
Token(type='END', value=';', line=5, column=9)


### Detect Personal Information