## Convert Patterns to Fake Legal Sentences

In [1]:
import pandas as pd
import re
from collections import defaultdict
from pathlib import Path
import pathlib
import time
import random

In [2]:
# Initialise the list of primes once and cache it
def isPrime(x):
    """Return True when ``x`` is a prime number, otherwise False.

    Numbers below 2 (including 0, 1 and negatives) and non-integral values
    are never prime.  Uses trial division up to the square root of ``x``
    with exact arithmetic, instead of counting every divisor up to ``x/2``.
    """
    if x < 2 or x % 1 != 0:
        return False
    divisor = 2
    # A composite number always has a divisor no larger than its square root.
    while divisor * divisor <= x:
        if x % divisor == 0:
            return False
        divisor += 1
    return True

# Half-open range [MIN_PRIME, MAX_PRIME) that is scanned for primes.
MIN_PRIME, MAX_PRIME = 0, 1000

# Every prime in that range, computed once and reused when drawing random seeds.
CACHED_PRIMES = list(filter(isPrime, range(MIN_PRIME, MAX_PRIME)))

In [3]:
# CHANGE_ME BLOCK
# Change Me_1: Excel workbook that holds both the pattern sheet and the named-entity sheet.
INPUT_FILE = r'../PATTERNS/CASE_LAW_COUNSEL/GROUP_MODEL_TEMPLATE_PATTERN_COUNSEL_CL_SEP_BATCH_v3.xlsx'


# Change Me_2: names of the two sheets read from INPUT_FILE.
MODEL_PATTERN_SHEET = 'ModelPatterns'
NAMED_ENTITIES_SHEET = 'NamedEntities'

VAL_TEST_SKIP_STEP = 3 # take every 3rd sentence into validation and test data

# Change Me_3: directory the generated CoNLL files are written to.
CORPUS_DIR_PATH = r'../PATTERNS/CORPUS_GROUPS'
#CORPUS_DIR_PATH = r'../PATTERNS/CORPUS_EXAMPLE'

# Change Me_4: GroupID values whose patterns are turned into fake sentences.
PATTERN_GROUP_LIST = ['G1', 'G2','G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9',
                    'G10', 'G11', 'G12', 'G13', 'G14', 'G15', 'G16', 'G17', 
                    'G18', 'G19', 'G20', 'G21', 'G22', 'G23', 'G24', 'G25', 'G26']
#PATTERN_GROUP_LIST = ['G1']
#PATTERN_GROUP_LIST = ['G2']
#PATTERN_GROUP_LIST = ['G18']

# Change Me_5
#MINI_BATCH_SIZE = 64
MINI_BATCH_SIZE = 16
BATCHES_PER_FILE = 3 # MINI_BATCH_SIZE * BATCHES_PER_FILE sentences per pattern are allotted to each output file (before the val/test skip step)
# NOTE: as written, RANDOM_STATE is a prime drawn at random on every run, so results
# are NOT reproducible across kernel restarts. For reproducible results, assign a
# fixed prime value here instead (the value drawn for this run is printed below).
RANDOM_STATE = random.choice([i for i in CACHED_PRIMES if MIN_PRIME < i < MAX_PRIME])
print(RANDOM_STATE)

647


In [4]:
#from datetime import datetime
#dt = datetime.now()
#dt.microsecond

In [5]:
# Two alternatives: a run of characters that are neither whitespace nor one of the
# listed punctuation marks, OR exactly one of the listed punctuation marks.
# NOTE(review): inside the character class `,-.` is a range from ',' to '.', which
# also covers '-'; '<', '>', '_' and '=' are not listed, so a placeholder such as
# <ATTORNEY_1> stays a single token.
TOKENIZATION_STRING = r"[^\s\t,!\"()\*\+,-.\/:;?@\[\]\\`\{\|\}¶§·¸ʺ‶‟”“˗„\']+|[,!\"()\*\+,-.\/:;?@\[\]\\`\{\|\}¶§·¸ʺ‶‟”“˗„\']"
TOKENIZER = re.compile(TOKENIZATION_STRING)
LABEL_TYPE = 'ner'
OTHER = "other" # default unsupported entity_tag

def custom_tokenizer(sentence):
    """Split ``sentence`` into word and punctuation tokens using ``TOKENIZER``.

    Args:
        sentence: Text to tokenize. ``None``, a float (e.g. the NaN pandas uses
            for a blank spreadsheet cell) or an empty string count as "no text".

    Returns:
        list[str]: The matched tokens in order; an empty list when there is no
        text. The result is always a list, so callers can iterate over it or
        ``' '.join`` it without checking the type first.
    """
    # isinstance also catches float subclasses such as numpy.float64.
    if sentence is None or isinstance(sentence, float) or len(sentence) == 0:
        return []
    return [match.group() for match in TOKENIZER.finditer(sentence)]

def fix_tokenization(sentence):
    """Re-tokenize ``sentence`` and return the tokens joined by single spaces.

    A float input (such as NaN) or an empty input yields an empty string.
    """
    nothing_to_do = type(sentence) == float or len(sentence) == 0
    if nothing_to_do:
        return ""
    return ' '.join(custom_tokenizer(sentence))

# Smoke test: tokenize a sample sentence and print the tokens re-joined with single spaces.
sent = "' A. Douglas Brady, Jr., Judge of the Superior Court .'"
new_sent = ' '.join(custom_tokenizer(sent))
print(new_sent)

def get_entity_code_df_dict():
    """Build a lookup from every EntityCode to the DataFrame of its dictionary workbook.

    Each distinct ``DictionaryFileName`` in ``NER_DF`` is read once from
    ``../DICTIONARIES`` and its shape is printed. All entity codes that name
    that workbook share the same DataFrame object.
    """
    frames_by_code = {}
    for workbook_name in NER_DF.DictionaryFileName.unique():
        workbook_df = pd.read_excel(f'../DICTIONARIES/{workbook_name}')
        print(f'{workbook_df.shape} : {workbook_name}')
        rows_for_workbook = NER_DF[NER_DF.DictionaryFileName == workbook_name]
        # Every code listed against this workbook points at the same frame.
        frames_by_code.update(dict.fromkeys(rows_for_workbook.EntityCode.unique(), workbook_df))
    return frames_by_code

def print_list(obj_list):
    """Print every item of ``obj_list`` followed by a line of 30 tildes."""
    divider = '~'*30
    for item in obj_list:
        print(item, divider, sep='\n')

' A . Douglas Brady , Jr . , Judge of the Superior Court . '


In [6]:
def get_entity(token):
    """Recognise an EntityCode placeholder such as ``<ATTORNEY_1>``.

    Args:
        token: A single token of a pattern.

    Returns:
        tuple[str, bool]: ``(code, True)`` with the angle brackets removed when
        the token is wrapped in ``<`` ... ``>``; otherwise ``(token, False)``
        with the token unchanged.

    The brackets must be the first and last characters. Merely containing both
    characters somewhere is not enough, because stripping the first and last
    character of such a token would discard real text and return a bogus code.
    """
    is_entity_code = len(token) > 2 and token.startswith('<') and token.endswith('>')
    if is_entity_code:
        return token[1:-1], True
    return token, False
    
def is_exception_entity(entity):
    """Return True when ``entity`` consists of more than one whitespace-separated word."""
    return len(entity.split()) > 1

def get_entity_base(entity_phrase):
    """Return the base entity code of ``entity_phrase`` without its numeric suffix.

    The phrase may be a bare code (``LAW_FIRM_5``), a bracketed code
    (``<LAW_FIRM_5>``) or an exception phrase that embeds one bracketed code
    (``Law Office of <ATTORNEY_2>``).
    ASSUMPTION: the phrase contains at most ONE bracketed code; the first one
    found wins. When no word is bracketed, the last word is used as the code.

    Examples: ``LAW_FIRM_5`` -> ``LAW_FIRM``; ``Law Office of <ATTORNEY_2>`` -> ``ATTORNEY``.

    Returns:
        str: The base code, or ``""`` for an empty or whitespace-only phrase
        (which previously raised ``UnboundLocalError``).
    """
    words = entity_phrase.split()
    if not words:
        return ""

    entity = words[-1]  # used when no word is a bracketed code
    for word in words:
        candidate, is_code = get_entity(word)
        if is_code:
            entity = candidate
            break

    parts = entity.split('_')
    if parts[-1].isnumeric():  # LAW_FIRM_1, LAW_FIRM_2
        return "_".join(parts[:-1])
    return entity
        
def get_entity_tag(entity, entity_type, supported=True):
    """Look up the NER tag for ``entity`` in ``NER_DICT``.

    The numeric suffix is dropped first (LAW_FIRM_3 -> LAW_FIRM) and the base
    is used as the dictionary key, e.g. LAW_FIRM -> lawFirm.

    Args:
        entity: Token text or entity code.
        entity_type: Truthy when ``entity`` is an entity code; otherwise the
            result is always ``OTHER``.
        supported: When True, codes whose ``isSupported`` flag is falsy are
            reported as ``OTHER`` as well.
    """
    fallback_tag = OTHER
    entity_base = get_entity_base(entity)

    if not entity_type:
        return fallback_tag

    resolved_tag = NER_DICT[entity_base]['EntityTag']
    if supported and not NER_DICT[entity_base]['isSupported']:
        # account for entity being supported or not for tagging
        resolved_tag = fallback_tag
    return resolved_tag

def ner_parse_pattern(pattern):
    """Tokenize ``pattern`` and pair every token with its NER tag.

    Returns a list of ``(token, ner_tag)`` tuples. Entity placeholders lose
    their angle brackets and are tagged regardless of their ``isSupported``
    flag; every other token is tagged ``OTHER``.
    """
    def _tag(raw_token):
        name, is_code = get_entity(raw_token)
        return (name, get_entity_tag(name, is_code, supported=False))

    return [_tag(raw_token) for raw_token in custom_tokenizer(pattern)]


def get_exceptions_dict(pdf, pattern):
    """Locate every exception phrase of ``pdf`` inside ``pattern``.

    Args:
        pdf: DataFrame with ``ExceptionEntityName`` and ``ExceptionEntityTag``
            columns, one exception phrase per row.
        pattern: The pattern text to search.

    Returns:
        dict[int, tuple]: Start offset -> ``(phrase, tag)`` for every
        occurrence of every phrase (overlapping occurrences included).

    Rows whose phrase is not a non-empty string are skipped. A blank
    spreadsheet cell arrives as float NaN, which ``str`` search methods reject
    with ``TypeError``, and an empty phrase would match at every offset.
    """
    sp_dict = {}
    for _, row in pdf.iterrows():
        entity = row.ExceptionEntityName
        if not isinstance(entity, str) or not entity:
            continue
        entity_tag = row.ExceptionEntityTag
        pos = pattern.find(entity)
        while pos != -1:
            sp_dict[pos] = (entity, entity_tag)
            # Advance by one, not len(entity), so overlapping matches are kept.
            pos = pattern.find(entity, pos + 1)
    return sp_dict

def get_exceptions_ner_list(sp_dict, pattern):
    """Turn ``pattern`` into ``(token, tag)`` pairs, keeping exception phrases whole.

    ``sp_dict`` maps a start offset in ``pattern`` to ``(phrase, tag)``. The
    offsets are walked in ascending order: text between two phrases is parsed
    with ``ner_parse_pattern``, each phrase is emitted as one pair, and text
    after the last phrase is parsed the same way.
    """
    tagged = []
    cursor = 0
    for offset in sorted(sp_dict):
        entry = sp_dict[offset]
        phrase, phrase_tag = entry[0], entry[1]
        if cursor < offset:
            # Ordinary text that precedes this exception phrase.
            tagged.extend(ner_parse_pattern(pattern[cursor:offset]))
        tagged.append((phrase, phrase_tag))
        cursor = offset + len(phrase)

    if cursor < len(pattern):
        tagged.extend(ner_parse_pattern(pattern[cursor:]))

    return tagged
    
def ner_parse_pattern_df(pdf):
    """Parse the pattern of ``pdf`` into ``(token, ner_tag)`` pairs.

    The pattern is taken from the first row. The phrases listed in the
    ExceptionEntityName column are located in it (the offsets found are
    printed) and are kept as single tokens in the result.
    """
    template = pdf.iloc[0].PATTERN

    # Offsets of the exception phrases split the pattern into sub-patterns.
    exception_spans = get_exceptions_dict(pdf, template)
    print('~'*30)
    print(exception_spans)
    return get_exceptions_ner_list(exception_spans, template)
    

def get_sample_name_list(base, replace = True, how_many=64, random_state=RANDOM_STATE, fixed = False):
    """Draw ``how_many`` names for entity code ``base`` from its dictionary frame.

    Args:
        base: Entity code; used as the key of ``ENTITY_CODE_DF_DCT`` and as the
            column name inside the frame stored there.
        replace: Sample with replacement (default). Without replacement pandas
            raises ``ValueError`` when fewer than ``how_many`` names exist.
        how_many: Number of names to draw.
        random_state: Seed used only when ``fixed`` is True. The default is the
            value ``RANDOM_STATE`` had when this function was defined.
        fixed: When False (default), ``random_state`` is replaced by a prime
            drawn at random from ``CACHED_PRIMES``.

    Returns:
        list: The sampled names.
    """
    if not fixed:
        random_state = random.choice([i for i in CACHED_PRIMES if MIN_PRIME < i < MAX_PRIME])
    # Dictionary columns have uneven lengths, so blanks must be dropped. This is
    # done on a copy so the shared frame in ENTITY_CODE_DF_DCT is never mutated.
    names = ENTITY_CODE_DF_DCT[base][base].dropna()
    sampled = names.sample(n=how_many, replace=replace, random_state=random_state)
    return list(sampled.values)

def get_fake_entity_names(token, tag, how_many):
    """Produce ``how_many`` fake names to substitute for ``token``.

    Example: (ATTORNEY_1, attorney) yields a list of ``how_many`` sampled
    attorney names. For a multi-word exception phrase such as
    ``Law Office of <ATTORNEY_2>`` (assumed to hold exactly one <entity>) each
    sampled name replaces the bracketed code and the surrounding words are kept.
    ``tag`` is accepted but not used.
    """
    base = get_entity_base(token)
    sampled_names = get_sample_name_list(base, replace = True, how_many=how_many, random_state=RANDOM_STATE, fixed = False)

    if not is_exception_entity(token):
        return sampled_names

    # Text before '<' and after '>' frames every substituted name.
    lead = token[0:token.find('<')].strip()
    trail = token[token.find('>') + 1:].strip()
    return [
        (lead + ' ' + ' '.join(name.split()).strip() + ' ' + trail).strip()
        for name in sampled_names
    ]
        
def generate_fake_sentences(ner_token_list, how_many = MINI_BATCH_SIZE):
    """Instantiate a tagged pattern ``how_many`` times with fake entity names.

    One list of ``how_many`` names is drawn per distinct ``(token, tag)``
    entity pair. Sentence ``i`` uses the ``i``-th name of each list; tokens
    that are not entities are copied unchanged. Returns a list of sentences,
    each a list of ``(text, tag)`` pairs.
    """
    substitutes = defaultdict(list)
    for (code, code_tag) in get_unique_entity_list(ner_token_list):
        substitutes[(code, code_tag)] = get_fake_entity_names(code, code_tag, how_many=how_many)

    sentences = []
    for sent_idx in range(how_many):
        sentence = []
        for (tok, tok_tag) in ner_token_list:
            pair = (tok, tok_tag)
            if pair in substitutes:
                sentence.append((substitutes[pair][sent_idx], tok_tag))
            else:
                sentence.append(pair)
        sentences.append(sentence)

    return sentences

def get_unique_entity_list(ner_token_list):
    """Extract the unique (ENTITY, EntityTag) pairs of a sentence template.

    Pairs tagged ``OTHER`` are ignored. The result keeps first-appearance
    order. The earlier ``set`` round-trip made the order depend on string
    hashing, which varies between interpreter runs.

    Returns:
        list[tuple]: Distinct ``(token, tag)`` pairs; empty when there are none.
    """
    entity_pairs = [(token, tag) for (token, tag) in ner_token_list if tag != OTHER]
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(entity_pairs))

In [7]:
# Test the function
#entity_phrase = 'Office of <ATTORNEY_2> Law Firm'
#base = get_entity_base(entity_phrase)
#print (base)
#entity_name_list = get_fake_entity_names(token=entity_phrase, tag='lawFirm', how_many=1)
#print(entity_name_list)

In [29]:
from collections import defaultdict
def _process_pattern_groups(pattern_df, parse_pattern, pg_list, merge, pattern_src, conll,
                            echo_token_list=False):
    """Shared driver behind ``process_non_special_patterns`` and ``process_special_patterns``.

    Args:
        pattern_df: DataFrame with at least ``GroupID`` and ``PATTERN`` columns.
        parse_pattern: Callable ``(group_df, pattern) -> ner_token_list`` that
            turns one pattern into ``(token, tag)`` pairs.
        pg_list: GroupID values to process, in order.
        merge: When True, every group writes to the same three files, named
            after all group IDs joined by ``_``.
        pattern_src: Middle part of the output file names.
        conll: When True, the sentences are appended to the train/val/test files.
        echo_token_list: When True, the token list is also printed on its own
            line before the labelled print.

    Returns:
        defaultdict: pattern text -> list of generated fake sentences.
    """
    pattern_sent_dict = defaultdict(list)
    print(len(pg_list), " : The Number of Pattern Groups to Process.")
    for pattern_group in pg_list:
        print("="*50)
        print(pattern_group, ": Processing this pattern group")
        group_df = pattern_df[pattern_df.GroupID == pattern_group]
        pattern_list = group_df.PATTERN.unique()
        print(pattern_group, ": The Number of Unique Patterns : ", len(pattern_list))

        file_prefix = '_'.join(pg_list) if merge else pattern_group

        file_names = [f'{file_prefix}_{pattern_src}_train_iobes.txt',
                      f'{file_prefix}_{pattern_src}_val_iobes.txt',
                      f'{file_prefix}_{pattern_src}_test_iobes.txt']

        # Validation and Test data set do not have to be large for each pattern:
        # train keeps every sentence of its share, val/test keep every
        # VAL_TEST_SKIP_STEP-th one.
        skip_steps = [1, VAL_TEST_SKIP_STEP, VAL_TEST_SKIP_STEP]

        no_files = len(file_names)
        # Total sentences generated per pattern; each file gets an equal share.
        how_many = MINI_BATCH_SIZE * no_files * BATCHES_PER_FILE
        print('HOW MANY = ', how_many)

        for pid, pattern in enumerate(pattern_list):
            print('='*30)
            print(pattern_group, ": ", pid, ": PATTERN TEMPLATE: \n\t", pattern)
            print('-'*30)

            ner_token_list = parse_pattern(group_df, pattern)
            if echo_token_list:
                print(ner_token_list)
            print(pid, '\tNER TOKEN LIST: ', ner_token_list)

            fake_tokenized_sent_list = generate_fake_sentences(ner_token_list, how_many=how_many)
            pattern_sent_dict[pattern] = fake_tokenized_sent_list

            if conll:
                # NOTE: the number printed here is the total for the pattern, across all files.
                print(len(fake_tokenized_sent_list), ' fake sentences per file and per pattern')
                start = 0
                sent_per_file = len(fake_tokenized_sent_list) // no_files
                print(pid, "\tPreparing CONLL file format for writing...")
                for file_name, skip in zip(file_names, skip_steps):
                    print(pid, "\t",file_name)
                    end = start + sent_per_file
                    conll_text = convert_fake_sent_to_CONLL_sentence(fake_tokenized_sent_list[start:end:skip])
                    # Append mode: several patterns and groups accumulate in one file.
                    write_to_conll_file(conll_text,  file_name,  dir_path=CORPUS_DIR_PATH, mode="a", verbose = False)
                    start = end

    return pattern_sent_dict


def process_non_special_patterns(pg_list = PATTERN_GROUP_LIST, 
                                 merge = False, pattern_src = "cl_counsel", 
                                 conll=True):
    """For a given pattern groupID from the pg_list, generate fake sentences and save them in conll format.

    Reads patterns from ``NON_SPECIAL_PATTERN_DF`` and parses each one with
    ``ner_parse_pattern``.
    merge = True : combine output into a single file for different groupIDs
    merge = False: do not combine into a single conll file
    Returns a dict mapping each pattern to its generated sentences.
    """
    return _process_pattern_groups(
        NON_SPECIAL_PATTERN_DF,
        lambda group_df, pattern: ner_parse_pattern(pattern),
        pg_list, merge, pattern_src, conll,
        echo_token_list=True,
    )


def process_special_patterns(pg_list = PATTERN_GROUP_LIST, 
                             merge = False, pattern_src = "cl_counsel", 
                             conll=True):
    """For a given pattern groupID from the pg_list, generate fake sentences and save them in conll format.

    Reads patterns from ``SPECIAL_PATTERN_DF``. Each pattern is parsed with
    ``ner_parse_pattern_df`` on the rows that carry it, so its exception
    phrases are kept as single tokens.
    merge = True : combine output into a single file for different groupIDs
    merge = False: do not combine into a single conll file
    Returns a dict mapping each pattern to its generated sentences.
    """
    return _process_pattern_groups(
        SPECIAL_PATTERN_DF,
        lambda group_df, pattern: ner_parse_pattern_df(group_df[group_df.PATTERN == pattern]),
        pg_list, merge, pattern_src, conll,
    )

In [9]:
# CONLL-related methods
def process_entity_segment(entity, entity_tag):
    """Render an entity as IOBES-tagged CoNLL lines, one ``token tag`` line per word.

    A single-word entity gets ``S-<tag>``. A multi-word entity gets ``B-`` on
    the first word, ``I-`` on the inner words and ``E-`` on the last word.

    Args:
        entity: Entity text; it is split on whitespace.
        entity_tag: Tag name appended after the IOBES prefix.

    Returns:
        str: The lines, each ending in a newline; ``""`` when ``entity`` holds
        no words (previously an ``IndexError``).
    """
    token_list = entity.split()
    if not token_list:
        return ""
    if len(token_list) == 1:
        # Use the split word, not the raw text, so stray padding never reaches the file.
        return token_list[0] + ' S-' + entity_tag + '\n'

    lines = [token_list[0] + ' B-' + entity_tag]
    lines.extend(token + ' I-' + entity_tag for token in token_list[1:-1])
    lines.append(token_list[-1] + ' E-' + entity_tag)
    return '\n'.join(lines) + '\n'

def process_other_segment(token):
    """Render non-entity text as CoNLL lines: each whitespace-separated word followed by `` O``."""
    return "".join(word + " O" + '\n' for word in token.strip().split())

def get_conll_tag(entity_tag):
    """Return ``entity_tag`` when a supported row of ``NER_DF`` carries it, else ``OTHER``."""
    supported_tags = NER_DF[NER_DF.isSupported == True].EntityTag
    return entity_tag if entity_tag in list(supported_tags) else OTHER
    
def convert_fake_sent_to_CONLL_sentence(fake_tokenized_sent_list):
    """Serialise fake sentences into CoNLL text with IOBES tags.

    Args:
        fake_tokenized_sent_list: List of sentences, each a list of
            ``(text, tag)`` pairs.

    Returns:
        str: One ``token tag`` line per word, with a blank line after every
        sentence; ``""`` for an empty list.

    Each distinct tag is resolved through ``get_conll_tag`` once per call
    rather than once per token, and the pieces are joined at the end instead
    of being concatenated repeatedly.
    """
    resolved_tags = {}
    chunks = []

    for ner_token_list in fake_tokenized_sent_list:
        for token, tag in ner_token_list:
            if tag not in resolved_tags:
                resolved_tags[tag] = get_conll_tag(tag)
            conll_tag = resolved_tags[tag]
            if conll_tag == OTHER:
                chunks.append(process_other_segment(token.strip()))
            else:
                chunks.append(process_entity_segment(token.strip(), conll_tag))
        chunks.append('\n')  # a blank line separates sentences

    return "".join(chunks)

def write_to_conll_file(text, file_name="conll_iobes.txt", dir_path=CORPUS_DIR_PATH, mode = 'a', verbose = False):
    """Write ``text`` to ``dir_path/file_name`` as UTF-8.

    Args:
        text: Content to write.
        file_name: Name of the target file inside ``dir_path``.
        dir_path: Target directory. It is created, together with any missing
            parent directories, when it does not exist yet.
        mode: File mode passed to ``open``; the default ``'a'`` appends.
        verbose: When True, the written path is printed.
    """
    abs_path = pathlib.Path(dir_path).absolute()
    # parents=True: a missing parent no longer raises; exist_ok=True: no exists-then-create race.
    abs_path.mkdir(parents=True, exist_ok=True)
    file_path = abs_path.joinpath(file_name)

    with open(file_path, mode, encoding = "utf-8") as the_file:
        the_file.write(text)

    if verbose: print("Saved: ", file_path)
    return

### Step-1: Read and Filter Model Pattern File

In [10]:
# Load the two sheets of the input workbook: the sentence patterns and the
# named-entity definitions (EntityCode, EntityTag, isSupported, DictionaryFileName, ...).
MODEL_PATTERN_DF = pd.read_excel(INPUT_FILE, sheet_name = MODEL_PATTERN_SHEET)
NER_DF = pd.read_excel(INPUT_FILE, sheet_name = NAMED_ENTITIES_SHEET)
NER_DF.tail(7)

Unnamed: 0,Category,EntityCode,EntityTag,isSupported,Example,DictionaryFileName,ToDo
21,Location,STATE_ABBR,state,False,Calif.,US_state_abbrev_names.xlsx,
22,Location,COUNTY,county,False,,US_county_state_names.xlsx,
23,Person,PERSON,person,False,,person_names.xlsx,
24,Person,PERSON_LAST_NAME,person,False,,person_names.xlsx,ADD_COL
25,Employment,GOV_TITLE,employmentTitle,False,,US_employment_titles.xlsx,
26,Organization,PRISON,prison,False,,US_organization.xlsx,
27,Organization,CHURCH,church,False,,US_organization.xlsx,


In [11]:
MODEL_PATTERN_DF.head()

Unnamed: 0,PID,Reviewed,SME,isSpecial,DataError,ExceptionEntityName,ExceptionEntityTag,isFinal,GroupID,PATTERN,PRE_FILLED_PATTERN,Sentence_TEXT,Sentence_ID
0,FP_23,False,True,False,1.0,,,False,,,"ELIZABETH LATIF , Law Offices of For <LITIGANT...","ELIZABETH LATIF , Law Offices of For Defendant...",S29
1,M_96,False,False,False,0.0,,,False,,,For <STATE_NAME=Alabama> Department of Public ...,"For Alabama Department of Public Health , Denn...",S132
2,M_99,True,True,False,0.0,,,False,,,"For <LITIGANT_PERSON> , Secretary of U . S . D...","For ALEX M . AZAR II , Secretary of U . S . De...",S135
3,M_100,True,True,False,0.0,,,False,,,"For <LITIGANT_PERSON> , Acting Commissioner , ...","For Andrew M . Saul , Acting Commissioner , So...",S136
4,M_102,True,True,False,0.0,,,False,,,"For <LITIGATN_PERSON> , Commissioner of Social...","For Andrew M . Saul , Commissioner of Social S...",S138


In [12]:
# Read every dictionary workbook once and map each EntityCode to its frame.
# NOTE: get_entity_code_df_dict reads NER_DF.EntityCode as a column, so this cell
# must run before the later cell that moves EntityCode into the index of NER_DF.
ENTITY_CODE_DF_DCT = get_entity_code_df_dict()

(61002, 10) : US_litigants.xlsx
(61002, 2) : US_attorney_names.xlsx
(218, 7) : US_law_firms.xlsx
(28889, 2) : US_cities_state_names.xlsx
(1, 2) : US_cities_abbrev_names.xlsx
(51, 3) : US_state_abbrev_names.xlsx
(3241, 2) : US_county_state_names.xlsx
(61002, 7) : person_names.xlsx
(30, 1) : US_employment_titles.xlsx
(6, 2) : US_organization.xlsx


In [13]:
ENTITY_CODE_DF_DCT.keys()

dict_keys(['LITIGANT_PERSON', 'LITIGANT_COMPANY', 'LITIGANT_COUNTY', 'LITIGANT_CHURCH', 'LITIGANT_HOSPITAL', 'LITIGANT_INSURANCE_CO', 'LITIGANT_FOUNDATION', 'LITIGANT_ROLE', 'GOV_LITIGANT', 'LITIGANT_LAW_FIRM', 'ATTORNEY', 'LAW_FIRM', 'ODD_LAW_FIRM', 'UNIQUE_LAW_FIRM', 'GOVT_ORG_AS_LAW_FIRM', 'AMICUS_CURAE_LAW_FIRM', 'FOUNDATION_LAW_FIRM', 'CITY', 'CITY_ABBR', 'STATE', 'STATE_NAME', 'STATE_ABBR', 'COUNTY', 'PERSON', 'PERSON_LAST_NAME', 'GOV_TITLE', 'PRISON', 'CHURCH'])

In [14]:
ENTITY_CODE_DF_DCT['LITIGANT_ROLE'].LITIGANT_ROLE[0:10]

0                 APPELLANT
1                 Appellant
2                 appellant
3                APPELLANTS
4                Appellants
5                appellants
6    APPELLANT / PETITIONER
7    Appellant / Petitioner
8    appellant / petitioner
9    APPELLANT - PETITIONER
Name: LITIGANT_ROLE, dtype: object

In [15]:
ENTITY_CODE_DF_DCT['LAW_FIRM'].head()

Unnamed: 0,LAW_FIRM,UNIQUE_LAW_FIRM,ODD_LAW_FIRM,FOUNDATION_LAW_FIRM,AMICUS_CURAE_LAW_FIRM,GOVT_ORG_AS_LAW_FIRM,LawFirmPattern
0,Traub Lieberman Straus and Shrewsberry LLP,Pacific Merchant Shipping Association,AARP Foundation Litigation,ACLU of Oklahoma Foundation,FREEDOM FOUNDATION,United States Department of Justice,"Law Office of <ATTORNEY> , Attorney at Law"
1,"Joseph , Aleem & Slowik LLC",Office of Disciplinary Counsel,Adams and Reese,FREEDOM FOUNDATION,Freedom from Religion Foundation,"Office of Immigration Litigation , Department ...","The Law Offices of <ATORNEY> , <ATTORNEY>"
2,Lowe and Reilly,Social Security Administration,AKERMAN SENTERFITT,Freedom from Religion Foundation,GENDER JUSTICE,"U . S . Department of Justice , Civil Division...",
3,"Foland , Wickens , Roper , Hofer & Crawford PC",State Public Defender Office,AKERMAN SENTERFITT EIDSON,Hindu American Foundation,Hindu American Foundation,"United States Department of Justice , Office o...",
4,AKERMAN LLP,,Akin Gump,Pacific Legal Foundation,Institute for Free Speech,"U . S . Department of Justice , Office of Immi...",


In [16]:
MODEL_PATTERN_DF.head()

Unnamed: 0,PID,Reviewed,SME,isSpecial,DataError,ExceptionEntityName,ExceptionEntityTag,isFinal,GroupID,PATTERN,PRE_FILLED_PATTERN,Sentence_TEXT,Sentence_ID
0,FP_23,False,True,False,1.0,,,False,,,"ELIZABETH LATIF , Law Offices of For <LITIGANT...","ELIZABETH LATIF , Law Offices of For Defendant...",S29
1,M_96,False,False,False,0.0,,,False,,,For <STATE_NAME=Alabama> Department of Public ...,"For Alabama Department of Public Health , Denn...",S132
2,M_99,True,True,False,0.0,,,False,,,"For <LITIGANT_PERSON> , Secretary of U . S . D...","For ALEX M . AZAR II , Secretary of U . S . De...",S135
3,M_100,True,True,False,0.0,,,False,,,"For <LITIGANT_PERSON> , Acting Commissioner , ...","For Andrew M . Saul , Acting Commissioner , So...",S136
4,M_102,True,True,False,0.0,,,False,,,"For <LITIGATN_PERSON> , Commissioner of Social...","For Andrew M . Saul , Commissioner of Social S...",S138


In [17]:
# Index the entity table by EntityCode and derive the per-code lookup dictionary.
# The guard makes the cell safe to re-run: once EntityCode is the index it is no
# longer a column, and calling set_index on it a second time raises KeyError.
if NER_DF.index.name != 'EntityCode':
    NER_DF = NER_DF.set_index('EntityCode')
NER_DICT = NER_DF.to_dict(orient='index')
print(NER_DICT['ATTORNEY'])
NER_DF.head()

{'Category': 'Attorney', 'EntityTag': 'attorney', 'isSupported': True, 'Example': nan, 'DictionaryFileName': 'US_attorney_names.xlsx', 'ToDo': nan}


Unnamed: 0_level_0,Category,EntityTag,isSupported,Example,DictionaryFileName,ToDo
EntityCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LITIGANT_PERSON,Litigant,litigant,False,,US_litigants.xlsx,
LITIGANT_COMPANY,Litigant,litigant,False,,US_litigants.xlsx,
LITIGANT_COUNTY,Litigant,litigant,False,,US_litigants.xlsx,
LITIGANT_CHURCH,Litigant,litigant,False,,US_litigants.xlsx,
LITIGANT_HOSPITAL,Litigant,litigant,False,,US_litigants.xlsx,


In [18]:
print(MODEL_PATTERN_DF.columns)

Index(['PID', 'Reviewed', 'SME', 'isSpecial', 'DataError',
       'ExceptionEntityName', 'ExceptionEntityTag', 'isFinal', 'GroupID',
       'PATTERN', 'PRE_FILLED_PATTERN', 'Sentence_TEXT', 'Sentence_ID'],
      dtype='object')


In [19]:
# Keep only the rows marked isFinal, restricted to the columns needed downstream.
ROI_COLUMNS = ['isSpecial', 'ExceptionEntityName', 'ExceptionEntityTag', 
               'GroupID','PATTERN']
ERROR_PATTERN_DF = MODEL_PATTERN_DF [MODEL_PATTERN_DF.isFinal == True][ROI_COLUMNS]
print(ERROR_PATTERN_DF.shape)

(303, 5)


In [20]:
#ROI_COLUMNS_2 = ['Reviewed', 'SME', 'DataError', 
#               'isSpecial', 'ExceptionEntityName', 'ExceptionEntityTag', 
#               'GroupID','PATTERN', 'Sentence_TEXT']
#ERROR_PATTERN_DF = MODEL_PATTERN_DF [MODEL_PATTERN_DF.isFinal == True][ROI_COLUMNS_2]
#print(ERROR_PATTERN_DF.shape)

In [21]:
# Collapse identical rows and renumber the index from 0.
ERROR_PATTERN_DF = ERROR_PATTERN_DF.drop_duplicates(ignore_index = True)
print(ERROR_PATTERN_DF.shape)

(266, 5)


In [22]:
ERROR_PATTERN_DF.head()

Unnamed: 0,isSpecial,ExceptionEntityName,ExceptionEntityTag,GroupID,PATTERN
0,False,,,G18,"Attorney General <ATTORNEY_1> , by Special De..."
1,False,,,G13,"<ATTORNEY_1> , county attorney , and <ATTORNE..."
2,False,,,G13,"<ATTORNEY_1> , <ATTORNEY_2> , <ATTORNEY_3> , D..."
3,False,,,G13,"<ATTORNEY_1> , <ATTORNEY_2> , Chief Counsel , ..."
4,False,,,G13,"<ATTORNEY_1> , <GOV_TITLE_1> , <ATTORNEY_2> , ..."


In [23]:
UNIQUE_PATTERN_LIST = ERROR_PATTERN_DF.PATTERN.unique()
print(len(UNIQUE_PATTERN_LIST))

257


In [24]:
# Split the final patterns into unambiguous (non-special) ones and special ones,
# i.e. those flagged isSpecial, which are parsed with their exception phrases.
# NOTE(review): a row whose isSpecial is neither True nor False (e.g. a blank cell)
# would land in neither frame — confirm the column has no missing values.
NON_SPECIAL_PATTERN_DF = ERROR_PATTERN_DF[ERROR_PATTERN_DF.isSpecial == False]
SPECIAL_PATTERN_DF = ERROR_PATTERN_DF[ERROR_PATTERN_DF.isSpecial == True]
NON_SPECIAL_PATTERN_DF.columns

Index(['isSpecial', 'ExceptionEntityName', 'ExceptionEntityTag', 'GroupID',
       'PATTERN'],
      dtype='object')

In [25]:
print(NON_SPECIAL_PATTERN_DF.shape)
print(SPECIAL_PATTERN_DF.shape)

(204, 5)
(62, 5)


In [26]:
# Examine the patterns
#special_pattern_list = SPECIAL_PATTERN_DF.PATTERN.unique()
#non_special_pattern_list = NON_SPECIAL_PATTERN_DF.PATTERN.unique()

#print('Non-Special Error Patterns:')
#print_list(non_special_pattern_list)
#print('-'*50)
#print('Special Error Patterns:')
#print_list(special_pattern_list)

In [27]:
print(PATTERN_GROUP_LIST)

['G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'G10', 'G11', 'G12', 'G13', 'G14', 'G15', 'G16', 'G17', 'G18', 'G19', 'G20', 'G21', 'G22', 'G23', 'G24', 'G25', 'G26']


In [28]:
# Comment PATTERN_GROUP_LIST here, once debugging is finished
#PATTERN_GROUP_LIST = ['G14', 'G15', 'G16', 'G17', 'G18', 'G19', 'G20', 'G21', 'G22', 'G23', 'G24']
# Generate the corpus for every group: non-special patterns first, then special ones.
# NOTE: both functions write with mode="a", so re-running this cell appends a second
# copy of every sentence to the existing files; clear CORPUS_DIR_PATH before a re-run.
CONLL = True
pattern_sent_dict_non_spec = process_non_special_patterns(pg_list = PATTERN_GROUP_LIST, 
                                                          merge = False, 
                                                          pattern_src = "cl_counsel", conll=CONLL)
print("COMPLETED NON-SPECIAL PATTERNS")

#-------------------------
pattern_sent_dict_special = process_special_patterns(pg_list = PATTERN_GROUP_LIST, 
                                                     merge = False, 
                                                     pattern_src = "cl_counsel", conll=CONLL)
print("COMPLETED SPECIAL PATTERNS")

26  : The Number of Pattern Groups to Process.
G1 : Processing this pattern group
G1 : The Number of Unique Patterns :  5
HOW MANY =  144
G1 :  0 : PATTERN TEMPLATE: 
	 <LITIGANT_PERSON> , in pro . Per. for <LITIGANT_ROLE> .
------------------------------
[('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('in', 'other'), ('pro', 'other'), ('.', 'other'), ('Per', 'other'), ('.', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
0 	NER TOKEN LIST:  [('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('in', 'other'), ('pro', 'other'), ('.', 'other'), ('Per', 'other'), ('.', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
144  fake sentences per file and per pattern
0 	Preparing CONLL file format for writing...
0 	 G1_cl_counsel_train_iobes.txt
0 	 G1_cl_counsel_val_iobes.txt
0 	 G1_cl_counsel_test_iobes.txt
G1 :  1 : PATTERN TEMPLATE: 
	 <LITIGANT_PERSON> , In pro per , and <ATTORNEY> ,  under appointment by the Court of Appeal , for <LITI

2 	 G2_cl_counsel_val_iobes.txt
2 	 G2_cl_counsel_test_iobes.txt
G2 :  3 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <ATTORNEY_1> Attorney at Law PLLC , <CITY_1> , <STATE_1> ; <ATTORNEY_2> , <GOVT_ORG_AS_LAW_FIRM> - <CITY> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('ATTORNEY_1', 'attorney'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), ('PLLC', 'other'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('STATE_1', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('-', 'other'), ('CITY', 'city'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
3 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', '

9 	 G2_cl_counsel_val_iobes.txt
9 	 G2_cl_counsel_test_iobes.txt
G2 :  10 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE> : Prosecuting Attorney <COUNTY> , <COUNTY> Pros . Atty , <CITY> , <STATE> ; <ATTORNEY> , Attorney at Law , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('Prosecuting', 'other'), ('Attorney', 'other'), ('COUNTY', 'county'), (',', 'other'), ('COUNTY', 'county'), ('Pros', 'other'), ('.', 'other'), ('Atty', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), (';', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
10 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('Prosecuting', 'other'), ('Attorney', 'other'), ('COUNTY', 'county'), (',', 'other'), ('COUNTY', 'county'), ('Pros', 'other'), ('.', 'oth

18 	 G2_cl_counsel_val_iobes.txt
18 	 G2_cl_counsel_test_iobes.txt
G2 :  19 : PATTERN TEMPLATE: 
	 Mr . <ATTORNEY> , Attorney at Law , For <LITIGANT_INSURANCE_CO> .
------------------------------
[('Mr', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('LITIGANT_INSURANCE_CO', 'litigant'), ('.', 'other')]
19 	NER TOKEN LIST:  [('Mr', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('LITIGANT_INSURANCE_CO', 'litigant'), ('.', 'other')]
144  fake sentences per file and per pattern
19 	Preparing CONLL file format for writing...
19 	 G2_cl_counsel_train_iobes.txt
19 	 G2_cl_counsel_val_iobes.txt
19 	 G2_cl_counsel_test_iobes.txt
G2 :  20 : PATTERN TEMPLATE: 
	 Mr . <ATTORNEY> , Attorney at Law , For <LITIGANT_LAW_FIRM> , et al .
------------------------------
[('Mr', 'o

28 	 G2_cl_counsel_val_iobes.txt
28 	 G2_cl_counsel_test_iobes.txt
G2 :  29 : PATTERN TEMPLATE: 
	 Mr . <ATTORNEY> , Attorney at Law , For K2 Asia Ventures .
------------------------------
[('Mr', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('K2', 'other'), ('Asia', 'other'), ('Ventures', 'other'), ('.', 'other')]
29 	NER TOKEN LIST:  [('Mr', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('K2', 'other'), ('Asia', 'other'), ('Ventures', 'other'), ('.', 'other')]
144  fake sentences per file and per pattern
29 	Preparing CONLL file format for writing...
29 	 G2_cl_counsel_train_iobes.txt
29 	 G2_cl_counsel_val_iobes.txt
29 	 G2_cl_counsel_test_iobes.txt
G2 :  30 : PATTERN TEMPLATE: 
	 Mr . <ATTORNEY> , Attorney at Law , For Trustee .
----------------------------

37 	 G2_cl_counsel_val_iobes.txt
37 	 G2_cl_counsel_test_iobes.txt
G2 :  38 : PATTERN TEMPLATE: 
	 Ms . <ATTORNEY> , Attorney at Law , For <LITIGANT_HOSPITAL>  .
------------------------------
[('Ms', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('LITIGANT_HOSPITAL', 'litigant'), ('.', 'other')]
38 	NER TOKEN LIST:  [('Ms', 'other'), ('.', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('For', 'other'), ('LITIGANT_HOSPITAL', 'litigant'), ('.', 'other')]
144  fake sentences per file and per pattern
38 	Preparing CONLL file format for writing...
38 	 G2_cl_counsel_train_iobes.txt
38 	 G2_cl_counsel_val_iobes.txt
38 	 G2_cl_counsel_test_iobes.txt
G2 :  39 : PATTERN TEMPLATE: 
	 Ms . <ATTORNEY> , Attorney at Law , For <LITIGANT_PERSON> .
------------------------------
[('Ms', 'other'), ('.', 'other'

0 	 G4_cl_counsel_val_iobes.txt
0 	 G4_cl_counsel_test_iobes.txt
G4 :  1 : PATTERN TEMPLATE: 
	 For Commissioner of Social Security , <LITIGANT_ROLE>  : <ATTORNEY> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('Commissioner', 'other'), ('of', 'other'), ('Social', 'other'), ('Security', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
1 	NER TOKEN LIST:  [('For', 'other'), ('Commissioner', 'other'), ('of', 'other'), ('Social', 'other'), ('Security', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY'

144  fake sentences per file and per pattern
1 	Preparing CONLL file format for writing...
1 	 G5_cl_counsel_train_iobes.txt
1 	 G5_cl_counsel_val_iobes.txt
1 	 G5_cl_counsel_test_iobes.txt
G5 :  2 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE> : <ATTORNEY_1> , <ATTORNEY_2> , <COUNTY> County Prosecutor ' s Office , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('COUNTY', 'county'), ('County', 'other'), ('Prosecutor', 'other'), ("'", 'other'), ('s', 'other'), ('Office', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
2 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('COUNTY', 'county'), ('County', 'other'), ('Prosecutor', 'other'), ("'", 'other'), ('s', 'other'), ('

7 	 G5_cl_counsel_val_iobes.txt
7 	 G5_cl_counsel_test_iobes.txt
G5 :  8 : PATTERN TEMPLATE: 
	 FOR <LITIGANT_ROLE> : <GOVT_ORG_AS_LAW_FIRM> , BY : <ATTORNEY> .
------------------------------
[('FOR', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('BY', 'other'), (':', 'other'), ('ATTORNEY', 'attorney'), ('.', 'other')]
8 	NER TOKEN LIST:  [('FOR', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('BY', 'other'), (':', 'other'), ('ATTORNEY', 'attorney'), ('.', 'other')]
144  fake sentences per file and per pattern
8 	Preparing CONLL file format for writing...
8 	 G5_cl_counsel_train_iobes.txt
8 	 G5_cl_counsel_val_iobes.txt
8 	 G5_cl_counsel_test_iobes.txt
G5 :  9 : PATTERN TEMPLATE: 
	 FOR <LITIGANT_ROLE> : Mr . <ATTORNEY_1> , Mr . <ATTORNEY_2> , <LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('FOR', 'other'), ('LITIGANT_ROLE', 'litigant'), (':'

5 	 G6_cl_counsel_val_iobes.txt
5 	 G6_cl_counsel_test_iobes.txt
G6 :  6 : PATTERN TEMPLATE: 
	 <ATTORNEY> of the <UNIQUE_LAW_FIRM>, <CITY> , attorney and guardian ad litem for minor children .
------------------------------
[('ATTORNEY', 'attorney'), ('of', 'other'), ('the', 'other'), ('UNIQUE_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('attorney', 'other'), ('and', 'other'), ('guardian', 'other'), ('ad', 'other'), ('litem', 'other'), ('for', 'other'), ('minor', 'other'), ('children', 'other'), ('.', 'other')]
6 	NER TOKEN LIST:  [('ATTORNEY', 'attorney'), ('of', 'other'), ('the', 'other'), ('UNIQUE_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('attorney', 'other'), ('and', 'other'), ('guardian', 'other'), ('ad', 'other'), ('litem', 'other'), ('for', 'other'), ('minor', 'other'), ('children', 'other'), ('.', 'other')]
144  fake sentences per file and per pattern
6 	Preparing CONLL file format for writing...
6 	 G6_cl_counsel_tr

3 	 G8_cl_counsel_val_iobes.txt
3 	 G8_cl_counsel_test_iobes.txt
G8 :  4 : PATTERN TEMPLATE: 
	 For USA , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <ATTORNEY_2> , <GOVT_ORG_AS_LAW_FIRM> ( <CITY> ) , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('(', 'other'), ('CITY', 'city'), (')', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
4 	NER TOKEN LIST:  [('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('(', 'other'), ('CITY

8 	 G8_cl_counsel_val_iobes.txt
8 	 G8_cl_counsel_test_iobes.txt
G8 :  9 : PATTERN TEMPLATE: 
	 For USA , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> ,  <CITY> , <STATE> USA ; <CITY> , <GOVT_ORG_AS_LAW_FIRM> ( <CITY> ) ,  <CITY> , <STATE> ; Pretrial Services . , Probation Department . ; <ATTORNEY_2> , <GOVT_ORG_AS_LAW_FIRM> ,  <CITY> , <STATE> ; <ATTORNEY_3> , <GOVT_ORG_AS_LAW_FIRM> (<CITY> ,  <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('USA', 'other'), (';', 'other'), ('CITY', 'city'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('(', 'other'), ('CITY', 'city'), (')', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', '

12 	 G8_cl_counsel_val_iobes.txt
12 	 G8_cl_counsel_test_iobes.txt
G8 :  13 : PATTERN TEMPLATE: 
	 For USA , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , One St . Andrew ' s Plaza , <CITY> , <STATE> ; <ATTORNEY_2> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , SDNY , One Saint Andrew ' s Plaza , <CITY> , <STATE> ; <ATTORNEY_3> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , SDNY ( St Andw ' s ) , One St . Andrew ' s Plaza , <CITY> , <STATE> ; <ATTORNEY_4> , <GOVT_ORG_AS_LAW_FIRM> , SDNY ( St Andw ' s ) , One St . Andrew ' s Plaza , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('One', 'other'), ('St', 'other'), ('.', 'other'), ('Andrew', 'other'), ("'", 'other'), ('s', 'other'), ('Plaza', 'other'), (',', 'other'

16 	 G8_cl_counsel_val_iobes.txt
16 	 G8_cl_counsel_test_iobes.txt
G8 :  17 : PATTERN TEMPLATE: 
	 For USA , <LITIGANT_ROLE> : <ATTORNEY> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> - <CITY> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('-', 'other'), ('CITY', 'city'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
17 	NER TOKEN LIST:  [('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), ('-', 'other'), ('CITY', 'city'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
144  fake sentences per f

144  fake sentences per file and per pattern
23 	Preparing CONLL file format for writing...
23 	 G8_cl_counsel_train_iobes.txt
23 	 G8_cl_counsel_val_iobes.txt
23 	 G8_cl_counsel_test_iobes.txt
G8 :  24 : PATTERN TEMPLATE: 
	 For USA , <LITIGANT_ROLE> : <ATTORNEY> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , Northern District of <STATE_NAME> , Third Floor , <CITY> , <STATE> USA .
------------------------------
[('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('Northern', 'other'), ('District', 'other'), ('of', 'other'), ('STATE_NAME', 'state'), (',', 'other'), ('Third', 'other'), ('Floor', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('USA', 'other'), ('.', 'other')]
24 	NER TOKEN LIST:  [('For', 'other'), ('USA', 'other'), (',', 'other'), ('LITIGANT_RO

1 	 G9_cl_counsel_val_iobes.txt
1 	 G9_cl_counsel_test_iobes.txt
G9 :  2 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , <GOV_TITLE_1> , <ATTORNEY_2> , <GOV_TITLE_2> , <ATTORNEY_3> , <GOV_TITLE_3> , <ATTORNEY_4> and <ATTORNEY_5> , Deputy Attorneys General , for <LITIGANT_ROLE> .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE_2', 'employmentTitle'), (',', 'other'), ('ATTORNEY_3', 'attorney'), (',', 'other'), ('GOV_TITLE_3', 'employmentTitle'), (',', 'other'), ('ATTORNEY_4', 'attorney'), ('and', 'other'), ('ATTORNEY_5', 'attorney'), (',', 'other'), ('Deputy', 'other'), ('Attorneys', 'other'), ('General', 'other'), (',', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
2 	NER TOKEN LIST:  [('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE

5 	 G10_cl_counsel_val_iobes.txt
5 	 G10_cl_counsel_test_iobes.txt
G10 :  6 : PATTERN TEMPLATE: 
	 For <GOV_LITIGANT> , <LITIGANT_ROLE> : <ATTORNEY> , Attorney , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('GOV_LITIGANT', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
6 	NER TOKEN LIST:  [('For', 'other'), ('GOV_LITIGANT', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
6 	Preparing CONLL file format for writing...
6 	 G10_cl_counsel_t

0 	 G12_cl_counsel_val_iobes.txt
0 	 G12_cl_counsel_test_iobes.txt
G12 :  1 : PATTERN TEMPLATE: 
	 For  <LITIGANT_PERSON> , <LITIGANT_ROLE>  : <ATTORNEY_1> , <GOV_TITLE> , <ATTORNEY_2> , <GOV_TITLE> , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
1 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employment

4 	 G12_cl_counsel_val_iobes.txt
4 	 G12_cl_counsel_test_iobes.txt
G12 :  5 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON_1> , also known as :  <LITIGANT_PERSON_2> , <LITIGANT_ROLE>  : <ATTORNEY_1> , <GOV_TITLE_1> , <GOVT_ORG_AS_LAW_FIRM> , <CITY_1> , <STATE> ; <ATTORNEY_2> , <GOV_TITLE_2> , <GOVT_ORG_AS_LAW_FIRM> , <CITY_2> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON_1', 'litigant'), (',', 'other'), ('also', 'other'), ('known', 'other'), ('as', 'other'), (':', 'other'), ('LITIGANT_PERSON_2', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('STATE', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE_2', 'employmentTitle'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY_2', 'city'), ('

7 	 G12_cl_counsel_val_iobes.txt
7 	 G12_cl_counsel_test_iobes.txt
G12 :  8 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON_1> , in his official capacity as Governor of <STATE_NAME> , <LITIGANT_PERSON> , in his official capacity as Attorney General of <STATE_NAME> , <LITIGANT_PERSON_2> , in his official capacity as District Attorney for <STATE_NAME> County , <LITIGANT_PERSON_3> , in his official capacity as District Attorney for Cleveland County , Gary Cox , in his official capacity as <STATE_NAME> Commissioner of Health , <LITIGANT_PERSON_4> , in his official capacity as Director of the <STATE_NAME> Department of Emergency Management , <LITIGANT_ROLE>  : <ATTORNEY_1> , <GOVT_ORG_AS_LAW_FIRM> , <STATE_NAME> City , <STATE> ; <ATTORNEY_2>, <ATTORNEY_3> , <GOVT_ORG_AS_LAW_FIRM>  <STATE_NAME> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON_1', 'litigant'), (',', 'other'), ('in', 'other'), ('his', 'other'), ('official', 'other'), ('capacity', 'other'), ('as', 'ot

10 	 G12_cl_counsel_val_iobes.txt
10 	 G12_cl_counsel_test_iobes.txt
G12 :  11 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , <LITIGANT_ROLE>  : <ATTORNEY_1> , LEAD ATTORNEY , <GOV_TITLE> , <CITY> , <STATE> ; <ATTORNEY_2> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
11 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':

144  fake sentences per file and per pattern
17 	Preparing CONLL file format for writing...
17 	 G12_cl_counsel_train_iobes.txt
17 	 G12_cl_counsel_val_iobes.txt
17 	 G12_cl_counsel_test_iobes.txt
G12 :  18 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , <LITIGANT_ROLE> : <ATTORNEY> , <LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
18 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
18 	Preparing CONLL file format for writing...
18 

24 	 G12_cl_counsel_val_iobes.txt
24 	 G12_cl_counsel_test_iobes.txt
G12 :  25 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , in his official capacity as Administrator of the U . S . Transportation Security Administration , <LITIGANT_ROLE>  : <ATTORNEY> , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('in', 'other'), ('his', 'other'), ('official', 'other'), ('capacity', 'other'), ('as', 'other'), ('Administrator', 'other'), ('of', 'other'), ('the', 'other'), ('U', 'other'), ('.', 'other'), ('S', 'other'), ('.', 'other'), ('Transportation', 'other'), ('Security', 'other'), ('Administration', 'other'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
25 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litiga

0 	 G13_cl_counsel_val_iobes.txt
0 	 G13_cl_counsel_test_iobes.txt
G13 :  1 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , <ATTORNEY_2> , <ATTORNEY_3> , Deputies Corporation Counsel , City and County of <COUNTY> for <LITIGANT_ROLE>  .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('ATTORNEY_3', 'attorney'), (',', 'other'), ('Deputies', 'other'), ('Corporation', 'other'), ('Counsel', 'other'), (',', 'other'), ('City', 'other'), ('and', 'other'), ('County', 'other'), ('of', 'other'), ('COUNTY', 'county'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
1 	NER TOKEN LIST:  [('ATTORNEY_1', 'attorney'), (',', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('ATTORNEY_3', 'attorney'), (',', 'other'), ('Deputies', 'other'), ('Corporation', 'other'), ('Counsel', 'other'), (',', 'other'), ('City', 'other'), ('and', 'other'), ('County', 'other'), ('of', 'other'), ('COUNTY', 'county'), ('for', 'other'), ('LITI

7 	 G13_cl_counsel_val_iobes.txt
7 	 G13_cl_counsel_test_iobes.txt
G13 :  8 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , <GOV_TITLE_1> , <CITY_1>, and <ATTORNEY_2> , <GOV_TITLE_2> , <CITY_2> , for <LITIGANT_ROLE> .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('and', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE_2', 'employmentTitle'), (',', 'other'), ('CITY_2', 'city'), (',', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
8 	NER TOKEN LIST:  [('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('and', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE_2', 'employmentTitle'), (',', 'other'), ('CITY_2', 'city'), (',', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('.', 'other')]
144  fake sentences per file and per pattern


15 	 G13_cl_counsel_val_iobes.txt
15 	 G13_cl_counsel_test_iobes.txt
G13 :  16 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , <GOV_TITLE_1> , attorney for <LITIGANT_ROLE> <STATE_NAME> Civil Service Commission ( <ATTORNEY_2> , <GOV_TITLE_2> , on the statement in lieu of brief ) .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('attorney', 'other'), ('for', 'other'), ('LITIGANT_ROLE', 'litigant'), ('STATE_NAME', 'state'), ('Civil', 'other'), ('Service', 'other'), ('Commission', 'other'), ('(', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE_2', 'employmentTitle'), (',', 'other'), ('on', 'other'), ('the', 'other'), ('statement', 'other'), ('in', 'other'), ('lieu', 'other'), ('of', 'other'), ('brief', 'other'), (')', 'other'), ('.', 'other')]
16 	NER TOKEN LIST:  [('ATTORNEY_1', 'attorney'), (',', 'other'), ('GOV_TITLE_1', 'employmentTitle'), (',', 'other'), ('attorney', 'other'), ('for', 'other'), ('

21 	 G13_cl_counsel_val_iobes.txt
21 	 G13_cl_counsel_test_iobes.txt
G13 :  22 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , Attorney , <GOVT_ORG_AS_LAW_FIRM> ,  argued the cause for appellants . With him on the briefs were <ATTORNEY_2> , <GOV_TITLE> , and <ATTORNEY_3> , Attorney . <ATTORNEY_4> , Attorney , entered an appearance .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('Attorney', 'other'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('argued', 'other'), ('the', 'other'), ('cause', 'other'), ('for', 'other'), ('appellants', 'other'), ('.', 'other'), ('With', 'other'), ('him', 'other'), ('on', 'other'), ('the', 'other'), ('briefs', 'other'), ('were', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('and', 'other'), ('ATTORNEY_3', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('.', 'other'), ('ATTORNEY_4', 'attorney'), (',', 'other'), ('Attorney', 'other'), (',', 'other'),

144  fake sentences per file and per pattern
27 	Preparing CONLL file format for writing...
27 	 G13_cl_counsel_train_iobes.txt
27 	 G13_cl_counsel_val_iobes.txt
27 	 G13_cl_counsel_test_iobes.txt
G13 :  28 : PATTERN TEMPLATE: 
	 <ATTORNEY_1> , Director of Labor Relations , argued the cause for respondent <COUNTY> County Department of Citizen Services ( <ATTORNEY_2> , <COUNTY> County Counsel , attorneys ; <ATTORNEY_3, on the brief ) .
------------------------------
[('ATTORNEY_1', 'attorney'), (',', 'other'), ('Director', 'other'), ('of', 'other'), ('Labor', 'other'), ('Relations', 'other'), (',', 'other'), ('argued', 'other'), ('the', 'other'), ('cause', 'other'), ('for', 'other'), ('respondent', 'other'), ('COUNTY', 'county'), ('County', 'other'), ('Department', 'other'), ('of', 'other'), ('Citizen', 'other'), ('Services', 'other'), ('(', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('COUNTY', 'county'), ('County', 'other'), ('Counsel', 'other'), (',', 'other'), ('attorneys'

33 	 G13_cl_counsel_val_iobes.txt
33 	 G13_cl_counsel_test_iobes.txt
G13 :  34 : PATTERN TEMPLATE: 
	 <ATTORNEY> , <GOV_TITLE> , <GOVT_ORG_AS_LAW_FIRM> , <CITY> , <STATE> , for Amicus Commonwealth of <STATE_NAME> .
------------------------------
[('ATTORNEY', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), (',', 'other'), ('for', 'other'), ('Amicus', 'other'), ('Commonwealth', 'other'), ('of', 'other'), ('STATE_NAME', 'state'), ('.', 'other')]
34 	NER TOKEN LIST:  [('ATTORNEY', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('GOVT_ORG_AS_LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), (',', 'other'), ('for', 'other'), ('Amicus', 'other'), ('Commonwealth', 'other'), ('of', 'other'), ('STATE_NAME', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
34 	Prep

0 	 G16_cl_counsel_val_iobes.txt
0 	 G16_cl_counsel_test_iobes.txt
G17 : Processing this pattern group
G17 : The Number of Unique Patterns :  2
HOW MANY =  144
G17 :  0 : PATTERN TEMPLATE: 
	 For <LITIGANT_COMPANY> , <LITIGANT_PERSON> , <LITIGANT_COMPANY> , <ATTORNEY> , Esq . , <LAW_FIRM>  , <CITY> , <STATE> .
------------------------------
[('For', 'other'), ('LITIGANT_COMPANY', 'litigant'), (',', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_COMPANY', 'litigant'), (',', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Esq', 'other'), ('.', 'other'), (',', 'other'), ('LAW_FIRM', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
0 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_COMPANY', 'litigant'), (',', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_COMPANY', 'litigant'), (',', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('Esq', 'other'), ('.', 'other'), (',', 'other'), ('LAW_

3 	 G18_cl_counsel_val_iobes.txt
3 	 G18_cl_counsel_test_iobes.txt
G19 : Processing this pattern group
G19 : The Number of Unique Patterns :  0
HOW MANY =  144
G20 : Processing this pattern group
G20 : The Number of Unique Patterns :  1
HOW MANY =  144
G20 :  0 : PATTERN TEMPLATE: 
	 Disciplinary Counsel <ATTORNEY_1> and Senior Assistant Disciplinary Counsel <ATTORNEY_2> , both of <CITY> , for the <UNIQUE_LAW_FIRM> .
------------------------------
[('Disciplinary', 'other'), ('Counsel', 'other'), ('ATTORNEY_1', 'attorney'), ('and', 'other'), ('Senior', 'other'), ('Assistant', 'other'), ('Disciplinary', 'other'), ('Counsel', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('both', 'other'), ('of', 'other'), ('CITY', 'city'), (',', 'other'), ('for', 'other'), ('the', 'other'), ('UNIQUE_LAW_FIRM', 'lawFirm'), ('.', 'other')]
0 	NER TOKEN LIST:  [('Disciplinary', 'other'), ('Counsel', 'other'), ('ATTORNEY_1', 'attorney'), ('and', 'other'), ('Senior', 'other'), ('Assistant', 'other'),

4 	 G24_cl_counsel_val_iobes.txt
4 	 G24_cl_counsel_test_iobes.txt
G25 : Processing this pattern group
G25 : The Number of Unique Patterns :  4
HOW MANY =  144
G25 :  0 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE> : <ATTORNEY_1> , Attorney General , and <ATTORNEY_2> , Assistant Attorney General .
------------------------------
[('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('General', 'other'), (',', 'other'), ('and', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('Assistant', 'other'), ('Attorney', 'other'), ('General', 'other'), ('.', 'other')]
0 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('General', 'other'), (',', 'other'), ('and', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('Assistant', 'other'), ('Attorney', 'other'), ('General', 'other'), ('.', 'other')]
144  fake sentenc

2 	 G2_cl_counsel_val_iobes.txt
2 	 G2_cl_counsel_test_iobes.txt
G2 :  3 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <ATTORNEY_1> , Attorney at Law , LLC ,  <CITY_1> , <STATE_1> ; <ATTORNEY_2> , <ATTORNEY_2> , Attorney at Law , <CITY_2> , <STATE_2> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{151: ('<ATTORNEY_2> , Attorney at Law', 'lawFirm')}
3 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('LLC', 'other'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('STATE_1', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('<ATTORNEY_2> , Attorney at Law', 'lawFirm'), (',', 'other'), ('CITY_2', '

0 	 G3_cl_counsel_val_iobes.txt
0 	 G3_cl_counsel_test_iobes.txt
G3 :  1 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON_1> Individually and as Parent and Next Best Friend of , <PERSON_2> , A minor other <PERSON_2> , <LITIGANT_PERSON_3> Individually and as Parent and Next Best Friend of , <PERSON_2> , A minor other <PERSON_2> , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , Attorney at Law , <CITY_1> , <STATE_1> ; <ATTORNEY_2> , LEAD ATTORNEY , The Law Office of <ATTORNEY_2> , PLLC , <CITY_2> , <STATE_2> ; <ATTORNEY_3> , LEAD ATTORNEY , <ATTORNEY_3> Law Firm , PLLC , <CITY_3> , <STATE_3> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{445: ('<ATTORNEY_3> Law Firm , PLLC', 'lawFirm'), 351: ('The Law Office of <ATTORNEY_2> , PLLC', 'lawFirm')}
1 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON_1', 'litigant'), ('Individually', 'other'), ('and', 'other'), ('as', 'other'), ('Parent', 'other'), ('and', 'other'), ('Next', 'other'), ('Best', 'other'), ('Friend', 'other'),

6 	 G3_cl_counsel_val_iobes.txt
6 	 G3_cl_counsel_test_iobes.txt
G3 :  7 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE> ( s ) : <ATTORNEY_1> , Law Offices of <ATTORNEY_1> , <CITY_1>, <STATE_1> ; <ATTORNEY_2> , Attorney at Law , <CITY_2>, <STATE_2> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{43: ('Law Offices of <ATTORNEY_1>', 'lawFirm')}
7 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), ('(', 'other'), ('s', 'other'), (')', 'other'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('Law Offices of <ATTORNEY_1>', 'lawFirm'), (',', 'other'), ('CITY_1', 'city'), (',', 'other'), ('STATE_1', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('at', 'other'), ('Law', 'other'), (',', 'other'), ('CITY_2', 'city'), (',', 'other'), ('STATE_2', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
7 	Preparing CONLL file format for writing...
7 	 G3_cl_counsel_train_iobes.txt
7 	 G3_cl_coun

G7 : Processing this pattern group
G7 : The Number of Unique Patterns :  3
HOW MANY =  144
G7 :  0 : PATTERN TEMPLATE: 
	 For <LITIGANT_PERSON> , individually and as personal representative of the estate of <PERSON_1> , deceased , <LITIGANT_ROLE> : <ATTORNEY_1> , <ATTORNEY_2> , <LAW_FIRM_1> , <CITY_1> , <STATE_1> ; <ATTORNEY_3> , <CITY_2> , <STATE_2> ; <ATTORNEY_4> , <ATTORNEY_4> , Attorney at Law , <CITY_3> , <STATE_3> ; <ATTORNEY_5> , ACLU of <STATE_NAME> Foundation , <CITY> , <STATE> ; <ATTORNEY_6> , <LAW_FIRM_2> , <CITY> , <STATE> ; <ATTORNEY_7> , ACLU of <STATE_NAME> Foundation , <CITY> , <STATE> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{248: ('<ATTORNEY_4> , Attorney at Law', 'lawFirm'), 319: ('ACLU of <STATE_NAME> Foundation', 'lawFirm'), 436: ('ACLU of <STATE_NAME> Foundation', 'lawFirm')}
0 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_PERSON', 'litigant'), (',', 'other'), ('individually', 'other'), ('and', 'other'), ('as', 'other'), ('personal', 'othe

0 	 G9_cl_counsel_val_iobes.txt
0 	 G9_cl_counsel_test_iobes.txt
G10 : Processing this pattern group
G10 : The Number of Unique Patterns :  1
HOW MANY =  144
G10 :  0 : PATTERN TEMPLATE: 
	 For <GOV_LITIGANT> , <LITIGANT_ROLE> : <ATTORNEY_1> , LEAD ATTORNEY , <GOVT_ORG_AS_LAW_FIRM> - <CITY> , Northern District of <STATE> , <CITY> , <STATE> ; <ATTORNEY_2> , <ATTORNEY_3> , <GOVT_ORG_AS_LAW_FIRM> - <CITY> , <CITY> , <STATE> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{70: ('<GOVT_ORG_AS_LAW_FIRM> - <CITY> , Northern District of <STATE>', 'lawFirm')}
0 	NER TOKEN LIST:  [('For', 'other'), ('GOV_LITIGANT', 'litigant'), (',', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY_1', 'attorney'), (',', 'other'), ('LEAD', 'other'), ('ATTORNEY', 'other'), (',', 'other'), ('<GOVT_ORG_AS_LAW_FIRM> - <CITY> , Northern District of <STATE>', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), (';', 'other'), ('ATTORNEY_2', 'attorney'), 

4 	 G12_cl_counsel_val_iobes.txt
4 	 G12_cl_counsel_test_iobes.txt
G12 :  5 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE>  : <ATTORNEY> , <GOV_TITLE> , <STATE_NAME> Department of Justice , <CITY> , <STATE_NAME> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{50: ('<STATE_NAME> Department of Justice', 'lawFirm')}
5 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('GOV_TITLE', 'employmentTitle'), (',', 'other'), ('<STATE_NAME> Department of Justice', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE_NAME', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
5 	Preparing CONLL file format for writing...
5 	 G12_cl_counsel_train_iobes.txt
5 	 G12_cl_counsel_val_iobes.txt
5 	 G12_cl_counsel_test_iobes.txt
G12 :  6 : PATTERN TEMPLATE: 
	 For R . S . , a minor , by her parent and next friend , E . B . , E . B .  , <LITIGANT_ROLE> : <ATTORNEY> , <ATTORNEY> and Asso

1 	 G18_cl_counsel_val_iobes.txt
1 	 G18_cl_counsel_test_iobes.txt
G18 :  2 : PATTERN TEMPLATE: 
	 <ATTORNEY> , Attorney General , OFFICE OF THE ATTORNEY GENERAL OF <STATE_NAME> , <CITY> , <STATE_NAME> , for Amicus State of <STATE_NAME> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{32: ('OFFICE OF THE ATTORNEY GENERAL OF <STATE_NAME>', 'lawFirm')}
2 	NER TOKEN LIST:  [('ATTORNEY', 'attorney'), (',', 'other'), ('Attorney', 'other'), ('General', 'other'), (',', 'other'), ('OFFICE OF THE ATTORNEY GENERAL OF <STATE_NAME>', 'lawFirm'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE_NAME', 'state'), (',', 'other'), ('for', 'other'), ('Amicus', 'other'), ('State', 'other'), ('of', 'other'), ('STATE_NAME', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
2 	Preparing CONLL file format for writing...
2 	 G18_cl_counsel_train_iobes.txt
2 	 G18_cl_counsel_val_iobes.txt
2 	 G18_cl_counsel_test_iobes.txt
G19 : Processing this pattern group
G19 : T

0 	 G25_cl_counsel_val_iobes.txt
0 	 G25_cl_counsel_test_iobes.txt
G25 :  1 : PATTERN TEMPLATE: 
	 For <LITIGANT_ROLE> : <ATTORNEY> , #26033 , <PERSON_LAST_NAME> LAW , <CITY> , <STATE> .
------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
1 	NER TOKEN LIST:  [('For', 'other'), ('LITIGANT_ROLE', 'litigant'), (':', 'other'), ('ATTORNEY', 'attorney'), (',', 'other'), ('#26033', 'other'), (',', 'other'), ('PERSON_LAST_NAME', 'person'), ('LAW', 'other'), (',', 'other'), ('CITY', 'city'), (',', 'other'), ('STATE', 'state'), ('.', 'other')]
144  fake sentences per file and per pattern
1 	Preparing CONLL file format for writing...
1 	 G25_cl_counsel_train_iobes.txt
1 	 G25_cl_counsel_val_iobes.txt
1 	 G25_cl_counsel_test_iobes.txt
G26 : Processing this pattern group
G26 : The Number of Unique Patterns :  0
HOW MANY =  144
COMPLETED SPECIAL PATTERNS


***
__STOP HERE__

In [29]:
def get_text(tokenized_sent):
    """Flatten a tokenized sentence into a single space-separated string.

    Parameters
    ----------
    tokenized_sent : iterable of (entity, tag) pairs
        Only the entity text is used; the tag is ignored.

    Returns
    -------
    str
        Every entity with its surrounding whitespace stripped, each followed
        by one space. A non-empty result therefore ends with a trailing
        space, and an empty input yields "".
    """
    # Whitespace inside an entity is left alone; only the ends are stripped.
    pieces = [token.strip() + ' ' for token, _tag in tokenized_sent]
    return "".join(pieces)
    
# Collect one example sentence per pattern and export the sentence/pattern
# pairs to an Excel sheet for manual inspection.
pattern_list = []
sent_list = []

# Both dictionaries are walked in exactly the same way, so a single loop
# replaces the two copy-pasted ones. The tuple order keeps the special
# patterns first, as before.
for pattern_sent_dict in (pattern_sent_dict_special, pattern_sent_dict_non_spec):
    for key, val in pattern_sent_dict.items():
        # val[0:1] takes at most the first tokenized sentence stored under this
        # key and is simply empty when there is none.
        for tokenized_sent in val[0:1]:
            sent = get_text(tokenized_sent)
            pattern_list.append(key)
            sent_list.append(sent)
            print(sent)
            print('~'*30)

# Name the columns at construction time. Building the frame from zipped rows
# and assigning two column names afterwards raises a length-mismatch
# ValueError when no sentence was collected (the empty frame has 0 columns).
pattern_sent_df = pd.DataFrame({'Sentence_TEXT': sent_list, 'PATTERN': pattern_list})
print(pattern_sent_df.shape)
pattern_sent_df.to_excel("../pattern_sent_examples_4.xlsx")

For State : Mark Boyle , CRIMINAL DISTRICT ATTORNEY ; Rebecca W   Blackmore , CHIEF OF APPELLATE DIVISION , ASSISTANT CRIMINAL DISTRICT ATTORNEY ; John Chamberlain , ASSISTANT CRIMINAL DISTRICT ATTORNEY , MCLEAN COUNTY DISTRICT ATTORNEY ’ S OFFICE JOHNSON LANE , Iowa . 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For The City of SKYLAND ESTATES , a government entity , Defendants - Appellees : Gregory Francis Noonan , LEAD ATTORNEY , HUBBARD City Attorney ' s Office , SKYLAND ESTATES , WY ; JULIE CHRISTINE GREBEL , SKYLAND ESTATES City Attorneys Office , SKYLAND ESTATES , WY . 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For United States Citizenship and Immigration Services , COUNTERDEFENDANT : TRAVIS E   BURNETT , GOVT , LEAD ATTORNEY , US ATTORNEY ' S OFFICE , TOM BEAN , MA . 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Mr . Jason J   George , Assistant Attorney General , For State of West Virginia . 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Mr . Lauren Courtney Dimaggio , General Counsel , For NATIONSTAR MORTGAGE LLC . 
~~~~~~~~~~

In [40]:
# Write the pattern table and the named-entity table to Excel workbooks, then
# print how many templates of each kind are loaded.
# NOTE(review): the frame saved under "ModelPatterns" is ERROR_PATTERN_DF --
# confirm this is the intended source for "all the patterns used to build the model".
pattern_file_path = "../PATTERNS/CORPUS_GROUPS/01_GROUP_TEMPLATES_CASE_LAW_1000.xlsx"
ERROR_PATTERN_DF.to_excel(
    pattern_file_path,
    sheet_name="ModelPatterns",
)

ner_file_path = "../PATTERNS/CORPUS_GROUPS/NamedEntities.xlsx"
NER_DF.to_excel(
    ner_file_path,
    sheet_name="NamedEntities",
)

#SIZE = 13888
#print('RANDOM_STATE = ', RANDOM_STATE)

# Row counts of the two template frames, followed by the length of the
# unique-pattern list.
print(len(NON_SPECIAL_PATTERN_DF.index), "\t: Traditional Templates")
print(len(SPECIAL_PATTERN_DF.index), "\t : Special Templates")
print(len(UNIQUE_PATTERN_LIST), "\t: TOTAL COUNSEL TEMPLATES")
print('-'*30)

#print(SIZE, "\t : Training Data Sentences")
#print(SIZE, "\t : Validation Data Sentences")
#print(SIZE, "\t : Test Data Sentences")
#print(SIZE*3, "\t : TOTAL CORPUS SIZE")

191 	: Traditional Templates
34 	 : Special Templates
216 	: TOTAL COUNSEL TEMPLATES
------------------------------


In [576]:
# Print the keys of ENTITY_CODE_DF_DCT between two 30-dash separator lines.
separator_line = '-'*30
print(separator_line)
print(f'\nEntity Tags Dictionary Keys: \n{ENTITY_CODE_DF_DCT.keys()}')
print(separator_line)

------------------------------

Entity Tags Dictionary Keys: 
dict_keys(['LITIGANT_PERSON', 'LITIGANT_COMPANY', 'LITIGANT_COUNTY', 'LITIGANT_CHURCH', 'LITIGANT_HOSPITAL', 'LITIGANT_INSURANCE_CO', 'LITIGANT_FOUNDATION', 'LITIGANT_ROLE', 'GOV_LITIGANT', 'LITIGANT_LAW_FIRM', 'ATTORNEY', 'LAW_FIRM', 'ODD_LAW_FIRM', 'GOVT_ORG_AS_LAW_FIRM', 'AMICUS_CURAE_LAW_FIRM', 'FOUNDATION_LAW_FIRM', 'CITY', 'CITY_ABBR', 'STATE', 'STATE_NAME', 'STATE_ABBR', 'COUNTY', 'PERSON', 'PERSON_LAST_NAME', 'GOV_TITLE', 'PRISON', 'CHURCH'])
------------------------------


In [22]:
# Echo the RANDOM_STATE value currently held by the kernel.
# NOTE(review): the configuration cell draws RANDOM_STATE with random.choice, so
# the value changes between runs (647 was printed there, 743 here) -- assign a
# fixed prime in the configuration cell if reproducible output is required.
print('RANDOM_STATE = ', RANDOM_STATE)
#process_non_special_patterns(pg_list = ["G1","G3"], merge = True, pattern_src = "cl_counsel")

RANDOM_STATE =  743


In [559]:
#process_special_patterns(pg_list = ["G1"], merge = False, pattern_src = "cl_counsel")

In [24]:
#pattern_group = "G3"
#df = NON_SPECIAL_PATTERN_DF [NON_SPECIAL_PATTERN_DF.GroupID == pattern_group]
#print(df.shape)
#df.head()