In [1]:
import os
import re
import json
import pandas as pd
import numpy as np
import datetime
import itertools
import warnings
from zoning.utils import get_project_root
from zoning.term_extraction.types import RelevantContext

from typing import List, Tuple, Union, Optional

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


sk


# Not Found exploration

In [2]:
# Folder that holds the "not-found" phrase logs, relative to the project root
log_subpath = ('data', 'logs', 'included_context_phrases', 'not-found')
directory = os.path.join(get_project_root(), *log_subpath)

# Display the names of the entries in that folder
os.listdir(directory)

['timestamp=2023-11-14_20-04_town=branford_district=Multifamily-Residence_term=min_lot_size_phrase=1.LOT-AREAUZ_tokens=2000_occurrence=not-found.csv',
 'timestamp=2023-11-14_20-04_town=andover_district=Andover-Lake_term=min_lot_size_phrase=Minimum-Lot-Area-shall-be-as-followsARD--One-Hundr_tokens=2000_occurrence=not-found.csv']

In [3]:
# Read every per-run log (CSV files whose name starts with "timestamp") into
# its own dataframe. The frames are appended directly so that the name `df`
# only ever refers to the final, concatenated result.
dfs = []
for filename in os.listdir(directory):
    if filename.endswith('.csv') and filename.startswith('timestamp'):
        dfs.append(pd.read_csv(os.path.join(directory, filename)))

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory that was searched.
if not dfs:
    raise FileNotFoundError(f"No 'timestamp*.csv' log files found in {directory}")

# Stack all runs into one dataframe with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Rename the columns to lowercase and replace spaces by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
# Unique towns / terms joined with '-' for use in the output file names.
# The values are sorted because iterating a bare set of strings can yield a
# different order on each kernel start (string hashes are randomised per
# process), which would change the file name from run to run.
joined_towns = '-'.join(sorted(set(df.town.to_list())))
joined_terms = '-'.join(sorted(set(df.term.to_list())))


In [5]:
# First and last timestamp present in the logs (min/max over the raw strings)
timestamp_format = '%Y-%m-%d_%H-%M'
earliest_timestamp, latest_timestamp = df.timestamp.min(), df.timestamp.max()

# Parse both ends so the covered time span can be computed
earliest_timestamp_dt = datetime.datetime.strptime(earliest_timestamp, timestamp_format)
latest_timestamp_dt = datetime.datetime.strptime(latest_timestamp, timestamp_format)
delta = latest_timestamp_dt - earliest_timestamp_dt

# Break the span down into whole days, hours and minutes
days = delta.days
hours, remainder = divmod(delta.seconds, 3600)
minutes = remainder // 60

# Write the joined dataframe next to the per-run logs; the file name records
# the (truncated) town list, the terms and the covered time span
output_dir = os.path.join(get_project_root(), 'data', 'logs', 'included_context_phrases', 'not-found')
filename = (
    f"joined_towns={joined_towns[:50]}_term={joined_terms}"
    f"_first-date={earliest_timestamp}_last-date={latest_timestamp}"
    f"_delta={days}days-{hours}hours-{minutes}minutes.csv"
)
df.to_csv(os.path.join(output_dir, filename), index=False)

# Checkpoint save
We reload `df` from the checkpoint.

In [6]:
# Keep the files whose name starts with "joined" and pick the most recently
# modified one.
files = os.listdir(directory)
joined_files = [file for file in files if file.startswith('joined')]

# Without this guard an empty folder would surface as a bare
# "list index out of range" with no hint about what is missing.
if not joined_files:
    raise FileNotFoundError(f"No 'joined*' checkpoint file found in {directory}")

# max() finds the newest file directly; no need to sort the whole list
latest_file = max(joined_files, key=lambda x: os.path.getmtime(os.path.join(directory, x)))

df = pd.read_csv(os.path.join(directory, latest_file))

In [7]:
# Display the dataframe that was just reloaded from the checkpoint file
df

Unnamed: 0,phrase,town,district.full_name,district.short_name,term,n_tokens,occurrence,before-context,after-context,document,timestamp
0,1.\nLOT AREA\nUZ,branford,Multifamily-Residence,MF,min_lot_size,2000,not-found,\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,"ess district and no subzone\nis designated, th...",\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,2023-11-14_20-04
1,Minimum Lot Area shall be as follows:\nARD- *O...,andover,Andover-Lake,AL,min_lot_size,2000,not-found,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,isfaction of the Commission:\na.\nThat such lo...,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,2023-11-14_20-04


In [8]:
# Print the raw text in the 'document' column for the row with index label 0
print(df.document[0])


NEW PAGE 89
SECTION 5.7
5.7
SECTION
5
SPECIAL DISTRICTS
Branford Zoning Regulations
12.25.2019
83
CELL (1, 1): 
CELL (1, 2): 
INCENTIVE HOUSING OVERLAY DISTRICT (IHOD)
CELL (2, 1): 
5.7.A Purpose
CELL (2, 2): 
5.7.A Purpose
CELL (3, 1): 
1.
CELL (3, 2): 
The primary purpose of the Incentive Housing Overlay District (IHOD) is to
encourage the development and maintenance of affordable housing in both
residential and business districts that have the transportation connections, nearby
access to amenities and services, and infrastructure necessary to support more
concentrated levels of development.
CELL (4, 1): 
2.
CELL (4, 2): 
The IHOD seeks to avoid sprawl and traffic congestion by encouraging a more
vibrant residential component to business or mixed-use areas in order to sustain a
lifestyle in which residents can walk or use public transportation to reach jobs,
services, and recreational or cultural opportunities.
CELL (5, 1): 
3.
CELL (5, 2): 
The IHOD is also intended to enable infil

### Add relevant pages that appear in the context

In [9]:
if 'pages' not in df.columns:
    # Each document is cut at every "\nNEW PAGE" marker; the text that follows
    # the marker on the same line is kept as the page label (whitespace is
    # not stripped from the label itself).
    clean_page_list = []
    for chunks in df['document'].str.split('\nNEW PAGE').tolist():
        first_lines = (chunk.split('\n')[0] for chunk in chunks)
        # Chunks whose first line is blank carry no page label and are skipped
        clean_page_list.append([label for label in first_lines if label.strip() != ''])

    df['pages'] = pd.Series(clean_page_list)

### Double check (are the phrases included?)
The answer should be `False`.

In [10]:
if 'phrase_included' not in df.columns:
    # One column per line of the phrase: phrase_0, phrase_1, ...
    phrase_lines = df['phrase'].str.split('\n', expand=True)
    df = df.join(phrase_lines.add_prefix('phrase_'))

    # Does the complete phrase occur verbatim in the document? (expected not to)
    def _whole_phrase_in_document(row):
        return row['phrase'] in row['document']

    df['phrase_included'] = df.apply(_whole_phrase_in_document, axis=1)

# Order the rows by town, then by phrase (both ascending)
df = df.sort_values(by=['town', 'phrase'])
df

Unnamed: 0,phrase,town,district.full_name,district.short_name,term,n_tokens,occurrence,before-context,after-context,document,timestamp,pages,phrase_0,phrase_1,phrase_2,phrase_3,phrase_4,phrase_5,phrase_6,phrase_included
1,Minimum Lot Area shall be as follows:\nARD- *O...,andover,Andover-Lake,AL,min_lot_size,2000,not-found,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,isfaction of the Commission:\na.\nThat such lo...,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,2023-11-14_20-04,"[ 22, 23, 24, 25, 26]",Minimum Lot Area shall be as follows:,"ARD- *One Hundred and twenty thousand (120,000...",calculated including the Access way,"AL- One Hundred and twenty thousand (120,000) ...",calculated including the Access way,"AL- open space- Thirty thousand (30,000) squar...",excluding the Access way,False
0,1.\nLOT AREA\nUZ,branford,Multifamily-Residence,MF,min_lot_size,2000,not-found,\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,"ess district and no subzone\nis designated, th...",\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,2023-11-14_20-04,"[ 89, 90]",1.,LOT AREA,UZ,,,,,False


In [11]:
def find_sentences_with_phrase(document, phrase):
    """Collect the sentences of ``document`` that contain ``phrase``.

    The document is cut at every '.' character. Each resulting piece that
    contains ``phrase`` is returned with its surrounding whitespace removed,
    in document order.
    """
    matches = []
    for piece in document.split('.'):
        # The pieces never contain a '.', so a phrase with a period in it
        # cannot match any of them.
        if phrase in piece:
            matches.append(piece.strip())
    return matches

# Maps each phrase line to the sentences (taken from its own row's document)
# that contain it
results = {}

# All columns that start with 'phrase_' followed by a number
phrase_cols = [col for col in df.columns if re.match(r'^phrase_\d+$', col)]

# (row label, phrase line) pairs that were already processed, so a line that is
# repeated inside one phrase does not contribute its sentences twice
seen = set()

for col in phrase_cols:
    # Flag the rows whose phrase line occurs verbatim in that row's document.
    # Missing lines (None / NaN) count as not included.
    df[col + '_included'] = df.apply(
        lambda x: isinstance(x[col], str) and x[col] in x['document'], axis=1
    )

    # Different rows hold different phrase lines in the same column, so the
    # sentences are keyed by the line of the row they were found in rather
    # than by the first non-null value of the column.
    for row_label, row in df.loc[df[col + '_included']].iterrows():
        phrase_line = row[col]
        if (row_label, phrase_line) in seen:
            continue
        seen.add((row_label, phrase_line))
        results.setdefault(phrase_line, []).extend(
            find_sentences_with_phrase(row['document'], phrase_line)
        )

# Persist the phrase -> sentences mapping in the current working directory
with open('not-found-context-results.json', 'w') as f:
    json.dump(results, f, indent=4)

df

Unnamed: 0,phrase,town,district.full_name,district.short_name,term,n_tokens,occurrence,before-context,after-context,document,timestamp,pages,phrase_0,phrase_1,phrase_2,phrase_3,phrase_4,phrase_5,phrase_6,phrase_included,phrase_0_included,phrase_1_included,phrase_2_included,phrase_3_included,phrase_4_included,phrase_5_included,phrase_6_included
1,Minimum Lot Area shall be as follows:\nARD- *O...,andover,Andover-Lake,AL,min_lot_size,2000,not-found,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,isfaction of the Commission:\na.\nThat such lo...,\nNEW PAGE 22\n4.9.1\nAuthorization - The Plan...,2023-11-14_20-04,"[ 22, 23, 24, 25, 26]",Minimum Lot Area shall be as follows:,"ARD- *One Hundred and twenty thousand (120,000...",calculated including the Access way,"AL- One Hundred and twenty thousand (120,000) ...",calculated including the Access way,"AL- open space- Thirty thousand (30,000) squar...",excluding the Access way,False,True,True,True,True,True,True,True
0,1.\nLOT AREA\nUZ,branford,Multifamily-Residence,MF,min_lot_size,2000,not-found,\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,"ess district and no subzone\nis designated, th...",\nNEW PAGE 89\nSECTION 5.7\n5.7\nSECTION\n5\nS...,2023-11-14_20-04,"[ 89, 90]",1.,LOT AREA,UZ,,,,,False,True,True,True,False,False,False,False


In [12]:
# TODO update so that this works with n phrases
def adjust_context_indices(document: str, start_idx: int, end_idx: int, char_budget: int = 500,
                           pre_context_budget: int = 100) -> Tuple[int, int]:
    """
    Adjust the start and end indices for a context based on the character budget.

    The context starts a little before ``start_idx``: at most
    ``pre_context_budget`` characters earlier, but never before the beginning
    of the line that contains ``start_idx`` and never before the document
    start. Whatever is left of ``char_budget`` after the pre-context and the
    ``start_idx``..``end_idx`` span is spent after ``end_idx``.

    Note that ``end_idx`` cancels out of the arithmetic: the returned end is
    always ``min(len(document), adjusted_start_idx + char_budget)``, so the
    window never exceeds ``char_budget`` characters even when the span
    between the two indices is longer than the budget.

    Parameters:
    - document: The document string.
    - start_idx: Starting index of the context.
    - end_idx: Ending index of the context.
    - char_budget: The total character count allowed for the context (default is 500).
    - pre_context_budget: Maximum number of characters kept before ``start_idx``
      (default is 100, the value that used to be hard-coded). Expected to be
      non-negative.

    Returns:
    - Tuple of (adjusted_start_idx, adjusted_end_idx).
    """
    pre_context_end = start_idx
    # rfind returns -1 when there is no earlier newline, so "+ 1" yields 0
    line_start = document.rfind('\n', 0, pre_context_end) + 1
    pre_context_start = max(0, pre_context_end - pre_context_budget, line_start)

    remaining_budget = char_budget - (pre_context_end - pre_context_start) - (end_idx - start_idx)
    post_context_start = end_idx
    post_context_end = min(len(document), post_context_start + remaining_budget)

    return pre_context_start, post_context_end

def get_multiple_matches(document: str, phrase: str) -> List[int]:
    """List every index at which ``phrase`` begins inside ``document``.

    The search resumes one character after each hit, so overlapping
    occurrences are reported as well. The indices are in ascending order.
    """
    positions: List[int] = []
    cursor = 0
    while True:
        hit = document.find(phrase, cursor)
        if hit < 0:
            break
        positions.append(hit)
        cursor = hit + 1
    return positions

def select_best_candidate(context_details: List[RelevantContext]) -> RelevantContext:
    """
    Return the candidate with the smallest distance.

    Parameters:
    - context_details: Candidate contexts; the distance is read from position 1
      of each candidate. The list is left untouched (it is not sorted in place).

    Returns:
    - The candidate with the minimum distance. On ties, the one that comes
      first in ``context_details`` wins.

    Raises:
    - IndexError: if ``context_details`` is empty.
    """
    if not context_details:
        raise IndexError("select_best_candidate() needs at least one candidate context")
    # min() returns the first minimal element, which matches what a stable
    # sort followed by [0] would pick, without reordering the caller's list.
    return min(context_details, key=lambda candidate: candidate[1])

def get_relevant_context(document: str, *phrases: str) -> RelevantContext:
    """
    Pick the context of ``document`` that best covers the supplied phrases.

    Every pair of non-empty phrases is looked up in the document. For each
    combination of occurrences in which the first phrase of the pair starts
    before the second, a candidate context is built that begins shortly before
    the first phrase. For combinations in the opposite order the single-phrase
    fallback for the first phrase of the pair is used as the candidate. The
    candidate with the smallest distance is returned.

    Parameters:
    - document: The document string.
    - phrases: Phrases to look for; empty or ``None`` entries are ignored.

    Returns:
    - A tuple ``(chars, distance, line_idx_0, line_idx_1, context)``
      (presumably the layout of ``RelevantContext`` - TODO confirm against
      ``zoning.term_extraction.types``). When no usable phrase is supplied the
      result is ``(0, np.inf, None, -1, "")`` so callers can index it like any
      other result.
    """
    valid_phrases = [phrase for phrase in phrases if phrase]
    if not valid_phrases:
        warnings.warn("No valid phrases were supplied.")
        # Same 5-field layout as every other return path; a bare "" would
        # break callers that index into the result.
        return (0, np.inf, None, -1, "")

    if len(valid_phrases) == 1:
        # index=-1 makes fallback_context skip the search and take its
        # "phrase not found" branch.
        return fallback_context(document, valid_phrases[0], index=-1)

    context_details = []

    for pair in itertools.combinations(valid_phrases, 2):
        idx_0_starts = get_multiple_matches(document, pair[0])
        idx_1_starts = get_multiple_matches(document, pair[1])

        # Handle multiple matches
        if len(idx_0_starts) > 1:
            warnings.warn(f"More than one match found for '{pair[0]}'. Considering all matches.")
        if len(idx_1_starts) > 1:
            warnings.warn(f"More than one match found for '{pair[1]}'. Considering all matches.")

        # The fallback for a pair does not depend on the occurrence indices,
        # so it is computed at most once per pair and then reused.
        reversed_order_fallback = None

        for i_0 in idx_0_starts:
            for i_1 in idx_1_starts:
                if i_0 < i_1:
                    # The requested span ends after the line that holds the second phrase
                    start, end = adjust_context_indices(document, i_0, document.find('\n', i_1) + 1)
                    context = document[start:end]
                    chars = len(context)
                    distance = i_1 - i_0
                    # 1-based line numbers of the two occurrences
                    line_idx_0, line_idx_1 = document[:i_0].count('\n') + 1, document[:i_1].count('\n') + 1
                    context_details.append((chars, distance, line_idx_0, line_idx_1, context))
                else:
                    if reversed_order_fallback is None:
                        reversed_order_fallback = fallback_context(document, pair[0])
                    context_details.append(reversed_order_fallback)

    if not context_details:
        # No pair had both of its phrases in the document. Anchor the context
        # on the first phrase that does occur, if there is one ...
        for phrase in valid_phrases:
            idx = document.find(phrase)
            if idx != -1:
                start, end = adjust_context_indices(document, idx, idx + len(phrase))
                context = document[start:end]
                return (len(context), 0, document[:idx].count('\n') + 1, -1, context)
        # ... otherwise none of the phrases occurs at all.
        return fallback_context(document, valid_phrases[0], index=-1)

    return select_best_candidate(context_details)

def fallback_context(document: str, phrase: str, index: Optional[int] = None) -> Tuple[int, float, Optional[int], Optional[int], str]:
    """
    Retrieve a context around a given phrase or index if the phrase is not found.

    Parameters:
    - document: The document string.
    - phrase: Target phrase.
    - index: Index of the phrase (default is None, meaning the phrase is
      searched for in the document). Passing -1 marks the phrase as not found
      without searching.

    Returns:
    - Context details as ``(chars, distance, line_idx, -1, context)``:
      ``chars`` is the length of ``context``; ``distance`` is 0 when the phrase
      position is known and ``np.inf`` otherwise; ``line_idx`` is the 1-based
      line number of ``index`` (``None`` when the phrase was not found). When
      the phrase is not found, the context is taken around the middle of the
      document.
    """
    if index is None:
        index = document.find(phrase)

    if index == -1:
        warnings.warn(f"Phrase {phrase} was not in the supplied document.")
        midpoint = len(document) // 2
        start, end = adjust_context_indices(document, midpoint, midpoint)
        distance = np.inf
        line_idx = None
    else:
        start, end = adjust_context_indices(document, index, index + len(phrase))
        distance = 0
        # Derive the line number from the index itself. Looking the phrase up
        # in document.split('\n') raises ValueError unless the phrase is an
        # entire line, and would ignore an index supplied by the caller.
        line_idx = document[:index].count('\n') + 1

    context = document[start:end]
    chars = len(context)
    return (chars, distance, line_idx, -1, context)


In [13]:
def get_context_for_row(row):
    """
    Compute the relevant context for one dataframe row.

    Reads the row's 'document' and its first three phrase lines ('phrase_0',
    'phrase_1', 'phrase_2') and passes them to ``get_relevant_context``.

    A phrase column that does not exist (e.g. when no phrase in the data has
    that many lines) or that holds a non-string value such as None/NaN is
    passed on as None, which ``get_relevant_context`` ignores.

    Parameters:
    - row: A row exposing ``row['document']`` and ``row.get('phrase_N')``.

    Returns:
    - Whatever ``get_relevant_context`` returns for this row.
    """
    doc = row['document']
    # .get() instead of [] so a missing phrase column does not raise KeyError
    candidates = (row.get(f'phrase_{i}') for i in range(3))
    phrases = [phrase if isinstance(phrase, str) else None for phrase in candidates]

    return get_relevant_context(doc, *phrases)

# Run the context search once per row; each result is a tuple laid out as
# (chars, distance, line_idx_0, line_idx_1, context)
context_results = df.apply(get_context_for_row, axis=1)

# Spread the tuple fields of interest over their own columns
field_positions = (
    ('modified_context', -1),
    ('min_distance', 1),
    ('line_idx_0', 2),
    ('line_idx_1', 3),
)
for column, position in field_positions:
    df[column] = context_results.apply(lambda x, position=position: x[position])

# Save the enriched dataframe next to the logs; same naming scheme as the
# joined checkpoint, prefixed with "clean_"
output_dir = os.path.join(get_project_root(), 'data', 'logs', 'included_context_phrases', 'not-found')
filename = (
    f"clean_joined_towns={joined_towns[:25]}_term={joined_terms}"
    f"_first-date={earliest_timestamp}_last-date={latest_timestamp}"
    f"_delta={days}days-{hours}hours-{minutes}minutes.csv"
)
df.to_csv(os.path.join(output_dir, filename), index=False)



In [14]:
# Show only the rows whose town is "beacon-falls"
df[df.town =="beacon-falls"]

Unnamed: 0,phrase,town,district.full_name,district.short_name,term,n_tokens,occurrence,before-context,after-context,document,timestamp,pages,phrase_0,phrase_1,phrase_2,phrase_3,phrase_4,phrase_5,phrase_6,phrase_included,phrase_0_included,phrase_1_included,phrase_2_included,phrase_3_included,phrase_4_included,phrase_5_included,phrase_6_included,modified_context,min_distance,line_idx_0,line_idx_1
