In [1]:
import re
import json
import os
import pandas as pd
from collections import Counter
from alive_progress import alive_bar


In [2]:
def _index_json_files(parsed_docs_folder):
    """Return ``{file name: full path}`` for every ``*.json`` file under a folder.

    The tree is walked a single time. When the same file name occurs in
    several directories, the first one yielded by ``os.walk`` is kept.
    """
    json_index = {}
    for root, _, files in os.walk(parsed_docs_folder):
        for name in files:
            if name.endswith('.json'):
                json_index.setdefault(name, os.path.join(root, name))
    return json_index


def _load_raw_text(json_path):
    """Return the ``'text'`` field of a parsed-document JSON file.

    ``''`` is returned when ``json_path`` is falsy, the file does not exist,
    or the JSON object has no ``'text'`` key.
    """
    if json_path and os.path.exists(json_path):
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f).get('text', '')
    return ''


def add_text(dataframe, parsed_docs_folder):
    """Attach the parsed French and English document text to every row.

    For each row, ``filename_fr`` / ``filename_en`` (names ending in ``.pdf``)
    are resolved to ``<name>.json`` somewhere below ``parsed_docs_folder`` and
    the ``'text'`` field of that JSON file is read.

    Args:
        dataframe: frame with ``filename_fr`` and ``filename_en`` columns.
        parsed_docs_folder: root directory searched recursively for the JSON
            files.

    Returns:
        The same ``dataframe`` object, modified in place with two new columns,
        ``fr_text`` and ``en_text``. A row whose file name is missing, is not
        a string, does not end in ``.pdf`` or has no matching JSON file gets
        ``''``.
    """
    # Walk the folder once up front instead of once per file name per row.
    json_index = _index_json_files(parsed_docs_folder)

    def get_json_path(pdf_filename):
        # Missing values read from CSV arrive as float NaN, not str.
        if isinstance(pdf_filename, str) and pdf_filename.endswith(".pdf"):
            return json_index.get(pdf_filename + ".json")
        return None

    fr_texts = []
    en_texts = []

    with alive_bar(len(dataframe), force_tty=True) as bar:
        for _, row in dataframe.iterrows():
            bar()

            filename_fr = row['filename_fr']
            filename_en = row['filename_en']

            fr_text = _load_raw_text(get_json_path(filename_fr))

            # Both columns naming the same file: reuse the text instead of
            # reading the JSON twice.
            if filename_fr == filename_en:
                en_text = fr_text
            else:
                en_text = _load_raw_text(get_json_path(filename_en))

            fr_texts.append(fr_text)
            en_texts.append(en_text)

    dataframe['fr_text'] = fr_texts
    dataframe['en_text'] = en_texts

    return dataframe


# Input locations, relative to the notebook's working directory.
filename = "../../Data/fr_eng_correlation_data.csv"
docs_folder = "../../Data/ParsedPublications"
testing_pickle = "testing_df.pickle"

# The frame with document text attached is cached as a pickle so later runs can
# skip add_text(). Only load a pickle that this notebook wrote itself.
if not os.path.exists(testing_pickle):
    df = pd.read_csv(filename)
    df = add_text(df, docs_folder)
    df.to_pickle(testing_pickle)
else:
    df = pd.read_pickle(testing_pickle)

In [54]:
# Sentence terminator followed by one or more closing quotes and whitespace.
# Closers: straight double/single quote, » (U+00BB), ” (U+201D), ’ (U+2019),
# the same set _SPLIT_PATTERN accepts. Written as escapes so the typographic
# quotes cannot be silently flattened to ASCII duplicates.
QUOTE_END_PATTERN = re.compile(r'[.!?]["\'\u00BB\u201D\u2019]+\s')

# Lower-case abbreviations (without the trailing period) whose period should
# not be read as a sentence end.
# NOTE(review): 'fevr' is unaccented while scan_for_patterns captures accented
# letters (so "févr." lower-cases to 'févr'), and 'approx' is longer than the
# 2-5 letters that scan captures - confirm whether these entries can ever match.
KNOWN_ABBREVS = {
    'dr', 'mr', 'mrs', 'ms', 'prof', 'sr', 'jr', 'rev', 'hon', 'gov', 'gen', 'col', 'lt', 'sgt',
    'mme', 'mlle', 'me', 'pr', 'mgr',
    'st', 'ste', 'mt', 'ft', 'ave', 'blvd', 'rd', 'apt',
    'inc', 'ltd', 'corp', 'co', 'bros', 'assn', 'dept',
    'vs', 'etc', 'al', 'eg', 'ie', 'viz', 'approx', 'ca', 'cf',
    'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
    'janv', 'fevr', 'avr', 'juil',
    'no', 'nos', 'vol', 'vols', 'ed', 'eds', 'ch', 'sec', 'fig', 'figs',
    'pp', 'pg', 'pgs',
    'op', 'cit', 'loc', 'ibid', 'et', 'seq',
    'misc', 'govt', 'natl', 'intl',
}


def is_likely_abbreviation(word):
    """Return True when ``word`` looks like an abbreviation.

    Trailing periods are ignored. A token counts as an abbreviation when its
    lower-cased form is in ``KNOWN_ABBREVS``, or when it is 2-4 characters long
    and written entirely in upper case (e.g. ``'USA'``). Tokens of one
    character or fewer are never abbreviations.

    Args:
        word: the candidate token, with or without trailing periods.

    Returns:
        bool
    """
    token = word.rstrip('.')
    if len(token) <= 1:
        return False
    if token.lower() in KNOWN_ABBREVS:
        return True
    # The upper-case test has to look at the token as written: lower-casing it
    # first would make this branch unreachable.
    return len(token) <= 4 and token.isupper()


def scan_for_patterns(dataframe, fr_col='fr_text', en_col='en_text'):
    """Count abbreviation candidates and quote-ending sentences per language.

    A candidate is a 2-5 letter word followed by a period, whitespace and a
    letter matched by the second group of the pattern below. The word is
    lower-cased and counted when ``is_likely_abbreviation`` accepts it.
    Quote endings are the matches of ``QUOTE_END_PATTERN``.

    Args:
        dataframe: frame holding the text columns.
        fr_col: name of the French text column.
        en_col: name of the English text column.

    Returns:
        dict with
        ``'en_abbrevs'`` / ``'fr_abbrevs'``: ``Counter`` of lower-cased
        abbreviations, and
        ``'quote_endings'``: ``{'en': int, 'fr': int}``, the key that
        ``print_scan_results`` reads.
        A column that is absent from ``dataframe`` yields empty counts.
    """
    # NOTE(review): the range À-Ÿ in the second group runs from U+00C0 to
    # U+0178, so it also accepts lower-case accented letters such as 'à'.
    pattern = re.compile(r'\b([A-Za-zÀ-ÿ]{2,5})\.\s+([A-ZÀ-Ÿ])')

    def scan_column(column):
        # One pass over the non-null cells of a single text column.
        abbrevs = Counter()
        quote_total = 0
        if column not in dataframe.columns:
            return abbrevs, quote_total
        for value in dataframe[column].dropna():
            text = str(value)
            for m in pattern.finditer(text):
                word = m.group(1).lower()
                if is_likely_abbreviation(word):
                    abbrevs[word] += 1
            quote_total += sum(1 for _ in QUOTE_END_PATTERN.finditer(text))
        return abbrevs, quote_total

    en_abbrevs, en_quotes = scan_column(en_col)
    fr_abbrevs, fr_quotes = scan_column(fr_col)

    return {
        'en_abbrevs': en_abbrevs,
        'fr_abbrevs': fr_abbrevs,
        'quote_endings': {'en': en_quotes, 'fr': fr_quotes},
    }


def print_scan_results(results, min_count=2):
    """Print a text report of a pattern scan.

    Args:
        results: dict with ``'en_abbrevs'`` and ``'fr_abbrevs'`` Counters and,
            optionally, ``'quote_endings'`` as ``{'en': int, 'fr': int}``.
        min_count: abbreviations seen fewer times than this are not listed.

    For each language, only the 30 most common abbreviations are considered.
    The quote-ending line is printed only when ``results`` carries
    ``'quote_endings'``; a missing key no longer raises ``KeyError``.
    """
    rule = "=" * 60
    print(rule)
    print("PATTERN SCAN RESULTS")
    print(rule)

    for language, key in (("English", 'en_abbrevs'), ("French", 'fr_abbrevs')):
        print(f"\n{language} abbreviations:")
        for abbrev, count in results[key].most_common(30):
            if count >= min_count:
                print(f"  {abbrev:10} : {count}")

    quote_endings = results.get('quote_endings')
    if quote_endings is not None:
        print(f"\nQuote-ending sentences: EN={quote_endings['en']}, FR={quote_endings['fr']}")


def get_exemption_list(results, min_count=1):
    """Return the abbreviations found in either language, most frequent first.

    The English and French Counters in ``results`` are added together and
    every abbreviation whose combined count is at least ``min_count`` is kept.
    """
    totals = results['en_abbrevs'] + results['fr_abbrevs']
    exemptions = []
    for abbrev, count in totals.most_common():
        if count >= min_count:
            exemptions.append(abbrev)
    return exemptions



In [None]:
# Scan both text columns, then count how many distinct abbreviations were seen.
# min_count=0 applies no frequency cut-off: may as well include all of them.
results = scan_for_patterns(df)
len(get_exemption_list(results, min_count=0))

In [None]:
# Final list of exemptions, alphabetised (every abbreviation seen at least once).
print(sorted(get_exemption_list(results, min_count=1)))

['al', 'apr', 'assn', 'aug', 'ave', 'avr', 'blvd', 'bros', 'ca', 'cf', 'ch', 'cit', 'co', 'col', 'corp', 'dec', 'dept', 'dr', 'ed', 'eds', 'eg', 'et', 'etc', 'feb', 'fig', 'figs', 'ft', 'gen', 'gov', 'govt', 'hon', 'ibid', 'ie', 'inc', 'intl', 'jan', 'janv', 'jr', 'juil', 'jul', 'jun', 'loc', 'lt', 'ltd', 'mar', 'me', 'mgr', 'misc', 'mlle', 'mme', 'mr', 'mrs', 'ms', 'mt', 'natl', 'no', 'nos', 'nov', 'oct', 'op', 'pg', 'pgs', 'pp', 'pr', 'prof', 'rd', 'rev', 'sec', 'sep', 'sept', 'seq', 'sr', 'st', 'ste', 'viz', 'vol', 'vols', 'vs']


In [None]:
# Abbreviations whose trailing period must not end a sentence.
_EXEMPTIONS = {
    'al', 'apr', 'assn', 'aug', 'ave', 'avr', 'blvd', 'bros', 'ca', 'cf', 'ch', 'cit', 'co', 'col', 'corp', 'dec',
    'dept', 'dr', 'ed', 'eds', 'eg', 'et', 'etc', 'feb', 'fig', 'figs', 'ft', 'gen', 'gov', 'govt', 'hon', 'ibid',
    'ie', 'inc', 'intl', 'jan', 'janv', 'jr', 'juil', 'jul', 'jun', 'loc', 'lt', 'ltd', 'mar', 'me', 'mgr', 'misc',
    'mlle', 'mme', 'mr', 'mrs', 'ms', 'mt', 'natl', 'no', 'nos', 'nov', 'oct', 'op', 'pg', 'pgs', 'pp', 'pr',
    'prof', 'rd', 'rev', 'sec', 'sep', 'sept', 'seq', 'sr', 'st', 'ste', 'viz', 'vol', 'vols', 'vs'
}

# "<abbreviation>." followed by one whitespace character, case-insensitive.
# Alternatives are listed longest first.
_EXEMPT_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, sorted(_EXEMPTIONS, key=len, reverse=True))) + r')\.\s',
    re.IGNORECASE
)

# A terminator plus any closing quotes (captured, so re.split keeps it),
# followed by the whitespace that separates two sentences.
_SPLIT_PATTERN = re.compile(r'([.?!]["\'\u00BB\u201D\u2019]*)\s+')


def split_text(text):
    """Split ``text`` into stripped sentences, keeping each terminator.

    Periods belonging to an exempt abbreviation are swapped for a NUL
    placeholder before splitting and restored afterwards, so they never act
    as sentence boundaries.
    """
    def mask_period(match):
        return match.group().replace('.', '\x00')

    # With one capture group, re.split yields body, terminator, body, ..., tail.
    pieces = _SPLIT_PATTERN.split(_EXEMPT_PATTERN.sub(mask_period, text))

    result = [
        (body + terminator).replace('\x00', '.').strip()
        for body, terminator in zip(pieces[0::2], pieces[1::2])
    ]

    tail = pieces[-1]
    if len(pieces) % 2 == 1 and tail.strip():
        result.append(tail.replace('\x00', '.').strip())
    return result


def clean_text(text):
    """Normalise quotes, drop unsupported characters and collapse whitespace.

    Steps, in order:
      1. typographic double quotes and guillemets become ``"``; typographic
         single quotes become ``'``;
      2. every character that is not an ASCII letter or digit, an accented
         Latin letter, or one of ``. , ; : ! ? ( ) ' " -`` becomes a space;
      3. runs of whitespace collapse to one space and the ends are stripped.

    The kept letters include the French ligature and letter that sit outside
    Latin-1 (U+0152, U+0153, U+0178), so words such as "cœur" stay intact
    instead of being broken in two.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'[\u201C\u201D\u201E]', '"', text)  # curly / low double quotes -> "
    text = re.sub(r'[\u2018\u2019\u201A]', "'", text)  # curly / low single quotes -> '
    text = re.sub(r'[\u00AB\u00BB]', '"', text)        # guillemets -> "
    # U+00D7 (multiplication sign) and U+00F7 (division sign) are deliberately
    # left out of the accented-letter ranges.
    text = re.sub(
        r"[^a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0152\u0153\u0178.,;:!?()'\"-]",
        ' ',
        text,
    )
    return re.sub(r'\s+', ' ', text).strip()
