In [1]:
import pandas as pd

# Missing Apostrophes - Deeper Dive

In [None]:
# One-character tokens that are genuine words in each language; these are
# never reported as issues.
legitimate_english = {'i', 'a', 'o'}
legitimate_french = {'à', 'a', 'y', 'ô', 'ù'}

# English contractions whose apostrophe was lost ("don t", "it s").
# Key: the detached one- or two-letter fragment; value: the words it attaches to.
# check_missing_apostrophe tests BOTH the previous and the next word of a
# single-letter token against the list for that letter.
contraction_patterns_english = {
    't': ['don', 'doesn', 'didn', 'won', 'wouldn', 'couldn', 'shouldn', 'isn', 'aren', 'wasn', 'weren', 'hasn', 'haven', 'hadn', 'can', 'ain', 'it', 'that'],
    's': ['it', 'that', 'what', 'he', 'she', 'there', 'here', 'let', 'where', 'who'],
    'm': ['i'],
    'd': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'who', 'what', 'there'],
    'll': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'who', 'what', 'there'],
    've': ['i', 'you', 'we', 'they', 'would', 'could', 'should', 'might', 'must'],
    're': ['you', 'we', 'they', 'who', 'what', 'there', 'here', 'where'],
}

# French elisions whose apostrophe was lost ("c est", "l eau").
# Key: the detached elided fragment; value: the words it attaches to.
# Every entry must be lowercase: check_missing_apostrophe lowercases the
# neighbouring word before the lookup, so a capitalised entry could never match.
contraction_patterns_french = {
    'j': ['ai', 'avais', 'aurai', 'aurais', 'étais', 'en', 'y', 'irai', 'irais', 'espère', 'aime', 'adore', 'habite', 'arrive', 'entends', 'attends', 'ouvre', 'écoute', 'imagine', 'ignore', 'accepte',
          'apprends', 'appelle', 'essaie', 'essaye'],
    'l': ['a', 'est', 'était', 'ont', 'avait', 'aura', 'aurait', 'on', 'un', 'une', 'autre', 'homme', 'eau', 'air', 'or', 'argent', 'amour', 'ami', 'amie', 'enfant', 'église', 'école', 'europe',
          'amérique', 'afrique', 'asie', 'italie', 'espagne', 'allemagne', 'angleterre'],
    'd': ['un', 'une', 'abord', 'accord', 'autres', 'ailleurs', 'après', 'autant', 'entre', 'eux', 'elle', 'elles', 'ici', 'où', 'avoir', 'être'],
    'n': ['a', 'ai', 'as', 'avons', 'avez', 'ont', 'est', 'es', 'êtes', 'y', 'en', 'importe', 'oublie', 'oubliez'],
    'm': ['a', 'as', 'avez', 'en', 'y', 'appelle', 'ont', 'est'],
    't': ['a', 'as', 'en', 'y', 'ont', 'es', 'est', 'aime', 'appelle', 'inquiète', 'il', 'elle', 'on'],
    's': ['il', 'ils', 'en', 'y', 'est', 'était', 'appelle', 'agit', 'avère'],
    'c': ['est', 'était', 'a', 'en'],
    'qu': ['il', 'ils', 'elle', 'elles', 'on', 'un', 'une', 'est', 'en', 'à', 'au', 'aux', 'y'],
}


def has_single_letter_word(text):
    """Tell whether ``text`` contains at least one one-character token.

    The text is split on whitespace and each token has leading/trailing
    punctuation removed before its length is measured.

    Args:
        text: Value to inspect; anything that is not a ``str`` yields ``False``.

    Returns:
        ``True`` if some token is exactly one character long after stripping,
        otherwise ``False``.
    """
    if not isinstance(text, str):
        return False
    edge_punctuation = '.,!?;:"\'-()[]{}'
    return any(len(token.strip(edge_punctuation)) == 1 for token in text.split())


def get_single_letter_words(text):
    """List every one-character token found in ``text``.

    Args:
        text: Value to inspect; anything that is not a ``str`` yields ``[]``.

    Returns:
        A list of ``(position, letter, raw_token)`` tuples, in order of
        appearance, where ``position`` is the index of the token in
        ``text.split()``, ``letter`` is the lowercased token with surrounding
        punctuation stripped, and ``raw_token`` is the token as it appeared.
    """
    if not isinstance(text, str):
        return []
    edge_punctuation = '.,!?;:"\'-()[]{}'
    stripped_tokens = (
        (position, token, token.strip(edge_punctuation))
        for position, token in enumerate(text.split())
    )
    return [
        (position, core.lower(), token)
        for position, token, core in stripped_tokens
        if len(core) == 1
    ]


def is_legitimate_single_letter(letter, lang):
    """Tell whether ``letter`` is a real one-character word in ``lang``.

    Args:
        letter: The single-character token; compared case-insensitively.
        lang: ``'en'`` selects the English set; any other value selects the
            French set.

    Returns:
        ``True`` if the lowercased letter is in the selected set.
    """
    allowed = legitimate_english if lang == 'en' else legitimate_french
    return letter.lower() in allowed


def check_missing_apostrophe(text, letter_info, lang):
    """Decide whether a single-letter token looks like a split contraction.

    Args:
        text: The sentence the token came from; non-``str`` yields
            ``(False, None)``.
        letter_info: ``(position, letter, raw_token)`` for the token, where
            ``position`` indexes into ``text.split()``.
        lang: ``'en'`` selects the English pattern table; any other value
            selects the French one.

    Returns:
        ``(True, pattern)`` when a pattern matches, where ``pattern`` is the
        matched pair of words joined by a space; otherwise ``(False, None)``.
    """
    if not isinstance(text, str):
        return False, None

    edge_punctuation = '.,!?;:"\'-()[]{}'
    tokens = text.split()
    idx, letter, _ = letter_info
    key = letter.lower()
    patterns = contraction_patterns_english if lang == 'en' else contraction_patterns_french

    has_prev = idx > 0
    has_next = idx < len(tokens) - 1

    # Primary check: the letter is a known fragment and one of its neighbours
    # (previous first, then next) is a word listed for that fragment.
    if key in patterns:
        companions = patterns[key]
        if has_prev:
            before = tokens[idx - 1].strip(edge_punctuation).lower()
            if before in companions:
                return True, f"{before} {letter}"
        if has_next:
            after = tokens[idx + 1].strip(edge_punctuation).lower()
            if after in companions:
                return True, f"{letter} {after}"

    # Fallback: the previous token is the same letter and the next token is
    # itself a fragment key of the pattern table.
    # NOTE(review): this reports sequences such as "x x t" as a missing
    # apostrophe; the intent of this rule is unclear -- confirm it is wanted.
    if patterns and has_prev:
        before = tokens[idx - 1].strip(edge_punctuation).lower()
        if has_next and before == key:
            if tokens[idx + 1].strip(edge_punctuation).lower() in patterns:
                return True, f"{letter} {tokens[idx + 1]}"

    return False, None


def analyze_row(row):
    """Collect single-letter issues for both sides of one translation pair.

    Args:
        row: Mapping with ``'source'``, ``'target'`` and ``'source_lang'``.
            When ``source_lang`` is ``'en'`` the source is treated as English
            and the target as French; otherwise the roles are swapped.

    Returns:
        A list of ``(language, issue_type, pattern, text)`` tuples, English
        findings first. ``issue_type`` is ``'missing_apostrophe'`` (with the
        matched word pair as ``pattern``) or ``'ocr_or_other'`` (with the
        offending letter as ``pattern``).
    """
    if row['source_lang'] == 'en':
        en_text, fr_text = row['source'], row['target']
    else:
        en_text, fr_text = row['target'], row['source']

    findings = []
    for label, lang, text in (('english', 'en', en_text), ('french', 'fr', fr_text)):
        for letter_info in get_single_letter_words(text):
            letter = letter_info[1]
            # Genuine one-letter words are not issues.
            if is_legitimate_single_letter(letter, lang):
                continue
            flagged, pattern = check_missing_apostrophe(text, letter_info, lang)
            if flagged:
                findings.append((label, 'missing_apostrophe', pattern, text))
            else:
                findings.append((label, 'ocr_or_other', letter, text))

    return findings

def filter_dataframe(filename):
    """Load a JSON-lines file and keep rows containing a one-character token.

    Args:
        filename: Path to a JSON-lines file whose records provide ``source``
            and ``target`` columns.

    Returns:
        A copy of the rows where ``source`` or ``target`` contains a
        single-letter word, including the added boolean column
        ``has_single_letter``.
    """
    df = pd.read_json(filename, lines=True)
    # Map each text column on its own instead of a row-wise apply(axis=1):
    # this avoids building one Series object per row.
    in_source = df['source'].map(has_single_letter_word)
    in_target = df['target'].map(has_single_letter_word)
    # astype(bool) keeps the mask a valid boolean indexer even when the
    # mapped columns come back with object dtype (e.g. a zero-row frame).
    df['has_single_letter'] = (in_source | in_target).astype(bool)
    return df[df['has_single_letter']].copy()

def results_dataframe(dataframe):
    """Run ``analyze_row`` on every row and tabulate the findings.

    Args:
        dataframe: Frame with ``source``, ``target`` and ``source_lang``
            columns.

    Returns:
        A DataFrame with one record per issue: the originating row label
        (``original_index``), the row's ``source``/``target``/``source_lang``,
        and the issue's ``language``, ``issue_type``, ``pattern`` and
        ``text_with_issue``.
    """
    records = [
        {
            'original_index': row_label,
            'source': row['source'],
            'target': row['target'],
            'source_lang': row['source_lang'],
            'language': language,
            'issue_type': issue_type,
            'pattern': pattern,
            'text_with_issue': text_with_issue,
        }
        for row_label, row in dataframe.iterrows()
        for language, issue_type, pattern, text_with_issue in analyze_row(row)
    ]

    return pd.DataFrame(records)

def print_results(filtered_dataframe, results_dataframe, verbose=True):
    """Write the findings to CSV and print summary counts.

    Args:
        filtered_dataframe: The filtered input rows; only its length is
            reported.
        results_dataframe: Issue records with at least ``pattern``,
            ``issue_type`` and ``text_with_issue`` columns.
        verbose: When true, also print one example sentence per distinct
            pattern.

    Side effects:
        When there are results, writes ``single_letter_analysis.csv`` and
        ``summary_of_patterns.csv`` in the current working directory.
    """
    if results_dataframe.empty:
        print("No issues found")
        return

    results_dataframe.to_csv("single_letter_analysis.csv", index=False)

    # One representative record (the first seen) per distinct pattern.
    one_per_pattern = results_dataframe.drop_duplicates(subset=['pattern'])
    summary_df = one_per_pattern.sort_values('pattern').reset_index(drop=True)
    summary_df.to_csv("summary_of_patterns.csv", index=False)

    issue_types = results_dataframe['issue_type']
    apostrophe_count = int((issue_types == 'missing_apostrophe').sum())
    ocr_count = int((issue_types == 'ocr_or_other').sum())

    print(f"Total rows with single letter words: {len(filtered_dataframe)}")
    print(f"Total issues found: {len(results_dataframe)}")
    print(f"Unique patterns: {len(summary_df)}")
    print(f"Missing apostrophe issues: {apostrophe_count}")
    print(f"Potential OCR issues: {ocr_count}")

    if not verbose:
        return

    print("\n--- Summary of Patterns ---")
    for pattern, example in zip(summary_df['pattern'], summary_df['text_with_issue']):
        print(f"{pattern:<15}{example}")


In [31]:
# Input dataset for this run; swap in one of the commented alternatives below
# to analyse a different file.
filename = "../../Data/training_data.jsonl"
# filename = "../../Data/pipeline_training_data.jsonl"
# filename = "../../Data/pipeline_testing_data.jsonl"

# Keep rows that contain a one-character token, classify every such token,
# then write the CSV reports and print the summary.
filtered_df = filter_dataframe(filename)
results_df = results_dataframe(filtered_df)
print_results(filtered_df, results_df)

Total rows with single letter words: 224414
Total issues found: 176720
Unique patterns: 143
Missing apostrophe issues: 2787
Potential OCR issues: 173933

--- Summary of Patterns ---
0              We do not present results from the 0 to 30 m here because we have not sufficiently well defined the very strong seasonal cycle for this layer yet.
1              In the northeast, most of the bottom was covered by temperatures ranging from 1 to 4 C.
2              Maritimes Region State of the Ocean 2 Average Conditions Temperature and salinity conditions within the Scotian Shelf, Bay of Fundy and Gulf of Maine vary spatially due to complex bottom topography, transport from upstream sources such as the Gulf of St.
3              La température et la salinité augmentent généralement d'est en ouest et de la côte Région des Maritimes État de l'océan 3 au large, sous l'influence des eaux du large plus chaudes et plus salées, et de l'apport d'eau plus douce du golfe du Saint-Laurent.
4            

In [32]:
# Number of filtered rows per source language.
filtered_df['source_lang'].value_counts()

source_lang
fr    133073
en     91341
Name: count, dtype: int64

In [33]:
# (rows, columns) of the filtered frame.
filtered_df.shape

(224414, 4)

In [35]:
# Preview the first ten issue records.
results_df.head(10)

Unnamed: 0,original_index,source,target,source_lang,language,issue_type,pattern,text_with_issue
0,1,C est pourquoi on mesure les conditions océano...,Physical oceanographic conditions (mainly wate...,fr,french,missing_apostrophe,c est,C est pourquoi on mesure les conditions océano...
1,10,Maritimes Region State of the Ocean 2 Average ...,Conditions moyennes La température et la salin...,en,english,ocr_or_other,2,Maritimes Region State of the Ocean 2 Average ...
2,16,"At the surface, the range is about 16 C but th...","À la surface, elle est d'environ 16 C, mais on...",en,english,ocr_or_other,c,"At the surface, the range is about 16 C but th..."
3,16,"At the surface, the range is about 16 C but th...","À la surface, elle est d'environ 16 C, mais on...",en,english,ocr_or_other,m,"At the surface, the range is about 16 C but th..."
4,16,"At the surface, the range is about 16 C but th...","À la surface, elle est d'environ 16 C, mais on...",en,french,ocr_or_other,c,"À la surface, elle est d'environ 16 C, mais on..."
5,16,"At the surface, the range is about 16 C but th...","À la surface, elle est d'environ 16 C, mais on...",en,french,ocr_or_other,m,"À la surface, elle est d'environ 16 C, mais on..."
6,17,"À la surface, elle est d'environ 16 C, mais on...","At the surface, the range is about 16 C but th...",fr,english,ocr_or_other,c,"At the surface, the range is about 16 C but th..."
7,17,"À la surface, elle est d'environ 16 C, mais on...","At the surface, the range is about 16 C but th...",fr,english,ocr_or_other,m,"At the surface, the range is about 16 C but th..."
8,17,"À la surface, elle est d'environ 16 C, mais on...","At the surface, the range is about 16 C but th...",fr,french,ocr_or_other,c,"À la surface, elle est d'environ 16 C, mais on..."
9,17,"À la surface, elle est d'environ 16 C, mais on...","At the surface, the range is about 16 C but th...",fr,french,ocr_or_other,m,"À la surface, elle est d'environ 16 C, mais on..."


# Matching Pipelines