In [121]:
import json
import os
import random
import re
import subprocess
import numpy as np
import pandas as pd

from collections import Counter

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# all files that have been downloaded and parsed
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year = 2023
parsed_files = list()
parsed_files_with_hq_ocr = list()
for folder in os.listdir(parsed_docs_folder):
    path = os.path.join(parsed_docs_folder, folder)
    if os.path.isdir(path):
        for json_file in os.listdir(path):
            if json_file.endswith(".json"):
                parsed_files.append(json_file.replace('.json', ''))
                if folder in [str(year) for year in range(min_year, 2024 + 1)]:
                    parsed_files_with_hq_ocr.append(json_file.replace('.json', ''))

# all files from website
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)
# exclude files that aren't downloaded, and files that have been withdrawn
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en.isin(parsed_files)) | (fr_eng_correlation_df.filename_fr.isin(parsed_files))]
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en != 'WITHDRAWN') & (fr_eng_correlation_df.filename_fr != 'WITHDRAWN')]

# weblinks for previewing / checking results
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# data to be used for language classifier
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.filename_fr.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_en.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_fr != lang_df.filename_en)]


# helper functions

In [3]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


In [4]:
# Make lists of all French words and all English words

valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
french_word_list = []
english_word_list = []
exclude_words_with_less_than_n = 10

# clean headers and appendices
references_fr = r'RÉFÉRENCES CITÉES'.lower()
references_en = r'REFERENCES CITED'.lower()


for i, row in lang_df.iterrows():
    fr_path, en_path = get_filepaths(row)
    
    with open(fr_path, 'r', encoding='utf-8') as file:
        fr_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_fr, fr_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            fr_text = parts[1]
        
        french_word_list.extend(word for word in fr_text.split() if valid_word_regex.match(word))
    
    with open(en_path, 'r', encoding='utf-8') as file:
        en_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_en, en_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            en_text = parts[1]
        
        english_word_list.extend(word for word in en_text.split() if valid_word_regex.match(word))
        
# For testing
french_word_counts = Counter(french_word_list)
french_word_counts_expanded = []
for word, count in french_word_counts.items():
    for _ in range(count):
        french_word_counts_expanded.append((word, count))
        
english_word_counts = Counter(english_word_list)
english_word_counts_expanded = []
for word, count in english_word_counts.items():
    for _ in range(count):
        english_word_counts_expanded.append((word, count))

full_french_word_list = french_word_list.copy()
full_english_word_list = english_word_list.copy()

# Remove words with less than 10 occurrences
french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]

# Convert to sets for further processing
french_words = set(french_word_list)
english_words = set(english_word_list)

# Remove overlapping words
overlapping_words = english_words & french_words
english_words.difference_update(overlapping_words)
french_words.difference_update(overlapping_words)

# Remove numeric-only words
english_words.difference_update({w for w in english_words if w.isnumeric()})
french_words.difference_update({w for w in french_words if w.isnumeric()})


In [5]:
# helper functions for word lists

def test_wordlists(text_block, english_words, french_words):
    en_count = sum(1 for word in text_block.split() if word in english_words)
    fr_count = sum(1 for word in text_block.split() if word in french_words)
    
    print('english words:', list(word for word in text_block.split() if word in english_words))
    print('french words:', list(word for word in text_block.split() if word in french_words))
    print(f'{en_count=}, {fr_count=}')    
    
def most_common_word_info(counter_obj, n=10):  
    length = counter_obj.total()
    c_v = 0
    for k, v in counter_obj.most_common(n):
        rng = f'({(100 * c_v) / length:.0f}%'
        c_v += v
        rng += f'-{(100 * c_v) / length:.0f}%)'
        print(f'{k:<20}{v:>8}{(100 * v) / length:>8.0f}%{rng:>15}')

def nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def count_nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return len(sorted_list[index:]) if greater_than else len(sorted_list[-max(index, 1):])

def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    count_if_gte = sorted_list[len(sorted_list) - index][1]
    count_if_lte = sorted_list[index][1]
        
    gte = {x for x in counter_expanded if x[1] >= count_if_gte}
    lte = {x for x in counter_expanded if x[1] <= count_if_lte}
    
    return len(gte) if greater_than else len(lte)

def examples_at_word_count(n, counter_obj, n_samples=5, tolerance=0.1):
    lower, upper = int((1 - 0.1) * n), int((1 + 0.1) * n)
    all_examples = [k for k, v in counter_obj.items() if lower <= v <= upper]
    
    return random.sample(all_examples, min(n_samples, len(all_examples)))


In [6]:
test_wordlists('Total mortalities at age, based on survey data, are presented in Table', english_words, french_words)
print()
most_common_word_info(english_word_counts, 4)
print()
most_common_word_info(french_word_counts, 4)

english words: ['mortalities', 'presented']
french words: []
en_count=2, fr_count=0

the                   168894       7%        (0%-7%)
of                     94029       4%       (7%-11%)
and                    93778       4%      (11%-15%)
in                     72480       3%      (15%-19%)

de                    237366       8%        (0%-8%)
la                    140133       5%       (8%-13%)
et                    100646       4%      (13%-17%)
les                   100095       4%      (17%-21%)


In [7]:
for p in [0.1, 0.5, 0.95]:
    print(p, nth_percentile(p, english_word_counts), nth_percentile(p, english_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False))

print()

for n in [1, 10, 100, 1000, 10000]:
    print(n, examples_at_word_count(n, english_word_counts))

0.1 ('endemism', 87) ('terebratulina', 1)
0.1 weighted ('of', 94029) ('adolescent', 99)
0.5 ('contradiction', 3) ('hshlf', 3)
0.5 weighted ('landings', 2571) ('landings', 2571)
0.95 ('biron', 1) ('linear', 238)
0.95 weighted ('cobble', 34) ('the', 168894)

1 ['dermochelys', 'redefinition', 'chrome', 'leaded', 'barite']
10 ['psa', 'doubt', 'scientifique', 'ai', 'carroll']
100 ['alteration', 'benchmarks', 'nations', 'accurately', 'recognized']
1000 ['bc', 'decline', 'rivers', 'vessel', 'productivity']
10000 ['not', 'data', 'at']


In [11]:
for p in [0.001, 0.01, .99, .999]:
    print(p, nth_percentile(p, english_word_counts), nth_percentile(p, english_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False))

0.001 ('an', 7322) ('skipper', 1)
0.001 weighted ('the', 168894) ('steedman', 1)
0.01 ('indicators', 1239) ('firefighting', 1)
0.01 weighted ('the', 168894) ('myxine', 4)
0.99 ('cognition', 1) ('indicators', 1239)
0.99 weighted ('visualizing', 4) ('the', 168894)
0.999 ('jacking', 1) ('an', 7322)
0.999 weighted ('bloch', 1) ('the', 168894)


In [8]:
for p in [0.1, 0.5, 0.95]:
    print(p, nth_percentile(p, french_word_counts), nth_percentile(p, french_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, french_word_counts_expanded), nth_percentile_weighted(p, french_word_counts_expanded, False))

print()

for n in [1, 10, 100, 1000, 10000]:
    print(n, examples_at_word_count(n, french_word_counts))

0.1 ('appelée', 64) ('héberge', 1)
0.1 weighted ('la', 140133) ('estivale', 96)
0.5 ('cubiques', 3) ('résistent', 3)
0.5 weighted ('ne', 5866) ('ne', 5866)
0.95 ('interagency', 1) ('conseil', 172)
0.95 weighted ('simard', 32) ('de', 237366)

1 ['drauch', 'dependent', 'entraveraient', 'autoroutes', 'cristina']
10 ['pétrolier', 'sépare', 'retrouvées', 'bocaccios', 'kumar']
100 ['incluant', 'océaniques', 'acceptent', 'considère', 'considération']
1000 ['indice', 'tac', 'inférieure', 'capacité', 'cas']
10000 ['ou', 'avec', 'pas', 'aux']


In [12]:
for p in [0.001, 0.01, .99, .999]:
    print(p, nth_percentile(p, french_word_counts), nth_percentile(p, french_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, french_word_counts_expanded), nth_percentile_weighted(p, french_word_counts_expanded, False))

0.001 ('il', 8012) ('convoquer', 1)
0.001 weighted ('de', 237366) ('plivelic', 1)
0.01 ('différentes', 954) ('opérateurs', 1)
0.01 weighted ('de', 237366) ('synonyme', 4)
0.99 ('fréquentations', 1) ('différentes', 954)
0.99 weighted ('rein', 4) ('de', 237366)
0.999 ('retirez', 1) ('il', 8012)
0.999 weighted ('strub', 1) ('de', 237366)


In [35]:
def process_text(text, references_pattern):
    valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
    min_length, max_length = 5, 20
    
    document_parts = re.split(references_pattern, text, flags=re.IGNORECASE)
    if 2 < len(document_parts) < 5:
        text = document_parts[1]
    word_list = [word for word in text.split() if valid_word_regex.match(word)]
    
    cleaned_text = re.sub(r'[^a-zA-ZÀ-ÿ.\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    sentence_list = [
        sentence.strip() for sentence in cleaned_text.split('.')
        if min_length <= len(sentence.split()) <= max_length
        and all(valid_word_regex.match(word) for word in sentence.split())
    ]
    
    return word_list, sentence_list


def generate_word_lists(n):
    french_word_list = []
    english_word_list = []
    exclude_words_with_less_than_n = n
    
    # example sentences
    french_example_sentences = []
    english_example_sentences = []
    
    references_fr = r'RÉFÉRENCES CITÉES'.lower()
    references_en = r'REFERENCES CITED'.lower()
        
    for i, row in lang_df.iterrows():
        fr_path, en_path = get_filepaths(row)
        
        with open(fr_path, 'r', encoding='utf-8') as file:
            fr_text = json.load(file).get('text', '').lower()
            word_list, sentence_list = process_text(fr_text, references_fr)
            french_word_list.extend(word_list)
            french_example_sentences.extend(sentence_list)
        
        with open(en_path, 'r', encoding='utf-8') as file:
            en_text = json.load(file).get('text', '').lower()
            word_list, sentence_list = process_text(en_text, references_en)
            english_word_list.extend(word_list)
            english_example_sentences.extend(sentence_list)
            
    # Remove words with less than 10 occurrences
    french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
    english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]
    
    # Convert to sets for further processing
    french_words = set(french_word_list)
    english_words = set(english_word_list)
    
    # Remove overlapping words
    overlapping_words = english_words & french_words
    english_words.difference_update(overlapping_words)
    french_words.difference_update(overlapping_words)
    
    # Remove numeric-only words
    english_words.difference_update({w for w in english_words if w.isnumeric()})
    french_words.difference_update({w for w in french_words if w.isnumeric()})
    
    return english_words, french_words, french_example_sentences, english_example_sentences


In [150]:
_, _, french_example_sentences, english_example_sentences = generate_word_lists(0)

In [30]:
for n in [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]:
    print(n, sum([1 for x in french_word_counts.values() if x > n]), sum([1 for x in english_word_counts.values() if x > n]))

0 31713 25194
1 21977 17713
5 12428 10323
10 9087 7741
20 6304 5591
50 3716 3492
100 2327 2301
200 1397 1436
500 618 657
1000 298 319


In [42]:
random.sample(french_example_sentences, 3)

['résultats de lestimation de la température au fond car pour chaque année',
 'ces unités seront utilisées pour aider à atteindre le critère de représentativité pour la conception du réseau damp',
 'ajustements du modèle et étalonnage fondé sur la longueur sélectionné pour triglops murrayi']

In [None]:
# n_trials = 10
# n_to_exclude = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
# 
# french_example_sentences_n = random.sample(french_example_sentences, n_trials)
# english_example_sentences_n = random.sample(english_example_sentences, n_trials)
# results = []
# 
# for n in n_to_exclude:
#     print(f'Processing {n}')
#     english_words, french_words, french_example_sentences, english_example_sentences = generate_word_lists(n)
#     
#     for sentence in french_example_sentences_n:
#         fr_count = sum(1 for word in sentence.split() if word in french_words)
#         en_count = sum(1 for word in sentence.split() if word in english_words)
#         results.append((n, 'fr', fr_count, en_count))
#     
#     for sentence in english_example_sentences_n:
#         fr_count = sum(1 for word in sentence.split() if word in french_words)
#         en_count = sum(1 for word in sentence.split() if word in english_words)
#         results.append((n, 'en', fr_count, en_count))
#     
# results_df = pd.DataFrame(results)
# results_df.columns = ['n_excluded', 'language', 'fr_count', 'en_count']
# 
# results_df['total_count'] = results_df['fr_count'] + results_df['en_count']
# 
# valid_mask = results_df['total_count'] > 0
# 
# results_df['p_correct'] = np.where(
#     (results_df['language'] == 'fr') & valid_mask,
#     results_df['fr_count'] / results_df['total_count'],
#     np.where(
#         (results_df['language'] == 'en') & valid_mask,
#         results_df['en_count'] / results_df['total_count'],
#         0
#     )
# )
# 
# results_df['p_wrong'] = np.where(
#     (results_df['language'] == 'fr') & valid_mask,
#     results_df['en_count'] / results_df['total_count'],
#     np.where(
#         (results_df['language'] == 'en') & valid_mask,
#         results_df['fr_count'] / results_df['total_count'],
#         0
#     )
# )

In [109]:
# def display_results(n, show_languages=False):
#     print('TOTAL')
#     display(results_df.loc[results_df.n_excluded == n, ['total_count', 'p_correct', 'p_wrong']].describe([0.025, .975]).T[['2.5%', 'mean', '97.5%']])
#     if show_languages:
#         print('FRENCH')
#         display(results_df.loc[(results_df.n_excluded == n) & (results_df.language == 'fr'), ['total_count', 'p_correct', 'p_wrong']].describe([0.025, .975]).T[['2.5%', 'mean', '97.5%']])
#         print('ENGLISH')
#         display(results_df.loc[(results_df.n_excluded == n) & (results_df.language == 'en'), ['total_count', 'p_correct', 'p_wrong']].describe([0.025, .975]).T[['2.5%', 'mean', '97.5%']])
#     print('\n')
# 
# for n in n_to_exclude:
#     print(f'\nWords with below {n} counts excluded\n')
#     display_results(n)

In [129]:
n_trials = 100
n_to_exclude = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]

french_example_sentences_n = random.sample(french_example_sentences, n_trials)
english_example_sentences_n = random.sample(english_example_sentences, n_trials)
results = []

for n in n_to_exclude:
    print(f'Processing {n}')
    english_words, french_words, french_example_sentences, english_example_sentences = generate_word_lists(n)
    
    for sentence in french_example_sentences_n:
        fr_count = sum(1 for word in sentence.split() if word in french_words)
        en_count = sum(1 for word in sentence.split() if word in english_words)
        results.append((n, 'fr', fr_count, en_count))
    
    for sentence in english_example_sentences_n:
        fr_count = sum(1 for word in sentence.split() if word in french_words)
        en_count = sum(1 for word in sentence.split() if word in english_words)
        results.append((n, 'en', fr_count, en_count))
    
results_df = pd.DataFrame(results)
results_df.columns = ['n_excluded', 'language', 'fr_count', 'en_count']

results_df['total_count'] = results_df['fr_count'] + results_df['en_count']

valid_mask = results_df['total_count'] > 0

results_df['correct_count'] = np.where(
    (results_df['language'] == 'fr') & valid_mask,
    results_df['fr_count'],
    np.where(
        (results_df['language'] == 'en') & valid_mask,
        results_df['en_count'],
        0
    )
)

results_df['wrong_count'] = np.where(
    (results_df['language'] == 'fr') & valid_mask,
    results_df['en_count'],
    np.where(
        (results_df['language'] == 'en') & valid_mask,
        results_df['fr_count'],
        0
    )
)

results_df['is_correct'] = results_df['correct_count'] > results_df['wrong_count']


Processing 0
Processing 1
Processing 5
Processing 10
Processing 20
Processing 50
Processing 100
Processing 200
Processing 500
Processing 1000


In [130]:
results_df.head()

Unnamed: 0,n_excluded,language,fr_count,en_count,total_count,correct_count,wrong_count,is_correct
0,0,fr,3,0,3,3,0,True
1,0,fr,7,0,7,7,0,True
2,0,fr,3,0,3,3,0,True
3,0,fr,4,0,4,4,0,True
4,0,fr,3,0,3,3,0,True


In [134]:
# Define FP & FN for each language
results_df['fr_false_positive'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)
results_df['fr_false_negative'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)

results_df['en_false_positive'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)
results_df['en_false_negative'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)

# Compute separate aggregations
grouped_df = results_df.groupby('n_excluded').agg(
    total_count=('total_count', 'sum'),
    
    # Correct and incorrect classifications
    correct_count=('is_correct', 'sum'),
    wrong_count=('is_correct', lambda x: (~x).sum()),  

    # False Positives & False Negatives for each language
    fr_false_positive=('fr_false_positive', 'sum'),
    fr_false_negative=('fr_false_negative', 'sum'),
    en_false_positive=('en_false_positive', 'sum'),
    en_false_negative=('en_false_negative', 'sum')
).reset_index()

# Avoid division by zero
valid_mask = grouped_df['total_count'] > 0

# Accuracy (same for both languages)
grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)

# Precision & Recall for French
fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
grouped_df['fr_precision'] = np.where(
    fr_precision_mask, 
    grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
    0
)

fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
grouped_df['fr_recall'] = np.where(
    fr_recall_mask, 
    grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
    0
)

# Precision & Recall for English
en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
grouped_df['en_precision'] = np.where(
    en_precision_mask, 
    grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
    0
)

en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
grouped_df['en_recall'] = np.where(
    en_recall_mask, 
    grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
    0
)

# Compute F1-score separately
grouped_df['fr_f1_score'] = np.where(
    (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
    2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
    0
)

grouped_df['en_f1_score'] = np.where(
    (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
    2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
    0
)

In [139]:
grouped_df.set_index('n_excluded').T

n_excluded,0,1,5,10,20,50,100,200,500,1000
total_count,528.0,528.0,835.0,906.0,1057.0,1152.0,1229.0,1278.0,1223.0,1210.0
correct_count,155.0,155.0,171.0,174.0,178.0,183.0,183.0,180.0,181.0,183.0
wrong_count,45.0,45.0,29.0,26.0,22.0,17.0,17.0,20.0,19.0,17.0
fr_false_positive,29.0,29.0,15.0,13.0,10.0,4.0,4.0,6.0,5.0,6.0
fr_false_negative,16.0,16.0,14.0,13.0,12.0,13.0,13.0,14.0,14.0,11.0
en_false_positive,16.0,16.0,14.0,13.0,12.0,13.0,13.0,14.0,14.0,11.0
en_false_negative,29.0,29.0,15.0,13.0,10.0,4.0,4.0,6.0,5.0,6.0
accuracy,0.29,0.29,0.2,0.19,0.17,0.16,0.15,0.14,0.15,0.15
fr_precision,0.84,0.84,0.92,0.93,0.95,0.98,0.98,0.97,0.97,0.97
fr_recall,0.91,0.91,0.92,0.93,0.94,0.93,0.93,0.93,0.93,0.94


In [143]:
grouped_df.set_index('n_excluded').drop(['total_count', 'fr_false_positive',
       'fr_false_negative', 'en_false_positive', 'en_false_negative'], axis=1)

Unnamed: 0_level_0,correct_count,wrong_count,accuracy,fr_precision,fr_recall,en_precision,en_recall,fr_f1_score,en_f1_score
n_excluded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,155,45,0.29,0.84,0.91,0.91,0.84,0.87,0.87
1,155,45,0.29,0.84,0.91,0.91,0.84,0.87,0.87
5,171,29,0.2,0.92,0.92,0.92,0.92,0.92,0.92
10,174,26,0.19,0.93,0.93,0.93,0.93,0.93,0.93
20,178,22,0.17,0.95,0.94,0.94,0.95,0.94,0.94
50,183,17,0.16,0.98,0.93,0.93,0.98,0.96,0.96
100,183,17,0.15,0.98,0.93,0.93,0.98,0.96,0.96
200,180,20,0.14,0.97,0.93,0.93,0.97,0.95,0.95
500,181,19,0.15,0.97,0.93,0.93,0.97,0.95,0.95
1000,183,17,0.15,0.97,0.94,0.94,0.97,0.96,0.96


In [141]:
best_by_feature = [50, 100, 1000]

Index(['total_count', 'correct_count', 'wrong_count', 'fr_false_positive',
       'fr_false_negative', 'en_false_positive', 'en_false_negative',
       'accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall',
       'fr_f1_score', 'en_f1_score'],
      dtype='object')

In [122]:
results_df.groupby('n_excluded').agg(
    mean_correct=('correct_count', 'mean'),
    mean_wrong=('wrong_count', 'mean'),
    mean_accuracy=('accuracy', 'mean'),
    mean_precision=('precision', 'mean'),
    mean_recall=('recall', 'mean'),
    mean_f1_score=('f1_score', 'mean'),
).reset_index()

Unnamed: 0,n_excluded,mean_correct,mean_wrong,mean_accuracy,mean_precision,mean_recall,mean_f1_score
0,0,2.83,0.01,0.81,0.81,0.81,0.81
1,1,2.83,0.01,0.81,0.81,0.81,0.81
2,5,4.42,0.08,0.9,0.9,0.9,0.9
3,10,4.98,0.12,0.92,0.92,0.92,0.92
4,20,5.64,0.19,0.91,0.91,0.91,0.91
5,50,6.07,0.29,0.92,0.92,0.92,0.92
6,100,6.38,0.39,0.93,0.93,0.93,0.93
7,200,6.54,0.35,0.93,0.93,0.93,0.93
8,500,6.13,0.39,0.93,0.93,0.93,0.93
9,1000,5.97,0.33,0.92,0.92,0.92,0.92


In [123]:
# backup old results_df
results_df_BACKUP = results_df.copy() 

In [126]:
# repeated with tweaked hyperparams

n_trials = 1000
n_to_exclude = [x for x in range(50, 550, 50)] + [x for x in range(600, 1100, 100)] + [x for x in range(1200, 2200, 200)]

french_example_sentences_n = random.sample(french_example_sentences, n_trials)
english_example_sentences_n = random.sample(english_example_sentences, n_trials)
results = []

for n in n_to_exclude:
    print(f'Processing {n}')
    english_words, french_words, _, _ = generate_word_lists(n)
    
    for sentence in french_example_sentences_n:
        fr_count = sum(1 for word in sentence.split() if word in french_words)
        en_count = sum(1 for word in sentence.split() if word in english_words)
        results.append((n, 'fr', fr_count, en_count))
    
    for sentence in english_example_sentences_n:
        fr_count = sum(1 for word in sentence.split() if word in french_words)
        en_count = sum(1 for word in sentence.split() if word in english_words)
        results.append((n, 'en', fr_count, en_count))
    
results_df = pd.DataFrame(results)
results_df.columns = ['n_excluded', 'language', 'fr_count', 'en_count']

results_df['total_count'] = results_df['fr_count'] + results_df['en_count']

valid_mask = results_df['total_count'] > 0

results_df['correct_count'] = np.where(
    (results_df['language'] == 'fr') & valid_mask,
    results_df['fr_count'],
    np.where(
        (results_df['language'] == 'en') & valid_mask,
        results_df['en_count'],
        0
    )
)

results_df['wrong_count'] = np.where(
    (results_df['language'] == 'fr') & valid_mask,
    results_df['en_count'],
    np.where(
        (results_df['language'] == 'en') & valid_mask,
        results_df['fr_count'],
        0
    )
)

results_df['is_correct'] = results_df['correct_count'] > results_df['wrong_count']

KeyboardInterrupt: 