In [1]:
import json
import os
import random
import re
import subprocess
import numpy as np
import pandas as pd

from collections import Counter

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# all files that have been downloaded and parsed
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year = 2023
parsed_files = list()
parsed_files_with_hq_ocr = list()
for folder in os.listdir(parsed_docs_folder):
    path = os.path.join(parsed_docs_folder, folder)
    if os.path.isdir(path):
        for json_file in os.listdir(path):
            if json_file.endswith(".json"):
                parsed_files.append(json_file.replace('.json', ''))
                if folder in [str(year) for year in range(min_year, 2024 + 1)]:
                    parsed_files_with_hq_ocr.append(json_file.replace('.json', ''))

# all files from website
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)
# exclude files that aren't downloaded, and files that have been withdrawn
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en.isin(parsed_files)) | (fr_eng_correlation_df.filename_fr.isin(parsed_files))]
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en != 'WITHDRAWN') & (fr_eng_correlation_df.filename_fr != 'WITHDRAWN')]

# weblinks for previewing / checking results
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# data to be used for language classifier
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.filename_fr.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_en.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_fr != lang_df.filename_en)]


# helper functions

In [3]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


In [4]:
# Make lists of all French words and all English words

valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
french_word_list = []
english_word_list = []
exclude_words_with_less_than_n = 10

# clean headers and appendices
references_fr = r'RÉFÉRENCES CITÉES'.lower()
references_en = r'REFERENCES CITED'.lower()


for i, row in lang_df.iterrows():
    fr_path, en_path = get_filepaths(row)
    
    with open(fr_path, 'r', encoding='utf-8') as file:
        fr_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_fr, fr_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            fr_text = parts[1]
        
        french_word_list.extend(word for word in fr_text.split() if valid_word_regex.match(word))
    
    with open(en_path, 'r', encoding='utf-8') as file:
        en_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_en, en_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            en_text = parts[1]
        
        english_word_list.extend(word for word in en_text.split() if valid_word_regex.match(word))
        
# For testing
french_word_counts = Counter(french_word_list)
french_word_counts_expanded = []
for word, count in french_word_counts.items():
    for _ in range(count):
        french_word_counts_expanded.append((word, count))
        
english_word_counts = Counter(english_word_list)
english_word_counts_expanded = []
for word, count in english_word_counts.items():
    for _ in range(count):
        english_word_counts_expanded.append((word, count))

full_french_word_list = french_word_list.copy()
full_english_word_list = english_word_list.copy()

# Remove words with less than 10 occurrences
french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]

# Convert to sets for further processing
french_words = set(french_word_list)
english_words = set(english_word_list)

# Remove overlapping words
overlapping_words = english_words & french_words
english_words.difference_update(overlapping_words)
french_words.difference_update(overlapping_words)

# Remove numeric-only words
english_words.difference_update({w for w in english_words if w.isnumeric()})
french_words.difference_update({w for w in french_words if w.isnumeric()})


In [5]:
# helper functions for word lists

def test_wordlists(text_block, english_words, french_words):
    en_count = sum(1 for word in text_block.split() if word in english_words)
    fr_count = sum(1 for word in text_block.split() if word in french_words)
    
    print('english words:', list(word for word in text_block.split() if word in english_words))
    print('french words:', list(word for word in text_block.split() if word in french_words))
    print(f'{en_count=}, {fr_count=}')    
    
def most_common_word_info(counter_obj, n=10):  
    length = counter_obj.total()
    c_v = 0
    for k, v in counter_obj.most_common(n):
        rng = f'({(100 * c_v) / length:.0f}%'
        c_v += v
        rng += f'-{(100 * c_v) / length:.0f}%)'
        print(f'{k:<20}{v:>8}{(100 * v) / length:>8.0f}%{rng:>15}')

def nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def count_nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return len(sorted_list[index:]) if greater_than else len(sorted_list[-max(index, 1):])

def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    count_if_gte = sorted_list[len(sorted_list) - index][1]
    count_if_lte = sorted_list[index][1]
        
    gte = {x for x in counter_expanded if x[1] >= count_if_gte}
    lte = {x for x in counter_expanded if x[1] <= count_if_lte}
    
    return len(gte) if greater_than else len(lte)

def examples_at_word_count(n, counter_obj, n_samples=5, tolerance=0.1):
    lower, upper = int((1 - 0.1) * n), int((1 + 0.1) * n)
    all_examples = [k for k, v in counter_obj.items() if lower <= v <= upper]
    
    return random.sample(all_examples, min(n_samples, len(all_examples)))


In [6]:
test_wordlists('Total mortalities at age, based on survey data, are presented in Table', english_words, french_words)
print()
most_common_word_info(english_word_counts, 4)
print()
most_common_word_info(french_word_counts, 4)

english words: ['mortalities', 'presented']
french words: []
en_count=2, fr_count=0

the                   168894       7%        (0%-7%)
of                     94029       4%       (7%-11%)
and                    93778       4%      (11%-15%)
in                     72480       3%      (15%-19%)

de                    237366       8%        (0%-8%)
la                    140133       5%       (8%-13%)
et                    100646       4%      (13%-17%)
les                   100095       4%      (17%-21%)


In [7]:
for p in [0.1, 0.5, 0.95]:
    print(p, nth_percentile(p, english_word_counts), nth_percentile(p, english_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False))

print()

for n in [1, 10, 100, 1000, 10000]:
    print(n, examples_at_word_count(n, english_word_counts))

0.1 ('endemism', 87) ('terebratulina', 1)
0.1 weighted ('of', 94029) ('adolescent', 99)
0.5 ('contradiction', 3) ('hshlf', 3)
0.5 weighted ('landings', 2571) ('landings', 2571)
0.95 ('biron', 1) ('linear', 238)
0.95 weighted ('cobble', 34) ('the', 168894)

1 ['franceschini', 'abell', 'lodge', 'geosci', 'medusarum']
10 ['sheila', 'footage', 'stepwise', 'compound', 'huron']
100 ['cage', 'y', 'adolescent', 'charlotte', 'recognized']
1000 ['redfish', 'collected', 'program', 'bc', 'part']
10000 ['not', 'at', 'data']


In [8]:
for p in [0.001, 0.01, .99, .999]:
    print(p, nth_percentile(p, english_word_counts), nth_percentile(p, english_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False))

0.001 ('an', 7322) ('skipper', 1)
0.001 weighted ('the', 168894) ('steedman', 1)
0.01 ('indicators', 1239) ('firefighting', 1)
0.01 weighted ('the', 168894) ('myxine', 4)
0.99 ('cognition', 1) ('indicators', 1239)
0.99 weighted ('visualizing', 4) ('the', 168894)
0.999 ('jacking', 1) ('an', 7322)
0.999 weighted ('bloch', 1) ('the', 168894)


In [9]:
for p in [0.1, 0.5, 0.95]:
    print(p, nth_percentile(p, french_word_counts), nth_percentile(p, french_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, french_word_counts_expanded), nth_percentile_weighted(p, french_word_counts_expanded, False))

print()

for n in [1, 10, 100, 1000, 10000]:
    print(n, examples_at_word_count(n, french_word_counts))

0.1 ('appelée', 64) ('héberge', 1)
0.1 weighted ('la', 140133) ('estivale', 96)
0.5 ('cubiques', 3) ('résistent', 3)
0.5 weighted ('ne', 5866) ('ne', 5866)
0.95 ('interagency', 1) ('conseil', 172)
0.95 weighted ('simard', 32) ('de', 237366)

1 ['invoquait', 'citez', 'neighbour', 'bessel', 'mésotrophes']
10 ['viser', 'inscription', 'chapter', 'reports', 'consister']
100 ['chaîne', 'thompson', 'indirects', 'infratidale', 'décalage']
1000 ['aussi', 'fois', 'changement', 'leurs', 'indiquent']
10000 ['pas', 'aux', 'ou', 'avec']


In [10]:
for p in [0.001, 0.01, .99, .999]:
    print(p, nth_percentile(p, french_word_counts), nth_percentile(p, french_word_counts, False))
    print(p, 'weighted', nth_percentile_weighted(p, french_word_counts_expanded), nth_percentile_weighted(p, french_word_counts_expanded, False))

0.001 ('il', 8012) ('convoquer', 1)
0.001 weighted ('de', 237366) ('plivelic', 1)
0.01 ('différentes', 954) ('opérateurs', 1)
0.01 weighted ('de', 237366) ('synonyme', 4)
0.99 ('fréquentations', 1) ('différentes', 954)
0.99 weighted ('rein', 4) ('de', 237366)
0.999 ('retirez', 1) ('il', 8012)
0.999 weighted ('strub', 1) ('de', 237366)


In [11]:
def process_text(text, references_pattern):
    valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
    min_length, max_length = 5, 20
    
    document_parts = re.split(references_pattern, text, flags=re.IGNORECASE)
    if 2 < len(document_parts) < 5:
        text = document_parts[1]
    word_list = [word for word in text.split() if valid_word_regex.match(word)]
    
    cleaned_text = re.sub(r'[^a-zA-ZÀ-ÿ.\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    sentence_list = [
        sentence.strip() for sentence in cleaned_text.split('.')
        if min_length <= len(sentence.split()) <= max_length
        and all(valid_word_regex.match(word) for word in sentence.split())
    ]
    
    return word_list, sentence_list


def generate_word_lists(n):
    french_word_list = []
    english_word_list = []
    exclude_words_with_less_than_n = n
    
    # example sentences
    french_example_sentences = []
    english_example_sentences = []
    
    references_fr = r'RÉFÉRENCES CITÉES'.lower()
    references_en = r'REFERENCES CITED'.lower()
        
    for i, row in lang_df.iterrows():
        fr_path, en_path = get_filepaths(row)
        
        with open(fr_path, 'r', encoding='utf-8') as file:
            fr_text = json.load(file).get('text', '').lower()
            word_list, sentence_list = process_text(fr_text, references_fr)
            french_word_list.extend(word_list)
            french_example_sentences.extend(sentence_list)
        
        with open(en_path, 'r', encoding='utf-8') as file:
            en_text = json.load(file).get('text', '').lower()
            word_list, sentence_list = process_text(en_text, references_en)
            english_word_list.extend(word_list)
            english_example_sentences.extend(sentence_list)
            
    # Remove words with less than 10 occurrences
    french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
    english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]
    
    # Convert to sets for further processing
    french_words = set(french_word_list)
    english_words = set(english_word_list)
    
    # Remove overlapping words
    overlapping_words = english_words & french_words
    english_words.difference_update(overlapping_words)
    french_words.difference_update(overlapping_words)
    
    # Remove numeric-only words
    english_words.difference_update({w for w in english_words if w.isnumeric()})
    french_words.difference_update({w for w in french_words if w.isnumeric()})
    
    return english_words, french_words, french_example_sentences, english_example_sentences


In [12]:
# good start for hyperparam(s)
for n in [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]:
    print(n, sum([1 for x in french_word_counts.values() if x > n]), sum([1 for x in english_word_counts.values() if x > n]))

0 31713 25194
1 21977 17713
5 12428 10323
10 9087 7741
20 6304 5591
50 3716 3492
100 2327 2301
200 1397 1436
500 618 657
1000 298 319


In [13]:
def process_results(n_to_exclude, n_trials):
    results = []
    
    _, _, french_example_sentences, english_example_sentences = generate_word_lists(0)
    french_example_sentences_n = random.sample(french_example_sentences, n_trials)
    english_example_sentences_n = random.sample(english_example_sentences, n_trials)
    
    for n in n_to_exclude:
        print(f'Processing {n}')
        english_words, french_words, french_example_sentences, english_example_sentences = generate_word_lists(n)
        
        for sentence in french_example_sentences_n:
            fr_count = sum(1 for word in sentence.split() if word in french_words)
            en_count = sum(1 for word in sentence.split() if word in english_words)
            results.append((n, 'fr', fr_count, en_count))
        
        for sentence in english_example_sentences_n:
            fr_count = sum(1 for word in sentence.split() if word in french_words)
            en_count = sum(1 for word in sentence.split() if word in english_words)
            results.append((n, 'en', fr_count, en_count))
        
    results_df = pd.DataFrame(results)
    results_df.columns = ['n_excluded', 'language', 'fr_count', 'en_count']
    
    results_df['total_count'] = results_df['fr_count'] + results_df['en_count']
    
    valid_mask = results_df['total_count'] > 0
    
    results_df['correct_count'] = np.where(
        (results_df['language'] == 'fr') & valid_mask,
        results_df['fr_count'],
        np.where(
            (results_df['language'] == 'en') & valid_mask,
            results_df['en_count'],
            0
        )
    )
    
    results_df['wrong_count'] = np.where(
        (results_df['language'] == 'fr') & valid_mask,
        results_df['en_count'],
        np.where(
            (results_df['language'] == 'en') & valid_mask,
            results_df['fr_count'],
            0
        )
    )
    
    results_df['is_correct'] = results_df['correct_count'] > results_df['wrong_count']
    
    return results_df


n_to_exclude = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
n_trials = 100
results_df = process_results(n_to_exclude, n_trials)


Processing 0
Processing 1
Processing 5
Processing 10
Processing 20
Processing 50
Processing 100
Processing 200
Processing 500
Processing 1000


In [14]:
# STATS

def create_stats(results_df):
    
    # Define FP & FN for each language
    results_df['fr_false_positive'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)
    results_df['fr_false_negative'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)
    results_df['en_false_positive'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)
    results_df['en_false_negative'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)
    
    # Compute separate aggregations
    grouped_df = results_df.groupby('n_excluded').agg(
        total_count=('is_correct', 'count'),  # count rows (not the same as results_df['total_count']
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  
    
        # False Positives & False Negatives for each language
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum')
    ).reset_index()
    
    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0
    
    # Accuracy (same for both languages)
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)
    
    # Precision & Recall for French
    fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
    grouped_df['fr_precision'] = np.where(
        fr_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
        0
    )
    
    fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
    grouped_df['fr_recall'] = np.where(
        fr_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
        0
    )
    
    # Precision & Recall for English
    en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
    grouped_df['en_precision'] = np.where(
        en_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
        0
    )
    
    en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
    grouped_df['en_recall'] = np.where(
        en_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
        0
    )
    
    # F1-scores
    grouped_df['fr_f1_score'] = np.where(
        (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
        2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
        0
    )
    
    grouped_df['en_f1_score'] = np.where(
        (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
        2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
        0
    )
    
    grouped_df['stats_sum'] = grouped_df[['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']].sum(axis=1)
    
    return grouped_df


grouped_df = create_stats(results_df)


In [15]:
grouped_df.set_index('n_excluded').T

n_excluded,0,1,5,10,20,50,100,200,500,1000
total_count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
correct_count,157.0,157.0,175.0,184.0,185.0,185.0,188.0,187.0,188.0,187.0
wrong_count,43.0,43.0,25.0,16.0,15.0,15.0,12.0,13.0,12.0,13.0
fr_false_positive,28.0,28.0,13.0,5.0,4.0,4.0,2.0,3.0,2.0,3.0
fr_false_negative,15.0,15.0,12.0,11.0,11.0,11.0,10.0,10.0,10.0,10.0
en_false_positive,15.0,15.0,12.0,11.0,11.0,11.0,10.0,10.0,10.0,10.0
en_false_negative,28.0,28.0,13.0,5.0,4.0,4.0,2.0,3.0,2.0,3.0
accuracy,0.79,0.79,0.88,0.92,0.93,0.93,0.94,0.94,0.94,0.94
fr_precision,0.85,0.85,0.93,0.97,0.98,0.98,0.99,0.98,0.99,0.98
fr_recall,0.91,0.91,0.94,0.94,0.94,0.94,0.95,0.95,0.95,0.95


In [21]:
# all stats added together
grouped_df.set_index('n_excluded')['stats_sum']

n_excluded
0      6.07
1      6.07
5      6.48
10     6.67
20     6.69
50     6.69
100    6.76
200    6.73
500    6.76
1000   6.73
Name: stats_sum, dtype: float64

In [17]:
# backup old dfs
results_df_BACKUP = results_df.copy() 
grouped_df_BACKUP = grouped_df.copy() 

In [18]:
n_to_exclude = [x for x in range(50, 550, 50)] + [x for x in range(600, 1100, 100)] + [x for x in range(1200, 2200, 200)]
n_trials = 1000

results_df = process_results(n_to_exclude, n_trials)
grouped_df = create_stats(results_df)

Processing 50
Processing 100
Processing 150
Processing 200
Processing 250
Processing 300
Processing 350
Processing 400
Processing 450
Processing 500
Processing 600
Processing 700
Processing 800
Processing 900
Processing 1000
Processing 1200
Processing 1400
Processing 1600
Processing 1800
Processing 2000


In [19]:
grouped_df.set_index('n_excluded').T

n_excluded,50,100,150,200,250,300,350,400,450,500,600,700,800,900,1000,1200,1400,1600,1800,2000
total_count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
correct_count,1854.0,1868.0,1871.0,1861.0,1860.0,1854.0,1856.0,1854.0,1853.0,1855.0,1854.0,1852.0,1849.0,1845.0,1844.0,1837.0,1823.0,1820.0,1815.0,1813.0
wrong_count,146.0,132.0,129.0,139.0,140.0,146.0,144.0,146.0,147.0,145.0,146.0,148.0,151.0,155.0,156.0,163.0,177.0,180.0,185.0,187.0
fr_false_positive,25.0,14.0,13.0,24.0,26.0,30.0,31.0,32.0,33.0,31.0,34.0,37.0,39.0,42.0,43.0,51.0,66.0,69.0,75.0,77.0
fr_false_negative,121.0,118.0,116.0,115.0,114.0,116.0,113.0,114.0,114.0,114.0,112.0,111.0,112.0,113.0,113.0,112.0,111.0,111.0,110.0,110.0
en_false_positive,121.0,118.0,116.0,115.0,114.0,116.0,113.0,114.0,114.0,114.0,112.0,111.0,112.0,113.0,113.0,112.0,111.0,111.0,110.0,110.0
en_false_negative,25.0,14.0,13.0,24.0,26.0,30.0,31.0,32.0,33.0,31.0,34.0,37.0,39.0,42.0,43.0,51.0,66.0,69.0,75.0,77.0
accuracy,0.93,0.93,0.94,0.93,0.93,0.93,0.93,0.93,0.93,0.93,0.93,0.93,0.92,0.92,0.92,0.92,0.91,0.91,0.91,0.91
fr_precision,0.99,0.99,0.99,0.99,0.99,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.98,0.97,0.97,0.96,0.96,0.96
fr_recall,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94,0.94


In [20]:
# all stats added together
grouped_df.set_index('n_excluded')['stats_sum']

n_excluded
50     6.70
100    6.73
150    6.74
200    6.72
250    6.71
300    6.70
350    6.71
400    6.70
450    6.70
500    6.70
600    6.70
700    6.70
800    6.69
900    6.68
1000   6.68
1200   6.66
1400   6.63
1600   6.63
1800   6.62
2000   6.61
Name: stats_sum, dtype: float64

In [22]:
# let's check again to see if it's still similar
results_df = process_results(n_to_exclude, n_trials)
grouped_df = create_stats(results_df)

Processing 50
Processing 100
Processing 150
Processing 200
Processing 250
Processing 300
Processing 350
Processing 400
Processing 450
Processing 500
Processing 600
Processing 700
Processing 800
Processing 900
Processing 1000
Processing 1200
Processing 1400
Processing 1600
Processing 1800
Processing 2000


In [23]:
grouped_df.set_index('n_excluded')['stats_sum']

n_excluded
50     6.77
100    6.79
150    6.79
200    6.77
250    6.77
300    6.77
350    6.77
400    6.76
450    6.76
500    6.74
600    6.75
700    6.74
800    6.73
900    6.75
1000   6.75
1200   6.72
1400   6.71
1600   6.70
1800   6.69
2000   6.68
Name: stats_sum, dtype: float64

In [24]:
# tighter hyperparams, more trials
grouped_df = create_stats(process_results([x for x in range(50, 525, 25)], 5000))
grouped_df.set_index('n_excluded')['stats_sum']

Processing 50
Processing 75
Processing 100
Processing 125
Processing 150
Processing 175
Processing 200
Processing 225
Processing 250
Processing 275
Processing 300
Processing 325
Processing 350
Processing 375
Processing 400
Processing 425
Processing 450
Processing 475
Processing 500


n_excluded
50    6.72
75    6.74
100   6.74
125   6.74
150   6.75
175   6.75
200   6.74
225   6.74
250   6.74
275   6.74
300   6.74
325   6.75
350   6.75
375   6.75
400   6.75
425   6.74
450   6.74
475   6.74
500   6.73
Name: stats_sum, dtype: float64

# Save Data For Classifier

In [39]:
english_words, french_words, french_example_sentences, english_example_sentences = generate_word_lists(150)

# save optimised word lists 
with open("language_classifier/wordlists.json", "w", encoding="utf-8") as f:
    json.dump({"en": list(english_words), "fr": list(french_words)}, f, ensure_ascii=False, indent=4)
    
# save 10k sentences per language for testing
with open("language_classifier/example_sentences.json", "w", encoding="utf-8") as f:
    json.dump({"en": random.sample(english_example_sentences, 10000), "fr": random.sample(french_example_sentences, 10000)}, f, ensure_ascii=False, indent=4)