In [107]:
import unicodedata
import json
import os
import random
import re
import subprocess
import numpy as np
import pandas as pd

from collections import Counter

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [7]:
# all files that have been downloaded and parsed
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year = 2023
parsed_files = list()
parsed_files_with_hq_ocr = list()
for folder in os.listdir(parsed_docs_folder):
    path = os.path.join(parsed_docs_folder, folder)
    if os.path.isdir(path):
        for json_file in os.listdir(path):
            if json_file.endswith(".json"):
                parsed_files.append(json_file.replace('.json', ''))
                if folder in [str(year) for year in range(min_year, 2024 + 1)]:
                    parsed_files_with_hq_ocr.append(json_file.replace('.json', ''))

# all files from website
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)
# exclude files that aren't downloaded, and files that have been withdrawn
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en.isin(parsed_files)) | (fr_eng_correlation_df.filename_fr.isin(parsed_files))]
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en != 'WITHDRAWN') & (fr_eng_correlation_df.filename_fr != 'WITHDRAWN')]

# weblinks for previewing / checking results
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# data to be used for language classifier
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.filename_fr.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_en.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_fr != lang_df.filename_en)]

# scrabble dictionaries for removing questionable words
with open('language_classifier/scrabble_dictionaries/en_scrabble.txt', 'r') as f:
    en_scrabble = {line.lower().strip() for line in f}
    
with open('language_classifier/scrabble_dictionaries/fr_scrabble.txt', 'r') as f:
    fr_scrabble = {line.lower().strip() for line in f}

# helper functions

In [15]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


def remove_accents(word):
    return ''.join(c for c in unicodedata.normalize('NFKD', word) if not unicodedata.combining(c))


def remove_non_scrabble_words(words, scrabble_dictionary):
    return {word for word in words if remove_accents(word) in scrabble_dictionary}


def test_wordlists(text_block, english_words, french_words):
    en_count = sum(1 for word in text_block.split() if word in english_words)
    fr_count = sum(1 for word in text_block.split() if word in french_words)
    
    print('english words:', list(word for word in text_block.split() if word in english_words))
    print('french words:', list(word for word in text_block.split() if word in french_words))
    print(f'{en_count=}, {fr_count=}')    
    
    
def most_common_word_info(counter_obj, n=10):  
    length = counter_obj.total()
    c_v = 0
    for k, v in counter_obj.most_common(n):
        rng = f'({(100 * c_v) / length:.0f}%'
        c_v += v
        rng += f'-{(100 * c_v) / length:.0f}%)'
        print(f'{k:<20}{v:>8}{(100 * v) / length:>8.0f}%{rng:>15}')


def nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]


def nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]


def count_nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return len(sorted_list[index:]) if greater_than else len(sorted_list[-max(index, 1):])


def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    count_if_gte = sorted_list[len(sorted_list) - index][1]
    count_if_lte = sorted_list[index][1]
        
    gte = {x for x in counter_expanded if x[1] >= count_if_gte}
    lte = {x for x in counter_expanded if x[1] <= count_if_lte}
    
    return len(gte) if greater_than else len(lte)


def examples_at_word_count(n, counter_obj, n_samples=5, tolerance=0.1):
    lower, upper = int((1 - 0.1) * n), int((1 + 0.1) * n)
    all_examples = [k for k, v in counter_obj.items() if lower <= v <= upper]
    
    return random.sample(all_examples, min(n_samples, len(all_examples)))


In [66]:
def process_sentences_list(text, ref):
    valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
    min_length, max_length = 5, 20
    sentences = []
    
    document_parts = re.split(ref, text, flags=re.IGNORECASE)
    if 2 < len(document_parts) < 5:
        text = document_parts[1]
    
    cleaned_text = re.sub(r'[^a-zA-ZÀ-ÿ.\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return [
        sentence.strip() for sentence in cleaned_text.split('.')
        if min_length <= len(sentence.split()) <= max_length
        and all(valid_word_regex.match(word) for word in sentence.split())
    ]


def generate_sentences_lists():
    valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
    min_length, max_length = 5, 20
    
    french_example_sentences = []
    english_example_sentences = []
    
    references_fr = r'RÉFÉRENCES CITÉES'.lower()
    references_en = r'REFERENCES CITED'.lower()
        
    for i, row in lang_df.iterrows():
        fr_path, en_path = get_filepaths(row)
        
        with open(fr_path, 'r', encoding='utf-8') as file:
            fr_text = json.load(file).get('text', '').lower()
            french_example_sentences.extend(process_sentences_list(fr_text, references_fr))
        
        with open(en_path, 'r', encoding='utf-8') as file:
            en_text = json.load(file).get('text', '').lower()
            english_example_sentences.extend(process_sentences_list(en_text, references_en))
            
    # clean messy sentences
    margin_required = 1

    french_example_sentences = [
        sentence for sentence in french_example_sentences
        if sum(1 for x in sentence.split() if x in fr_scrabble) > sum(1 for x in sentence.split() if x in en_scrabble) + margin_required
    ]
    
    english_example_sentences = [
        sentence for sentence in english_example_sentences
        if sum(1 for x in sentence.split() if x in en_scrabble) > sum(1 for x in sentence.split() if x in fr_scrabble) + margin_required
    ]
        
    return french_example_sentences, english_example_sentences


In [29]:
# generate example sentences for testing
french_example_sentences, english_example_sentences = generate_sentences_lists()

In [30]:
# before scrabble cleaning
len(french_example_sentences), len(english_example_sentences)

(44271, 68556)

In [67]:
# after scrabble cleaning
french_example_sentences, english_example_sentences = generate_sentences_lists()
len(french_example_sentences), len(english_example_sentences)

(35393, 66084)

In [92]:
def process_word_list(text, references_pattern):
    valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
    
    document_parts = re.split(references_pattern, text, flags=re.IGNORECASE)
    if 2 < len(document_parts) < 5:
        text = document_parts[1]
    word_list = [word for word in text.split() if valid_word_regex.match(word)]
    
    return word_list


def generate_word_lists(n):
    french_word_list = []
    english_word_list = []
    exclude_words_with_less_than_n = n
    
    references_fr = r'RÉFÉRENCES CITÉES'.lower()
    references_en = r'REFERENCES CITED'.lower()
        
    for i, row in lang_df.iterrows():
        fr_path, en_path = get_filepaths(row)
        
        with open(fr_path, 'r', encoding='utf-8') as file:
            fr_text = json.load(file).get('text', '').lower()
            french_word_list.extend(process_word_list(fr_text, references_fr))
        
        with open(en_path, 'r', encoding='utf-8') as file:
            en_text = json.load(file).get('text', '').lower()
            english_word_list.extend(process_word_list(en_text, references_en))
    
    
    french_word_counts = Counter(french_word_list)
    english_word_counts = Counter(english_word_list)
            
    french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
    english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]
    
    french_words = set(french_word_list)
    english_words = set(english_word_list)
    
    french_word_list = remove_non_scrabble_words(french_word_list, fr_scrabble)
    english_word_list = remove_non_scrabble_words(english_word_list, en_scrabble)
    
    overlapping_words = english_words & french_words
    english_words.difference_update(overlapping_words)
    french_words.difference_update(overlapping_words)
    
    english_words.difference_update({w for w in english_words if w.isnumeric()})
    french_words.difference_update({w for w in french_words if w.isnumeric()})
    
    return english_words, french_words


In [93]:
for n in [0, 1, 10, 150, 1000]:
    english_words, french_words = generate_word_lists(n)
    print(len(english_words), len(french_words))

12395 18914
12395 18914
5618 7113
1531 1517
288 268


In [97]:
def process_results(n_to_exclude, french_example_sentences_n, english_example_sentences_n):
    results = []
    
    for n in n_to_exclude:
        print(f'Processing {n}')
        english_words, french_words = generate_word_lists(n)
        
        for sentence in french_example_sentences_n:
            fr_count = sum(1 for word in sentence.split() if word in french_words)
            en_count = sum(1 for word in sentence.split() if word in english_words)
            results.append((n, 'fr', fr_count, en_count))
        
        for sentence in english_example_sentences_n:
            fr_count = sum(1 for word in sentence.split() if word in french_words)
            en_count = sum(1 for word in sentence.split() if word in english_words)
            results.append((n, 'en', fr_count, en_count))
        
    results_df = pd.DataFrame(results)
    results_df.columns = ['n_excluded', 'language', 'fr_count', 'en_count']
    
    results_df['total_count'] = results_df['fr_count'] + results_df['en_count']
    
    valid_mask = results_df['total_count'] > 0
    
    results_df['correct_count'] = np.where(
        (results_df['language'] == 'fr') & valid_mask,
        results_df['fr_count'],
        np.where(
            (results_df['language'] == 'en') & valid_mask,
            results_df['en_count'],
            0
        )
    )
    
    results_df['wrong_count'] = np.where(
        (results_df['language'] == 'fr') & valid_mask,
        results_df['en_count'],
        np.where(
            (results_df['language'] == 'en') & valid_mask,
            results_df['fr_count'],
            0
        )
    )
    
    results_df['is_correct'] = results_df['correct_count'] > results_df['wrong_count']
    
    return results_df


def create_stats(results_df):
    
    # Define FP & FN for each language
    results_df['fr_false_positive'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)
    results_df['fr_false_negative'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)
    results_df['en_false_positive'] = (results_df['language'] == 'fr') & (results_df['is_correct'] == False)
    results_df['en_false_negative'] = (results_df['language'] == 'en') & (results_df['is_correct'] == False)
    
    # Compute separate aggregations
    grouped_df = results_df.groupby('n_excluded').agg(
        total_count=('is_correct', 'count'),  # count rows (not the same as results_df['total_count']
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  
    
        # False Positives & False Negatives for each language
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum')
    ).reset_index()
    
    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0
    
    # Accuracy (same for both languages)
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)
    
    # Precision & Recall for French
    fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
    grouped_df['fr_precision'] = np.where(
        fr_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
        0
    )
    
    fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
    grouped_df['fr_recall'] = np.where(
        fr_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
        0
    )
    
    # Precision & Recall for English
    en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
    grouped_df['en_precision'] = np.where(
        en_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
        0
    )
    
    en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
    grouped_df['en_recall'] = np.where(
        en_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
        0
    )
    
    # F1-scores
    grouped_df['fr_f1_score'] = np.where(
        (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
        2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
        0
    )
    
    grouped_df['en_f1_score'] = np.where(
        (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
        2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
        0
    )
    
    grouped_df['stats_sum'] = grouped_df[['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']].sum(axis=1)
    
    return grouped_df


In [98]:
n_to_exclude = [0, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
n_trials = 100

results_df = process_results(
    n_to_exclude, 
    random.sample(french_example_sentences, n_trials), 
    random.sample(english_example_sentences, n_trials)
)
grouped_df = create_stats(results_df)

Processing 0
Processing 1
Processing 5
Processing 10
Processing 20
Processing 50
Processing 100
Processing 200
Processing 500
Processing 1000


In [100]:
grouped_df.set_index('n_excluded').T

n_excluded,0,1,5,10,20,50,100,200,500,1000
total_count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
correct_count,177.0,177.0,191.0,198.0,198.0,199.0,199.0,198.0,198.0,196.0
wrong_count,23.0,23.0,9.0,2.0,2.0,1.0,1.0,2.0,2.0,4.0
fr_false_positive,21.0,21.0,9.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0
fr_false_negative,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
en_false_positive,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
en_false_negative,21.0,21.0,9.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0
accuracy,0.89,0.89,0.95,0.99,0.99,0.99,0.99,0.99,0.99,0.98
fr_precision,0.89,0.89,0.95,0.99,0.99,0.99,0.99,0.99,0.99,0.98
fr_recall,0.99,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99


In [101]:
# all stats added together
grouped_df.set_index('n_excluded')['stats_sum']

n_excluded
0      6.53
1      6.53
5      6.82
10     6.96
20     6.96
50     6.98
100    6.98
200    6.96
500    6.96
1000   6.92
Name: stats_sum, dtype: float64

In [102]:
# backup old dfs
results_df_BACKUP = results_df.copy() 
grouped_df_BACKUP = grouped_df.copy() 

In [103]:
# check from 10 to 500 in more detail
n_to_exclude = [x for x in range(10, 200, 10)] + [x for x in range(200, 520, 20)]
n_trials = 1000

results_df = process_results(
    n_to_exclude, 
    random.sample(french_example_sentences, n_trials), 
    random.sample(english_example_sentences, n_trials)
)
grouped_df = create_stats(results_df)

Processing 10
Processing 20
Processing 30
Processing 40
Processing 50
Processing 60
Processing 70
Processing 80
Processing 90
Processing 100
Processing 110
Processing 120
Processing 130
Processing 140
Processing 150
Processing 160
Processing 170
Processing 180
Processing 190
Processing 200
Processing 220
Processing 240
Processing 260
Processing 280
Processing 300
Processing 320
Processing 340
Processing 360
Processing 380
Processing 400
Processing 420
Processing 440
Processing 460
Processing 480
Processing 500


In [104]:
grouped_df.set_index('n_excluded').T

n_excluded,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,220,240,260,280,300,320,340,360,380,400,420,440,460,480,500
total_count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
correct_count,1938.0,1965.0,1973.0,1977.0,1987.0,1988.0,1990.0,1988.0,1989.0,1993.0,1992.0,1992.0,1992.0,1994.0,1994.0,1996.0,1998.0,1991.0,1991.0,1991.0,1989.0,1987.0,1986.0,1985.0,1983.0,1983.0,1979.0,1979.0,1980.0,1978.0,1977.0,1977.0,1978.0,1978.0,1979.0
wrong_count,62.0,35.0,27.0,23.0,13.0,12.0,10.0,12.0,11.0,7.0,8.0,8.0,8.0,6.0,6.0,4.0,2.0,9.0,9.0,9.0,11.0,13.0,14.0,15.0,17.0,17.0,21.0,21.0,20.0,22.0,23.0,23.0,22.0,22.0,21.0
fr_false_positive,52.0,34.0,25.0,22.0,12.0,10.0,8.0,10.0,9.0,5.0,6.0,7.0,7.0,5.0,5.0,3.0,1.0,8.0,8.0,8.0,10.0,12.0,13.0,13.0,15.0,15.0,20.0,20.0,19.0,20.0,21.0,21.0,20.0,20.0,18.0
fr_false_negative,10.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0
en_false_positive,10.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0
en_false_negative,52.0,34.0,25.0,22.0,12.0,10.0,8.0,10.0,9.0,5.0,6.0,7.0,7.0,5.0,5.0,3.0,1.0,8.0,8.0,8.0,10.0,12.0,13.0,13.0,15.0,15.0,20.0,20.0,19.0,20.0,21.0,21.0,20.0,20.0,18.0
accuracy,0.97,0.98,0.99,0.99,0.99,0.99,0.99,0.99,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99
fr_precision,0.97,0.98,0.99,0.99,0.99,0.99,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99
fr_recall,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [105]:
# all stats added together
grouped_df.set_index('n_excluded')['stats_sum']

n_excluded
10    6.87
20    6.93
30    6.95
40    6.95
50    6.97
60    6.98
70    6.98
80    6.98
90    6.98
100   6.99
110   6.98
120   6.98
130   6.98
140   6.99
150   6.99
160   6.99
170   7.00
180   6.98
190   6.98
200   6.98
220   6.98
240   6.97
260   6.97
280   6.97
300   6.97
320   6.97
340   6.96
360   6.96
380   6.96
400   6.96
420   6.95
440   6.95
460   6.96
480   6.96
500   6.96
Name: stats_sum, dtype: float64

# 170 looks best, confirm with a bigger sample and tighter hyperparameters 

In [106]:
n_to_exclude = [x for x in range(100, 205, 5)]
n_trials = 5000

results_df = process_results(
    n_to_exclude, 
    random.sample(french_example_sentences, n_trials), 
    random.sample(english_example_sentences, n_trials)
)
grouped_df = create_stats(results_df)
grouped_df.set_index('n_excluded')['stats_sum']

Processing 100
Processing 105
Processing 110
Processing 115
Processing 120
Processing 125
Processing 130
Processing 135
Processing 140
Processing 145
Processing 150
Processing 155
Processing 160
Processing 165
Processing 170
Processing 175
Processing 180
Processing 185
Processing 190
Processing 195
Processing 200


n_excluded
100   6.97
105   6.97
110   6.97
115   6.98
120   6.98
125   6.98
130   6.98
135   6.98
140   6.98
145   6.98
150   6.98
155   6.98
160   6.98
165   6.98
170   6.98
175   6.97
180   6.97
185   6.97
190   6.97
195   6.97
200   6.97
Name: stats_sum, dtype: float64

In [108]:
# looks like anything between 115 and 170 is good... try once more with more sig figs and more trials
n_to_exclude = [x for x in range(110, 180, 5)]
n_trials = 10000

results_df = process_results(
    n_to_exclude, 
    random.sample(french_example_sentences, n_trials), 
    random.sample(english_example_sentences, n_trials)
)
grouped_df = create_stats(results_df)
grouped_df.set_index('n_excluded')['stats_sum']

Processing 110
Processing 115
Processing 120
Processing 125
Processing 130
Processing 135
Processing 140
Processing 145
Processing 150
Processing 155
Processing 160
Processing 165
Processing 170
Processing 175


n_excluded
110   6.979
115   6.981
120   6.982
125   6.982
130   6.984
135   6.984
140   6.984
145   6.985
150   6.984
155   6.985
160   6.982
165   6.980
170   6.982
175   6.973
Name: stats_sum, dtype: float64

In [None]:
# best answers centre around 150, but results are very similar. choose 150 arbitrarily because it is the 'roundest' number

# Save Data For Classifier

In [109]:
english_words, french_words = generate_word_lists(150)

# save optimised word lists 
with open("language_classifier/wordlists.json", "w", encoding="utf-8") as f:
    json.dump({"en": list(english_words), "fr": list(french_words)}, f, ensure_ascii=False, indent=4)
    
# save 10k sentences per language for testing
with open("language_classifier/example_sentences.json", "w", encoding="utf-8") as f:
    json.dump({"en": english_example_sentences, "fr": french_example_sentences}, f, ensure_ascii=False, indent=4)