In [1]:
import json
import os
import random
import re
import subprocess
import numpy as np
import pandas as pd

from collections import Counter

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# all files that have been downloaded and parsed
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year = 2023
parsed_files = list()
parsed_files_with_hq_ocr = list()
for folder in os.listdir(parsed_docs_folder):
    path = os.path.join(parsed_docs_folder, folder)
    if os.path.isdir(path):
        for json_file in os.listdir(path):
            if json_file.endswith(".json"):
                parsed_files.append(json_file.replace('.json', ''))
                if folder in [str(year) for year in range(min_year, 2024 + 1)]:
                    parsed_files_with_hq_ocr.append(json_file.replace('.json', ''))

# all files from website
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)
# exclude files that aren't downloaded, and files that have been withdrawn
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en.isin(parsed_files)) | (fr_eng_correlation_df.filename_fr.isin(parsed_files))]
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en != 'WITHDRAWN') & (fr_eng_correlation_df.filename_fr != 'WITHDRAWN')]

# weblinks for previewing / checking results
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# data to be used for language classifier
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.filename_fr.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_en.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_fr != lang_df.filename_en)]


# helper functions

In [3]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


In [180]:
# Make lists of all French words and all English words

valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
french_word_list = []
english_word_list = []
exclude_words_with_less_than_n = 10

# clean headers and appendices
references_fr = r'RÉFÉRENCES CITÉES'.lower()
references_en = r'REFERENCES CITED'.lower()


for i, row in lang_df.iterrows():
    fr_path, en_path = get_filepaths(row)
    
    with open(fr_path, 'r', encoding='utf-8') as file:
        fr_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_fr, fr_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            fr_text = parts[1]
        
        french_word_list.extend(word for word in fr_text.split() if valid_word_regex.match(word))
    
    with open(en_path, 'r', encoding='utf-8') as file:
        en_text = json.load(file).get('text', '').lower()
            
        parts = re.split(references_en, en_text, flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
            en_text = parts[1]
        
        english_word_list.extend(word for word in en_text.split() if valid_word_regex.match(word))
        
# For testing
french_word_counts = Counter(french_word_list)
french_word_counts_expanded = []
for word, count in french_word_counts.items():
    for _ in range(count):
        french_word_counts_expanded.append((word, count))
        
english_word_counts = Counter(english_word_list)
english_word_counts_expanded = []
for word, count in english_word_counts.items():
    for _ in range(count):
        english_word_counts_expanded.append((word, count))

full_french_word_list = french_word_list.copy()
full_english_word_list = english_word_list.copy()

# Remove words with less than 10 occurrences
french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]

# Convert to sets for further processing
french_words = set(french_word_list)
english_words = set(english_word_list)

# Remove overlapping words
overlapping_words = english_words & french_words
english_words.difference_update(overlapping_words)
french_words.difference_update(overlapping_words)

# Remove numeric-only words
english_words.difference_update({w for w in english_words if w.isnumeric()})
french_words.difference_update({w for w in french_words if w.isnumeric()})


In [226]:
# helper functions for word lists

def test_wordlists(text_block, english_words, french_words):
    en_count = sum(1 for word in text_block.split() if word in english_words)
    fr_count = sum(1 for word in text_block.split() if word in french_words)
    
    print('english words:', list(word for word in text_block.split() if word in english_words))
    print('french words:', list(word for word in text_block.split() if word in french_words))
    print(f'{en_count=}, {fr_count=}')    
    
def most_common_word_info(counter_obj, n=10):  
    length = counter_obj.total()
    c_v = 0
    for k, v in counter_obj.most_common(n):
        rng = f'({(100 * c_v) / length:.0f}%'
        c_v += v
        rng += f'-{(100 * c_v) / length:.0f}%)'
        print(f'{k:<20}{v:>8}{(100 * v) / length:>8.0f}%{rng:>15}')


In [236]:
test_wordlists('Total mortalities at age, based on survey data, are presented in Table', english_words, french_words)
print()
most_common_word_info(english_word_counts, 5)
print()
most_common_word_info(french_word_counts, 5)

english words: ['mortalities', 'presented']
french words: []
en_count=2, fr_count=0

the                   168894       7%        (0%-7%)
of                     94029       4%       (7%-11%)
and                    93778       4%      (11%-15%)
in                     72480       3%      (15%-19%)
to                     57614       2%      (19%-21%)

de                    237366       8%        (0%-8%)
la                    140133       5%       (8%-13%)
et                    100646       4%      (13%-17%)
les                   100095       4%      (17%-21%)
des                    97138       3%      (21%-24%)


In [404]:
# * what is the nth percentile within the Counter obj word_counts? eg, for the 95th percentile, check 95% of the way down word_counts and see how many times that word occured.
# * what is the nth percentile - weighted by occurrence? eg, for the 95th percentile, maybe 95% of words occur at least 100 times. this should be based on this code snippet:
#     expanded_counts = []
#     for word, count in word_counts.items():
#         expanded_counts.extend([count] * count)
# * how many words are in the top nth percentile of entries in word_counts (a Counter obj)? ie, how many entries in word_counts have a value of at least the nth percentile (unweighted) calcualted above?
# * how many words are in the top nth percentile words based on occurrence, based on this function:
#     expanded_counts = []
#     for word, count in word_counts.items():
#         expanded_counts.extend([count] * count)
# ie, if you added every word to a list the number of times it happens in word counts, and made a histogram, how many different distinct words would fall above the nth percentile
# * what are examples of words within a tolerance of the unweighted percentile?
# * what are examples of words within a tolerance of the weighted percentile?


def nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return sorted_list[index]

def count_nth_percentile(p, counter_obj, greater_than=True):
    sorted_list = sorted(counter_obj.items(), key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    return len(sorted_list[index:]) if greater_than else len(sorted_list[-max(index, 1):])

def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    count_if_gte = sorted_list[len(sorted_list) - index][1]
    count_if_lte = sorted_list[index][1]
        
    gte = {x for x in counter_expanded if x[1] >= count_if_gte}
    lte = {x for x in counter_expanded if x[1] <= count_if_lte}
    
    return len(gte) if greater_than else len(lte)

def examples_at_word_count(n, counter_obj, n_samples=5, tolerance=0.1):
    lower, upper = int((1 - 0.1) * n), int((1 + 0.1) * n)
    all_examples = [k for k, v in counter_obj.items() if lower <= v <= upper]
    
    return random.sample(all_examples, min(n_samples, len(all_examples)))
    
    

In [422]:
n = 300
tolerance = 0.1
n_samples = 5
counter_obj = english_word_counts

lower, upper = int((1 - 0.1) * n), int((1 + 0.1) * n)
all_examples = [k for k, v in counter_obj.items() if lower <= v <= upper]

r_trn = random.sample(all_examples, min(n_samples, len(all_examples)))
r_trn

['efforts', 'discards', 'trout', 'narw', 'appear']

In [413]:
[x for x in counter_obj][0]

'canadian'

In [410]:
most_common_word_info(english_word_counts, 15)

the                   168894       7%        (0%-7%)
of                     94029       4%       (7%-11%)
and                    93778       4%      (11%-15%)
in                     72480       3%      (15%-19%)
to                     57614       2%      (19%-21%)
for                    36588       2%      (21%-23%)
a                      34697       2%      (23%-24%)
is                     24125       1%      (24%-25%)
that                   19399       1%      (25%-26%)
from                   18371       1%      (26%-27%)
are                    18228       1%      (27%-28%)
on                     17458       1%      (28%-28%)
was                    16979       1%      (28%-29%)
with                   15689       1%      (29%-30%)
as                     15356       1%      (30%-30%)


In [409]:
p = 0.7

print('greater_than', p)
count_nth_percentile_weighted(p, english_word_counts_expanded, True)

# print('less_than', p)
# count_nth_percentile_weighted(p, english_word_counts_expanded, False)

greater_than 0.7


15

In [403]:
counter_expanded = english_word_counts_expanded
greater_than = True
p = 0.9

sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)

count_if_gte = sorted_list[len(sorted_list) - index][1]
count_if_lte = sorted_list[index][1]
    
gte = {x for x in counter_expanded if x[1] >= count_if_gte}
lte = {x for x in counter_expanded if x[1] <= count_if_lte}

r_trn = len(gte) if greater_than else len(lte)

print(index)
print(count_if_gte)
print(count_if_lte)
print(list(gte)[:5])
print(list(lte)[:5])

print(r_trn)

2080718
94029
99
[('the', 168894), ('of', 94029)]
[('smus', 4), ('malma', 3), ('ornamental', 78), ('fishermen', 59), ('aphyocharax', 1)]
2


In [390]:
index, len(counter_expanded)

(2288789, 2311909)

In [382]:
sorted(counter_expanded, key=lambda x: x[1], reverse=True)

[('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 168894),
 ('the', 1

In [366]:
def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    print(index)
    print(sorted_list[index])
    
    n_at_nth = sorted_list[index][1]
    
    print(n_at_nth)
        
    gte = {x for x in counter_expanded if x[1] >= n_at_nth}
    lte = {x for x in counter_expanded if x[1] <= n_at_nth}
    
    print('occur more than', n_at_nth, random.sample(list(gte), min(5, len(gte))))
    print('occur less than', n_at_nth, random.sample(list(lte), min(5, len(lte))))
    
    print('number more than', n_at_nth, len(gte))
    print('number less than', n_at_nth, len(lte))
        
    return len(gte) if greater_than else len(lte)

p = 0.9999

print('greater_than', p)
count_nth_percentile_weighted(p, english_word_counts_expanded, True)

# print('less_than', p)
# count_nth_percentile_weighted(p, english_word_counts_expanded, False)


greater_than 0.9999
2311677
('zodiacs', 1)
1
occur more than 1 [('certainty', 130), ('prioritized', 25), ('analyzes', 7), ('carrier', 8), ('milipsigate', 6)]
occur less than 1 [('tatara', 1), ('acacienne', 1), ('trimean', 1), ('rpm', 1), ('thereof', 1)]
number more than 1 25194
number less than 1 7481


25194

number more than 1 25194
number less than 1 7481
number more than 168894 1
number less than 168894 25194


(25194, 25194)

In [349]:
count_nth_percentile(1, english_word_counts, True), count_nth_percentile(1, english_word_counts, False)
# gte 100%, lte 100%

(1, 25193)

In [347]:
# greater than 100th percentile (only top entry)? or top 100th percentile (all entries)?

def count_nth_percentile_weighted(p, counter_expanded, greater_than=True):
    sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
    index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0)
    
    print(sorted_list[index])
    n_at_nth = sorted_list[index][1]
    print(n_at_nth)
        
    gte = {x for x in counter_expanded if x[1] >= n_at_nth}
    lte = {x for x in counter_expanded if x[1] <= n_at_nth}
    
    print('occur more than', n_at_nth, random.sample(list(gte), 5))
    print('occur less than', n_at_nth, random.sample(list(lte), 5))
    
    print('number more than', n_at_nth, len(gte))
    print('number less than', n_at_nth, len(lte))
        
    return len(gte) if greater_than else len(lte)

p = 1
count_nth_percentile_weighted(p, english_word_counts_expanded, True)

('hardware', 1)
1
occur more than 1 [('averse', 1), ('chsrg', 12), ('advises', 2), ('aurelia', 4), ('seizures', 1)]
occur less than 1 [('bugs', 1), ('fenced', 1), ('crivelli', 1), ('interrupt', 1), ('gaskin', 1)]
number more than 1 25194
number less than 1 7481


25194

In [285]:
counter_expanded = english_word_counts_expanded
greater_than=True

sorted_list = sorted(counter_expanded, key=lambda x: x[1], reverse=greater_than)
print(sorted_list[0])
index = max(min(len(sorted_list) - 1, int(len(sorted_list) * p)), 0) # backwards?
print(index)
print(sorted_list[index])
n_at_nth = sorted_list[index][1]
print(n_at_nth)
gte = {x for x in counter_expanded if x[1] >= n_at_nth}
lte = {x for x in counter_expanded if x[1] <= n_at_nth}

('the', 168894)
2311908
('hardware', 1)
1


In [270]:
n_at_nth = 2571
len({x for x in english_word_counts_expanded if x[1] > n_at_nth})

108

In [244]:
len(english_word_counts)

25194

In [261]:
english_word_counts.total()

2311909

0 0


25194

In [238]:
p = 0.1
nth_percentile(p, english_word_counts), nth_percentile(p, english_word_counts, False)

(('endemism', 87), ('terebratulina', 1))

In [239]:
p = 0.1
nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False)

(('of', 94029), ('adolescent', 99))

In [241]:
p = 0.95
nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False)

(('cobble', 34), ('the', 168894))

In [240]:
most_common_word_info(english_word_counts, 5)
print()
most_common_word_info(french_word_counts, 5)

the                   168894       7%        (0%-7%)
of                     94029       4%       (7%-11%)
and                    93778       4%      (11%-15%)
in                     72480       3%      (15%-19%)
to                     57614       2%      (19%-21%)

de                    237366       8%        (0%-8%)
la                    140133       5%       (8%-13%)
et                    100646       4%      (13%-17%)
les                   100095       4%      (17%-21%)
des                    97138       3%      (21%-24%)


In [215]:
nth_percentile_weighted(.2, english_word_counts_expanded)

462381


('to', 57614)

In [224]:
p = 0.05
nth_percentile_weighted(p, english_word_counts_expanded), nth_percentile_weighted(p, english_word_counts_expanded, False)

115595
115595


(('the', 168894), ('conductivity', 34))

In [117]:
# TODO: update
def calc_percentile(n, word_counts, greater_than=True):
    n_least_common = int(len(word_counts) * n)
    less_than_n = word_counts.most_common()[::-1][:n_least_common][::-1]
    greater_than_n = word_counts.most_common()[:n_least_common][::-1]
    if greater_than:
        nth_percentile = greater_than_n[0][1]
    else:
        nth_percentile = less_than_n[0][1]

    print(f"{n*100:0.0f}th percentile: {nth_percentile}")

    expanded_counts = []
    for word, count in word_counts.items():
        expanded_counts.extend([count] * count)
    expanded_counts = sorted(expanded_counts) if greater_than else sorted(expanded_counts, reverse=True)
    weighted_nth_percentile = int(np.percentile(expanded_counts, n))
    print(f"Weighted {n*100:0.0f}th percentile: {weighted_nth_percentile}")

    # examples
    print('EXAMPLES')
    n_examples = 5
    tolerance_for_examples = 1

    lower, upper = nth_percentile * (1 - tolerance_for_examples), nth_percentile * (1 + tolerance_for_examples)
    examples = [x[0] for x in greater_than_n if lower < x[1] < upper] if greater_than else [x[0] for x in less_than_n if lower < x[1] < upper]
    list_of_n_examples = random.sample(examples, n_examples)

    lower_weighted, upper_weighted = weighted_nth_percentile * (1 - tolerance_for_examples), weighted_nth_percentile * (1 + tolerance_for_examples)
    examples_weighted = [k for k, v in word_counts.items() if lower_weighted < v < upper_weighted]
    list_of_n_examples_weighted = random.sample(examples_weighted, n_examples)

    print('number of words greater than nth percentile:\t\t\t', len(examples))
    print('number of words greater than nth percentile (weighted):\t', len(examples_weighted))

    print(f"\t{n*100:0.0f}th percentile word examples: \t\t\t\t{', '.join(list_of_n_examples)}")
    print(f"\tWeighted {n*100:0.0f}th percentile word examples: \t{', '.join(list_of_n_examples_weighted)}")


calc_percentile(0.95, french_word_counts)

95th percentile: 1
Weighted 95th percentile: 4
EXAMPLES
number of words greater than nth percentile:			 8150
number of words greater than nth percentile (weighted):	 20924
	95th percentile word examples: 				réarrangements, krause, yellowfin, reconnaissions, maceina
	Weighted 95th percentile word examples: 	maculatus, miriam, rotations, fair, jackie


In [118]:
calc_percentile(0.05, french_word_counts)

5th percentile: 172
Weighted 5th percentile: 1
EXAMPLES
number of words greater than nth percentile:			 711
number of words greater than nth percentile (weighted):	 9736
	5th percentile word examples: 				degré, cétacés, effectuées, sauf, combinaison
	Weighted 5th percentile word examples: 	ensure, cascapédia, obtained, dépendît, retablissement


In [119]:
calc_percentile(0.5, french_word_counts)

50th percentile: 3
Weighted 50th percentile: 2
EXAMPLES
number of words greater than nth percentile:			 3428
number of words greater than nth percentile (weighted):	 16409
	50th percentile word examples: 				codage, mucus, doyle, êtres, peregrinum
	Weighted 50th percentile word examples: 	subsisterait, boulva, rhinichthys, dimorphism, chou


In [121]:
def calc_percentile(n, word_counts):
    total_words = len(word_counts)
    n_index = int(total_words * (1 - n))
    nth_percentile = sorted(word_counts.values(), reverse=True)[n_index]

    expanded_counts = []
    for word, count in word_counts.items():
        expanded_counts.extend([count] * count)
    weighted_nth_percentile = int(np.percentile(expanded_counts, n * 100))

    num_words_above_percentile = sum(1 for count in word_counts.values() if count >= nth_percentile)
    num_words_above_weighted_percentile = sum(1 for word, count in word_counts.items() if count >= weighted_nth_percentile)

    tolerance_for_examples = 0.1
    lower, upper = nth_percentile * (1 - tolerance_for_examples), nth_percentile * (1 + tolerance_for_examples)
    examples = [word for word, count in word_counts.items() if lower <= count <= upper]

    lower_weighted, upper_weighted = weighted_nth_percentile * (1 - tolerance_for_examples), weighted_nth_percentile * (1 + tolerance_for_examples)
    examples_weighted = [word for word, count in word_counts.items() if lower_weighted <= count <= upper_weighted]

    n_examples = 5
    list_of_n_examples = random.sample(examples, min(n_examples, len(examples))) if examples else []
    list_of_n_examples_weighted = random.sample(examples_weighted, min(n_examples, len(examples_weighted))) if examples_weighted else []

    print(f"{n*100:0.0f}th percentile: {nth_percentile}")
    print(f"Weighted {n*100:0.0f}th percentile: {weighted_nth_percentile}")
    print(f"Number of words above nth percentile: {num_words_above_percentile}")
    print(f"Number of words above weighted nth percentile: {num_words_above_weighted_percentile}")
    print(f"Examples within {tolerance_for_examples*100:.0f}% of nth percentile: {', '.join(list_of_n_examples) if list_of_n_examples else 'None'}")
    print(f"Examples within {tolerance_for_examples*100:.0f}% of weighted nth percentile: {', '.join(list_of_n_examples_weighted) if list_of_n_examples_weighted else 'None'}")

    
calc_percentile(0.95, french_word_counts)

95th percentile: 172
Weighted 95th percentile: 237366
Number of words above nth percentile: 1586
Number of words above weighted nth percentile: 1
Examples within 10% of nth percentile: future, naturelles, aquaculture, décrits, biorégion
Examples within 10% of weighted nth percentile: de


In [122]:
calc_percentile(0.05, french_word_counts)

5th percentile: 1
Weighted 5th percentile: 32
Number of words above nth percentile: 31713
Number of words above weighted nth percentile: 4942
Examples within 10% of nth percentile: delphniapterus, marchent, sado, kenai, your
Examples within 10% of weighted nth percentile: solide, retirées, numéros, souplesse, déclarations


In [123]:
calc_percentile(0.5, french_word_counts)

50th percentile: 3
Weighted 50th percentile: 5866
Number of words above nth percentile: 17762
Number of words above weighted nth percentile: 46
Examples within 10% of nth percentile: normalité, préconisé, hyperplasie, hamley, présenteraient
Examples within 10% of weighted nth percentile: se, ne, relevé, cette, stock
