In [1]:
import unicodedata
import os
import json
import random
import numpy as np
import pandas as pd

from language_classifier import LanguageClassifier
    
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

with open("example_sentences.json", "r", encoding="utf-8") as f:
    sentences = json.load(f)
    

In [2]:
def evaluate_classifier(classifier, n_trials, tolerance_list):
    results = []
    
    for tolerance in tolerance_list:
        for k, v in sentences.items():
            for sentence in random.sample(v, n_trials):
                classification = classifier.classify(sentence, tolerance) 
                results.append([tolerance, k, classification, sentence])
                
    df = pd.DataFrame(results, columns=['tolerance', 'language', 'classification', 'sentence'])
    
    df['is_correct'] = df['classification'] == df['language']
    df['fr_false_positive'] = (df['classification'] == 'fr') & (df['language'] == 'en')
    df['fr_false_negative'] = (df['classification'] != 'fr') & (df['language'] == 'fr')
    df['en_false_positive'] = (df['classification'] == 'en') & (df['language'] == 'fr')
    df['en_false_negative'] = (df['classification'] != 'en') & (df['language'] == 'en')
    
    return df

In [3]:
def create_stats(results_df):
    grouped_df = results_df.groupby('tolerance').agg(
        total_count=('is_correct', 'count'),  # count rows (not the same as results_df['total_count']
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  
    
        # False Positives & False Negatives for each language
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum')
    ).reset_index()
    
    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0
    
    # Accuracy (same for both languages)
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)
    
    # Precision & Recall for French
    fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
    grouped_df['fr_precision'] = np.where(
        fr_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
        0
    )
    
    fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
    grouped_df['fr_recall'] = np.where(
        fr_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
        0
    )
    
    # Precision & Recall for English
    en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
    grouped_df['en_precision'] = np.where(
        en_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
        0
    )
    
    en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
    grouped_df['en_recall'] = np.where(
        en_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
        0
    )
    
    # F1-scores
    grouped_df['fr_f1_score'] = np.where(
        (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
        2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
        0
    )
    
    grouped_df['en_f1_score'] = np.where(
        (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
        2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
        0
    )
    
    grouped_df['stats_sum'] = grouped_df[['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']].sum(axis=1)
    
    return grouped_df


In [4]:
n = 10000
tolerances = [0, 1, 2, 3, 4, 5]

clf = LanguageClassifier()

df = evaluate_classifier(clf, n, tolerances)
grouped_df = create_stats(df)

grouped_df.T

Unnamed: 0,0,1,2,3,4,5
tolerance,0.0,1.0,2.0,3.0,4.0,5.0
total_count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
correct_count,17522.0,18787.0,18772.0,18711.0,18627.0,18523.0
wrong_count,2478.0,1213.0,1228.0,1289.0,1373.0,1477.0
fr_false_positive,19.0,21.0,20.0,20.0,19.0,19.0
fr_false_negative,2044.0,1090.0,1120.0,1182.0,1267.0,1371.0
en_false_positive,760.0,850.0,957.0,1027.0,1113.0,1217.0
en_false_negative,434.0,123.0,108.0,107.0,106.0,106.0
accuracy,0.88,0.94,0.94,0.94,0.93,0.93
fr_precision,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# looks like 1 is the optimal tolerance

In [6]:
# should avoiding false negatives be priority? (exclude more to get better quality)
#   tolerance of 1 still looks good
grouped_df[['fr_recall', 'en_recall']]

Unnamed: 0,fr_recall,en_recall
0,0.9,0.98
1,0.95,0.99
2,0.94,0.99
3,0.94,0.99
4,0.94,0.99
5,0.93,0.99


# let's take a look at some mistakes

In [7]:
# first, let's drop everything that is not tolerance == 1

df = df[df.tolerance == 1].reset_index(drop=True)

In [8]:
# how many errors vs correct?
(
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0], 
    df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0],
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0] / df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0]
 )

(1213, 18787, 0.06456592324479693)

In [9]:
df.loc[~df.is_correct, ['language', 'classification', 'sentence']]

Unnamed: 0,language,classification,sentence
185,en,fr,évaluation de létat des stocks de homard homarus americanus de la gaspésie zph et québec en
197,en,mixed,a new version of depomod newdepomod sams has recently been developed version
230,en,unknown,c at sydney figure ai
379,en,unknown,wood and pulp plantations none
572,en,fr,application de la procédure de gestion de la morue charbonnière anoplopoma fimbria de la colombiebritannique pour lannée de pêche
...,...,...,...
19925,fr,en,evaluating dewatering approaches to protect larval pacific lamprey u
19930,fr,en,reductions in distribution photosynthesis and productivity of eelgrass zostera marina associated with oyster crassostrea virginica aquaculture
19961,fr,en,changing estuaries and impacts on juvenile salmon a systematic review
19975,fr,en,densovirus associated with seastar wasting disease and mass mortality


In [10]:
# 'fr' misclassified as en is by far the most common error
df.loc[~df.is_correct, ['classification']].value_counts()

classification
en                850
unknown           241
mixed             101
fr                 21
Name: count, dtype: int64

In [11]:
df.loc[(~df.is_correct) & (df.classification == 'en')].sample(20)

Unnamed: 0,tolerance,language,classification,sentence,is_correct,fr_false_positive,fr_false_negative,en_false_positive,en_false_negative
19684,1,fr,en,cambridge university press cambridge uk,False,False,True,True,False
15603,1,fr,en,oil spills and marine mammals in british columbia canada development and application of a risk based conceptual framework,False,False,True,True,False
18894,1,fr,en,morphometry and gonad maturity of male snow crab chionoecetes opilio,False,False,True,True,False
18795,1,fr,en,caret classification and regression training,False,False,True,True,False
18852,1,fr,en,development of a monitoring program for tracking the recovery of endangered freshwater mussels in the sydenham river ontario,False,False,True,True,False
16930,1,fr,en,summary of the british columbia sablefish anoplopoma fimbria research and assessment survey,False,False,True,True,False
12334,1,fr,en,the nestucca oil spill preliminary evaluation of impacts on the west coast of vancouver island,False,False,True,True,False
15648,1,fr,en,hydrogen peroxide proposed registration document prd pesticide management regulatory agency health canada,False,False,True,True,False
15302,1,fr,en,dynamics of snow crab chionoecetes opilio movement and migration along the newfoundland and labrador and eastern barents sea continental shelves,False,False,True,True,False
12745,1,fr,en,reproductive success of jack and fullsize males in a wild coho salmon population,False,False,True,True,False


# remove incorrect words from classifiers

In [36]:
with open('scrabble_dictionaries/en_scrabble.txt', 'r') as f:
    en_scrabble = [line.lower().strip() for line in f]
    
with open('scrabble_dictionaries/fr_scrabble.txt', 'r') as f:
    fr_scrabble = [line.lower().strip() for line in f]

In [37]:
english_words = clf.english_words
french_words = clf.french_words

maybe_not_english = []
maybe_not_french = []

def remove_accents(word):
    return ''.join(c for c in unicodedata.normalize('NFKD', word) if not unicodedata.combining(c))

for word in english_words:
    if word not in en_scrabble:
        maybe_not_english.append(word)
        
for word in french_words:
    word = remove_accents(word)
    if word not in fr_scrabble:
        maybe_not_french.append(word)

In [38]:
# these seem reasonable to delete
len(maybe_not_english), len(maybe_not_french)

(83, 43)

In [41]:
english_words = [x for x in english_words if x not in maybe_not_english]
french_words = [x for x in french_words if x not in maybe_not_french]

In [42]:
# final word lists
len(english_words), len(french_words) 

(1448, 1484)

# remove bad sentences from sentences

In [44]:
french_example_sentences = sentences['fr']
english_example_sentences = sentences['en']

len(french_example_sentences), len(english_example_sentences) 

In [50]:
# remove english sentences from french_example_sentences
def clean_incorrect_sentences(list_of_sentences, n_wrong_language_max=0):
    good_sentences = []
    bad_sentences = []
    
    for sentence in list_of_sentences:
        words = sentence.split()
        n_wrong = len([x for x in words if remove_accents(x) in wrong_lang_dictionary])    
        
        if n_wrong > n_wrong_language_max:
            bad_sentences.append(sentence)
        else:
            good_sentences.append(sentence)
            
    return good_sentences, bad_sentences
        

In [51]:
french_example_sentences, bad_french = clean_incorrect_sentences(french_example_sentences, en_scrabble, 0)

In [52]:
len(french_example_sentences), len(bad_french)

(110, 9890)

In [56]:
[x for x in "évaluation du homard damerique homarus americanus dans les zones de pêche du homard à".split() if remove_accents(x) in en_scrabble]

['évaluation', 'dans', 'les', 'zones', 'de']

In [58]:
[x for x in "évaluation du homard damerique homarus americanus dans les zones de pêche du homard à".split() if x in en_scrabble]

['dans', 'les', 'zones', 'de']

In [57]:
[x for x in "évaluation du homard damerique homarus americanus dans les zones de pêche du homard à".split() if remove_accents(x) in fr_scrabble]

['évaluation',
 'du',
 'homard',
 'dans',
 'les',
 'zones',
 'de',
 'pêche',
 'du',
 'homard']

In [None]:
# save optimised word lists 
with open("wordlists.json", "w", encoding="utf-8") as f:
    json.dump({"en": list(english_words), "fr": list(french_words)}, f, ensure_ascii=False, indent=4)
    
# save 10k sentences per language for testing
with open("example_sentences.json", "w", encoding="utf-8") as f:
    json.dump({"en": random.sample(english_example_sentences, 10000), "fr": random.sample(french_example_sentences, 10000)}, f, ensure_ascii=False, indent=4)