In [15]:
import os
import json
import random
import numpy as np
import pandas as pd

from language_classifier import LanguageClassifier
    
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

with open("example_sentences.json", "r", encoding="utf-8") as f:
    sentences = json.load(f)
    

In [2]:
def evaluate_classifier(classifier, n_trials, tolerance_list):
    results = []
    
    for tolerance in tolerance_list:
        for k, v in sentences.items():
            for sentence in random.sample(v, n_trials):
                classification = classifier.classify(sentence, tolerance) 
                results.append([tolerance, k, classification, sentence])
                
    df = pd.DataFrame(results, columns=['tolerance', 'language', 'classification', 'sentence'])
    
    df['is_correct'] = df['classification'] == df['language']
    df['fr_false_positive'] = (df['classification'] == 'fr') & (df['language'] == 'en')
    df['fr_false_negative'] = (df['classification'] != 'fr') & (df['language'] == 'fr')
    df['en_false_positive'] = (df['classification'] == 'en') & (df['language'] == 'fr')
    df['en_false_negative'] = (df['classification'] != 'en') & (df['language'] == 'en')
    
    return df

In [3]:
def create_stats(results_df):
    grouped_df = results_df.groupby('tolerance').agg(
        total_count=('is_correct', 'count'),  # count rows (not the same as results_df['total_count']
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  
    
        # False Positives & False Negatives for each language
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum')
    ).reset_index()
    
    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0
    
    # Accuracy (same for both languages)
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)
    
    # Precision & Recall for French
    fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
    grouped_df['fr_precision'] = np.where(
        fr_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
        0
    )
    
    fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
    grouped_df['fr_recall'] = np.where(
        fr_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
        0
    )
    
    # Precision & Recall for English
    en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
    grouped_df['en_precision'] = np.where(
        en_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
        0
    )
    
    en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
    grouped_df['en_recall'] = np.where(
        en_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
        0
    )
    
    # F1-scores
    grouped_df['fr_f1_score'] = np.where(
        (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
        2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
        0
    )
    
    grouped_df['en_f1_score'] = np.where(
        (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
        2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
        0
    )
    
    grouped_df['stats_sum'] = grouped_df[['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']].sum(axis=1)
    
    return grouped_df


In [8]:
n = 10000
tolerances = [0, 1, 2, 3, 4, 5]

clf = LanguageClassifier()

df = evaluate_classifier(clf, n, tolerances)
grouped_df = create_stats(df)

grouped_df.T

Unnamed: 0,0,1,2,3,4,5
tolerance,0.0,1.0,2.0,3.0,4.0,5.0
total_count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
correct_count,17522.0,18787.0,18772.0,18711.0,18627.0,18523.0
wrong_count,2478.0,1213.0,1228.0,1289.0,1373.0,1477.0
fr_false_positive,19.0,21.0,20.0,20.0,19.0,19.0
fr_false_negative,2044.0,1090.0,1120.0,1182.0,1267.0,1371.0
en_false_positive,760.0,850.0,957.0,1027.0,1113.0,1217.0
en_false_negative,434.0,123.0,108.0,107.0,106.0,106.0
accuracy,0.8761,0.93935,0.9386,0.93555,0.93135,0.92615
fr_precision,0.998917,0.998883,0.998936,0.998932,0.998981,0.998975


In [5]:
# looks like 1 is the optimal tolerance

In [58]:
# should avoiding false negatives be priority? (exclude more to get better quality)
#   tolerance of 1 still looks good
grouped_df[['fr_recall', 'en_recall']]

Unnamed: 0,fr_recall,en_recall
0,0.9,0.98
1,0.95,0.99
2,0.94,0.99
3,0.94,0.99
4,0.94,0.99
5,0.93,0.99


# let's take a look at some mistakes

In [24]:
# first, let's drop everything that is not tolerance == 1

df = df[df.tolerance == 1].reset_index(drop=True)

In [26]:
# how many errors vs correct?
(
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0], 
    df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0],
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0] / df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0]
 )

(1213, 18787, 0.06456592324479693)

In [27]:
df.loc[~df.is_correct, ['language', 'classification', 'sentence']]

Unnamed: 0,language,classification,sentence
217,en,fr,spatial interpolations calcium and ph
262,en,unknown,the expedition highlights the capabilities of rovs figure
593,en,unknown,van beveren stock structure a
599,en,unknown,b fourbeard rockling enchelyopus cimbrius fig
640,en,unknown,redefining the oceanic distribution of atlantic salmon
...,...,...,...
19967,fr,en,california sea lion abundance estimation in canada
19972,fr,en,identifying a limit reference point for striped shrimp pandalus montagui in shrimp fishing area using a multiindicator approach
19989,fr,en,spatial variation in life history characteristics of waved whelk buccinum undatum l
19990,fr,en,cosewic assessment and update status report on the steller sea lion eumetopias jubatus in canada


In [28]:
# 'fr' misclassified as en is by far the most common error
df.loc[~df.is_correct, ['classification']].value_counts()

classification
en                850
unknown           241
mixed             101
fr                 21
Name: count, dtype: int64

In [64]:
df.loc[(~df.is_correct) & (df.classification == 'en')].sample(20)

Unnamed: 0,tolerance,language,classification,sentence,is_correct,fr_false_positive,fr_false_negative,en_false_positive,en_false_negative
16185,1,fr,en,killer whales the natural history and genealogy of orcinus orca in british columbia and washington state,False,False,True,True,False
15652,1,fr,en,lawrence areas e f and in and advice for the fishery,False,False,True,True,False
19216,1,fr,en,impacts of organic enrichment from finfish aquaculture on seagrass beds and associated macroinfaunal communities in atlantic canada,False,False,True,True,False
10742,1,fr,en,proceedings of the national advisory meeting on conservation translocations of saralisted freshwater fishes and mussels october,False,False,True,True,False
16963,1,fr,en,distribution of killer whale pods in prince william sound alaska over a thirteenyear period,False,False,True,True,False
17532,1,fr,en,lawrence from an aerial survey conducted in june,False,False,True,True,False
14908,1,fr,en,photoacclimation and light thresholds for cold temperate seagrasses,False,False,True,True,False
14533,1,fr,en,scallop production areas in the bay of fundy stock status for and forecast for,False,False,True,True,False
17571,1,fr,en,effects of nutrient enrichment in the nations estuaries a decade of change,False,False,True,True,False
17478,1,fr,en,precocial male maturation contributes to the introgression of farmed atlantic salmon into wild populations,False,False,True,True,False


In [39]:
# this implies that we need to remove french words from the english word_list
incorrectly_en = df.loc[(~df.is_correct) & (df.classification == 'en'), 'sentence'].to_list()

In [46]:
remove_from_en_wordlist = []

for sentence in incorrectly_en:
    remove_from_en_wordlist.extend([x for x in clf.english_words if x in sentence.split()])

In [49]:
remove_from_en_wordlist = sorted(list(set(remove_from_en_wordlist)))

### it looks like a bunch of fr sentences are actually english, which is the problem. and the clf is working as expected

In [52]:
random.sample(incorrectly_en, 10)

['rosenfeld amiro bowlby et al',
 'sous presse czich et al',
 'moccia et bevan bannister et al',
 'vi liste des figures figure',
 'risk assessment for two solitary and three colonial tunicates in both atlantic and pacific canadian waters',
 'status of white sturgeon acipenser transmontanus richardson throughout the species range threats to survival and prognosis for the future',
 'abundanceoccupancy patterns in a riverine fish assemblage',
 'sablefish anoplopoma fimbria observed from a manned submersible',
 'geographical patterns of straying of fall chinook salmon oncorhynchus tshawytscha walbaum from columbia river usa hatcheries',
 'cole et newton bladon et al']

In [57]:
str(random.sample(remove_from_en_wordlist, 100))

"['snow', 'bank', 'scallop', 'differences', 'conducted', 'appendix', 'october', 'beluga', 'can', 'link', 'incorporating', 'pacifc', 'licence', 'scientific', 'rules', 'identification', 'evaluation', 'parameters', 'survey', 'thresholds', 'shoreline', 'nafo', 'observed', 'including', 'recovery', 'values', 'invasive', 'june', 'lobster', 'adaptive', 'pandalus', 'evaluating', 'interim', 'targets', 'physical', 'northeast', 'requires', 'march', 'baseline', 'geographic', 'sustainable', 'mammals', 'recent', 'like', 'ontario', 'trajectory', 'concern', 'potential', 'exploratory', 'brunswick', 'spawner', 'dive', 'independent', 'sponge', 'ad', 'method', 'is', 'resource', 'forward', 'reproductive', 'sfa', 'consumption', 'represents', 'frequency', 'current', 'seamount', 'effect', 'herring', 'decision', 'two', 'organization', 'february', 'banks', 'response', 'setting', 'silver', 'better', 'among', 'during', 'light', 'coastal', 'monitoring', 'assumptions', 'life', 'nova', 'final', 'vi', 'indicator', 'ha

In [None]:
# that could imply that a bunch of english words made it into the french list
# TODO: check all words with off the shelf package, consider deleting words if misclassified

## do this

There are several Python packages for detecting the language of individual words. The best choice depends on your needs (speed, accuracy, support for short words). Here are the top options:

1️⃣ langid (Lightweight & Fast)
✅ Pros:

Fast, lightweight, and works offline.
Supports many languages.
Can be trained for better accuracy.
🚀 Install:

bash
Copy
Edit
pip install langid
📝 Usage:


```py
import langid

print(langid.classify("bonjour"))  # Output: ('fr', -54.171) → French
print(langid.classify("hello"))    # Output: ('en', -26.325) → English
```
🔹 Best for: Short words, speed, and offline use.

2️⃣ langdetect (Google’s Language Detector)
✅ Pros:

Works well for short words.
Multi-language support.
Can return probabilities for multiple languages.
🚀 Install:

bash
Copy
Edit
pip install langdetect
📝 Usage:


```py
from langdetect import detect, detect_langs

print(detect("bonjour"))  # Output: 'fr' → French
print(detect("hello"))    # Output: 'en' → English
print(detect_langs("bonjour"))  # Output: [fr:0.99] → Language with probability
```
🔹 Best for: General use, probability-based detection.

3️⃣ pycld2 (Google’s Compact Language Detector v2)
✅ Pros:

More accurate than langid.
Provides confidence scores.
Supports multiple languages.
🚀 Install:

bash
Copy
Edit
pip install pycld2
📝 Usage:


```py
import pycld2 as cld2

_, _, details = cld2.detect("bonjour", returnVectors=True)
print(details)  # Output: [('FRENCH', 'fr', 98.0)] → Language with confidence score
```
🔹 Best for: Accuracy & multilingual support.

4️⃣ fasttext (Best for Word-Level Language Detection)
✅ Pros:

Very accurate for single words.
Uses a pre-trained language model (from Facebook).
Supports 170+ languages.
🚀 Install:

bash
Copy
Edit
pip install fasttext
📝 Usage:

```py
import fasttext

model = fasttext.load_model("lid.176.bin")  # Download from https://fasttext.cc/docs/en/language-identification.html

print(model.predict("bonjour"))  # Output: ('__label__fr', 0.99) → French
print(model.predict("hello"))    # Output: ('__label__en', 0.99) → English
```
🔹 Best for: High accuracy for individual words.

🔹 Which One Should You Use?
Package	Speed	Accuracy for Single Words	Supports Short Text?	Offline?
langid	✅ Fast	✅ Decent	❌ Weak for short words	✅ Yes
langdetect	✅ Fast	✅ Decent	✅ Better for short words	✅ Yes
pycld2	⏳ Slower	✅✅ Good	✅ Works for short words	✅ Yes
fasttext	⏳ Slower	✅✅✅ Best	✅ Best for single words	✅ Yes
🔹 Final Recommendation
If you need fast & lightweight → Use langid.
If you want probability-based detection → Use langdetect.
If you need better accuracy for short words → Use pycld2.
If you need the best accuracy for individual words → Use fasttext.
Would you like a custom function that combines the best of these approaches? 🚀








In [4]:
# are you joking!!!!??
import langid

print(langid.classify("bonjour"))

('en', -22.992373943328857)


In [7]:
# so, is the word "bonjour" english or croatian?

from langdetect import detect, detect_langs

print(detect("bonjour"))

hr


# give up again on existing packages and use scrabble dictionaries

In [20]:
with open(os.path.abspath('scrabble_dictionaries/en_scrabble.txt'), 'r') as f:
    en_scrabble = [line.strip() for line in f]
    
with open(os.path.abspath('scrabble_dictionaries/fr_scrabble.txt'), 'r') as f:
    fr_scrabble = [line.strip() for line in f]

In [None]:
maybe_not_english = []
maybe_not_french = []
