In [1]:
import unicodedata
import os
import json
import random
import string
import numpy as np
import pandas as pd

from language_classifier import LanguageClassifier
from faker import Faker

faker_en = Faker('en_US')
faker_fr = Faker('fr_FR')
faker_es = Faker('es_ES')
    
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

with open("example_sentences.json", "r", encoding="utf-8") as f:
    sentences = json.load(f)
    

In [2]:
def evaluate_classifier(classifier, sentences, n_trials, tolerance_list):
    results = []

    for tolerance in tolerance_list:
        for k, v in sentences.items():
            for sentence in random.sample(v, n_trials):
                classification = classifier.classify(sentence, tolerance) 
                results.append([tolerance, k, classification, sentence])

    df = pd.DataFrame(results, columns=['tolerance', 'language', 'classification', 'sentence'])

    df['is_correct'] = df['classification'] == df['language']
    df['fr_false_positive'] = (df['classification'] == 'fr') & (df['language'] == 'en')
    df['fr_false_negative'] = (df['classification'] != 'fr') & (df['language'] == 'fr')
    df['en_false_positive'] = (df['classification'] == 'en') & (df['language'] == 'fr')
    df['en_false_negative'] = (df['classification'] != 'en') & (df['language'] == 'en')

    return df

In [3]:
def create_stats(results_df):
    grouped_df = results_df.groupby('tolerance').agg(
        total_count=('is_correct', 'count'),  # count rows (not the same as results_df['total_count']

        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  

        # False Positives & False Negatives for each language
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum')
    ).reset_index()

    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0

    # Accuracy (same for both languages)
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)

    # Precision & Recall for French
    fr_precision_mask = (grouped_df['correct_count'] + grouped_df['fr_false_positive']) > 0
    grouped_df['fr_precision'] = np.where(
        fr_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_positive']), 
        0
    )

    fr_recall_mask = (grouped_df['correct_count'] + grouped_df['fr_false_negative']) > 0
    grouped_df['fr_recall'] = np.where(
        fr_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['fr_false_negative']), 
        0
    )

    # Precision & Recall for English
    en_precision_mask = (grouped_df['correct_count'] + grouped_df['en_false_positive']) > 0
    grouped_df['en_precision'] = np.where(
        en_precision_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_positive']), 
        0
    )

    en_recall_mask = (grouped_df['correct_count'] + grouped_df['en_false_negative']) > 0
    grouped_df['en_recall'] = np.where(
        en_recall_mask, 
        grouped_df['correct_count'] / (grouped_df['correct_count'] + grouped_df['en_false_negative']), 
        0
    )

    # F1-scores
    grouped_df['fr_f1_score'] = np.where(
        (grouped_df['fr_precision'] + grouped_df['fr_recall']) > 0,
        2 * (grouped_df['fr_precision'] * grouped_df['fr_recall']) / (grouped_df['fr_precision'] + grouped_df['fr_recall']),
        0
    )

    grouped_df['en_f1_score'] = np.where(
        (grouped_df['en_precision'] + grouped_df['en_recall']) > 0,
        2 * (grouped_df['en_precision'] * grouped_df['en_recall']) / (grouped_df['en_precision'] + grouped_df['en_recall']),
        0
    )

    # Sum of statistics (optional, useful for ranking)
    grouped_df['recall_avg'] = grouped_df[['fr_recall', 'en_recall']].mean(axis=1)
    grouped_df['precision_avg'] = grouped_df[['fr_precision', 'en_precision']].mean(axis=1)
    grouped_df['f1_score_avg'] = grouped_df[['fr_f1_score', 'en_f1_score']].mean(axis=1)
    grouped_df['all_stats_avg'] = grouped_df[['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']].mean(axis=1)

    return grouped_df


In [4]:
n = 10000
tolerances = [0, 1, 2, 3, 4, 5]

clf = LanguageClassifier()

df = evaluate_classifier(clf, sentences, n, tolerances)
grouped_df = create_stats(df)

grouped_df.T

Unnamed: 0,0,1,2,3,4,5
tolerance,0.0,1.0,2.0,3.0,4.0,5.0
total_count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
correct_count,18683.0,19872.0,19890.0,19850.0,19804.0,19700.0
wrong_count,1317.0,128.0,110.0,150.0,196.0,300.0
fr_false_positive,2.0,6.0,2.0,7.0,3.0,4.0
fr_false_negative,988.0,69.0,56.0,88.0,150.0,253.0
en_false_positive,1.0,6.0,39.0,76.0,141.0,242.0
en_false_negative,329.0,59.0,54.0,62.0,46.0,47.0
accuracy,0.93,0.99,0.99,0.99,0.99,0.98
fr_precision,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# looks like 1-3 is the optimal tolerance range
grouped_df[['all_stats_avg', 'f1_score_avg']]

Unnamed: 0,all_stats_avg,f1_score_avg
0,0.98,0.98
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,0.99,1.0
5,0.99,0.99


In [6]:
# should avoiding false negatives be priority? (exclude more to get better quality)
#   tolerance range of 1-3 still looks good
grouped_df[['fr_recall', 'en_recall', 'recall_avg']]

Unnamed: 0,fr_recall,en_recall,recall_avg
0,0.95,0.98,0.97
1,1.0,1.0,1.0
2,1.0,1.0,1.0
3,1.0,1.0,1.0
4,0.99,1.0,1.0
5,0.99,1.0,0.99


# let's take a look at some mistakes

In [7]:
# first, let's drop everything that is not tolerance == 1

df = df[df.tolerance == 1].reset_index(drop=True)

In [8]:
# how many errors vs correct?
(
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0], 
    df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0],
    df.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0] / df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0]
 )

(128, 19872, 0.00644122383252818)

In [9]:
df.loc[~df.is_correct, ['language', 'classification', 'sentence']]

Unnamed: 0,language,classification,sentence
103,en,mixed,dispersion dispersion is complex and the rates depend on many factors water velocity shears i
572,en,unknown,status of the pacific whitesided dolphin lagenorhynchus obliquidens in canada
841,en,unknown,wildland hydrology pagosa springs colorado
972,en,unknown,war civil unrest and military exercises
1147,en,unknown,cacsassccs assessment of the northern gulf of st
...,...,...,...
19649,fr,mixed,une allocation de tonnes est prévue pour la composante du banc de georges mais aucun débarquement na été déclaré pour
19765,fr,en,et des connaissances traditionnelles des inuit inuit qaujimajatuqangit lewis et al
19893,fr,mixed,le long des voies migratoires empruntées par plusieurs populations de march et maiers de march et al
19899,fr,en,liparis liparis liparis bathyarcticus limace nébuleuse à c mecklenburg et al


In [10]:
df.loc[~df.is_correct, ['classification']].value_counts()

classification
mixed             68
unknown           48
en                 6
fr                 6
Name: count, dtype: int64

# Generate Fake Sentences - Test the Classifier

In [11]:
def generate_non_alphabetic_sentence(length):
    characters = string.digits + string.punctuation + " " * 30 + "|" * 5
    sentence = "".join(random.choices(characters, k=length))
    sentence = sentence.replace(" ", " " * random.randint(2, 4))
    return sentence

def generate_fake_sentence(sentence_type):
    min_words, max_words = 10, 15
    n_words = random.choice(range(min_words, max_words + 1))
    
    if sentence_type == 'en':
        return faker_en.sentence(n_words)
    elif sentence_type == 'fr':
        return faker_fr.sentence(n_words)
    elif sentence_type == 'mixed':
        words_en = faker_en.sentence(n_words).split()
        words_fr = faker_fr.sentence(n_words).split()
        return random.choice([
            " ".join(words_en[:n_words // 2] + words_fr[n_words // 2:]), 
            " ".join(words_fr[:n_words // 2] + words_en[n_words // 2:])
        ])
    elif sentence_type == 'unknown':
        return random.choice([
            faker_es.sentence(n_words), 
            generate_non_alphabetic_sentence(n_words * 5)
        ])
    
    return None


n_trials = 10000
fake_sentences = dict()

for language in ['en', 'fr', 'mixed', 'unknown']:
    fake_sentences[language] = list()
    for _ in range(n_trials):
        fake_sentences[language].append(generate_fake_sentence(language))
        

In [12]:
# re-write / overwrite evaluation and statistics functions

def evaluate_classifier(classifier, sentences, n_trials, tolerance_list):
    results = []
    
    for tolerance in tolerance_list:
        for k, v in sentences.items():
            for sentence in random.sample(v, n_trials):
                classification = classifier.classify(sentence, tolerance) 
                results.append([tolerance, k, classification, sentence])
                
    df = pd.DataFrame(results, columns=['tolerance', 'language', 'classification', 'sentence'])
    
    # Correct classification
    df['is_correct'] = df['classification'] == df['language']

    # False positives (Predicted X but should be Y)
    df['fr_false_positive'] = (df['classification'] == 'fr') & (df['language'] != 'fr')
    df['en_false_positive'] = (df['classification'] == 'en') & (df['language'] != 'en')
    df['mixed_false_positive'] = (df['classification'] == 'mixed') & (df['language'] != 'mixed')
    df['unknown_false_positive'] = (df['classification'] == 'unknown') & (df['language'] != 'unknown')

    # False negatives (Should be X but classified as something else)
    df['fr_false_negative'] = (df['classification'] != 'fr') & (df['language'] == 'fr')
    df['en_false_negative'] = (df['classification'] != 'en') & (df['language'] == 'en')
    df['mixed_false_negative'] = (df['classification'] != 'mixed') & (df['language'] == 'mixed')
    df['unknown_false_negative'] = (df['classification'] != 'unknown') & (df['language'] == 'unknown')

    return df


def create_stats(results_df):
    grouped_df = results_df.groupby('tolerance').agg(
        total_count=('is_correct', 'count'),  # Total rows
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  

        # False Positives & False Negatives for each category
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum'),
        mixed_false_positive=('mixed_false_positive', 'sum'),
        mixed_false_negative=('mixed_false_negative', 'sum'),
        unknown_false_positive=('unknown_false_positive', 'sum'),
        unknown_false_negative=('unknown_false_negative', 'sum')
    ).reset_index()

    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0

    # Accuracy
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)

    # Helper function for precision and recall
    def calc_precision(correct, false_pos):
        return np.where((correct + false_pos) > 0, correct / (correct + false_pos), 0)

    def calc_recall(correct, false_neg):
        return np.where((correct + false_neg) > 0, correct / (correct + false_neg), 0)

    def calc_f1(precision, recall):
        return np.where((precision + recall) > 0, 2 * (precision * recall) / (precision + recall), 0)

    # Precision & Recall for French
    grouped_df['fr_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['fr_false_positive'])
    grouped_df['fr_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['fr_false_negative'])

    # Precision & Recall for English
    grouped_df['en_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['en_false_positive'])
    grouped_df['en_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['en_false_negative'])

    # Precision & Recall for Mixed
    grouped_df['mixed_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['mixed_false_positive'])
    grouped_df['mixed_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['mixed_false_negative'])

    # Precision & Recall for Unknown
    grouped_df['unknown_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['unknown_false_positive'])
    grouped_df['unknown_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['unknown_false_negative'])

    # F1-scores
    grouped_df['fr_f1_score'] = calc_f1(grouped_df['fr_precision'], grouped_df['fr_recall'])
    grouped_df['en_f1_score'] = calc_f1(grouped_df['en_precision'], grouped_df['en_recall'])
    grouped_df['mixed_f1_score'] = calc_f1(grouped_df['mixed_precision'], grouped_df['mixed_recall'])
    grouped_df['unknown_f1_score'] = calc_f1(grouped_df['unknown_precision'], grouped_df['unknown_recall'])

    # Sum of statistics (optional, useful for ranking)
    grouped_df['all_recall_avg'] = grouped_df[['fr_recall', 'en_recall', 'mixed_recall', 'unknown_recall']].mean(axis=1)
    grouped_df['all_precision_avg'] = grouped_df[['fr_precision', 'en_precision', 'mixed_precision', 'unknown_precision']].mean(axis=1)
    grouped_df['all_f1_score_avg'] = grouped_df[['fr_f1_score', 'en_f1_score', 'mixed_f1_score', 'unknown_f1_score']].mean(axis=1)
    grouped_df['all_stats_avg'] = grouped_df[
        ['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall',
         'mixed_precision', 'mixed_recall', 'unknown_precision', 'unknown_recall',
         'fr_f1_score', 'en_f1_score', 'mixed_f1_score', 'unknown_f1_score']
    ].mean(axis=1)

    return grouped_df


In [13]:
n = 10000
tolerances = [0, 1, 2, 3, 4, 5]

clf = LanguageClassifier()

df = evaluate_classifier(clf, fake_sentences, n, tolerances)
grouped_df = create_stats(df)


In [16]:
# zero tolerance seems best for fake sentences / clean data
grouped_df[['tolerance', 'accuracy', 'all_recall_avg', 'all_precision_avg', 'all_f1_score_avg', 'all_stats_avg']]

Unnamed: 0,tolerance,accuracy,all_recall_avg,all_precision_avg,all_f1_score_avg,all_stats_avg
0,0,0.8,0.94,0.94,0.94,0.93
1,1,0.75,0.93,0.93,0.92,0.91
2,2,0.69,0.91,0.91,0.9,0.89
3,3,0.67,0.9,0.9,0.89,0.88
4,4,0.66,0.9,0.9,0.89,0.88
5,5,0.66,0.9,0.9,0.89,0.88


# CONCLUSION
#### use tolerance 1 as default to balance between CSAS results (1-3) and clean sentences (0)