In [1]:
import unicodedata
import os
import json
import random
import string
import numpy as np
import pandas as pd

from language_classifier import LanguageClassifier
from faker import Faker

faker_en = Faker('en_US')
faker_fr = Faker('fr_FR')
faker_es = Faker('es_ES')
    
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

with open("example_sentences.json", "r", encoding="utf-8") as f:
    sentences = json.load(f)
    
# random seed for reproducibility
random.seed(42)
    

In [2]:
def evaluate_classifier(classifier, sentences, n_trials, tolerance_list, min_words_list):
    results = []
    
    for tolerance in tolerance_list:
        for k, v in sentences.items():
            for sentence in random.sample(v, n_trials):
                for min_words_in_language in min_words_list:
                    classification = classifier.classify(sentence, tolerance, min_words_in_language) 
                    results.append([tolerance, min_words_in_language, k, classification, sentence])
                
    df = pd.DataFrame(results, columns=['tolerance', 'min_words_in_language', 'language', 'classification', 'sentence'])
    
    # Correct classification
    df['is_correct'] = df['classification'] == df['language']

    # False positives (Predicted X but should be Y)
    df['fr_false_positive'] = (df['classification'] == 'fr') & (df['language'] != 'fr')
    df['en_false_positive'] = (df['classification'] == 'en') & (df['language'] != 'en')
    df['mixed_false_positive'] = (df['classification'] == 'mixed') & (df['language'] != 'mixed')
    df['unknown_false_positive'] = (df['classification'] == 'unknown') & (df['language'] != 'unknown')

    # False negatives (Should be X but classified as something else)
    df['fr_false_negative'] = (df['classification'] != 'fr') & (df['language'] == 'fr')
    df['en_false_negative'] = (df['classification'] != 'en') & (df['language'] == 'en')
    df['mixed_false_negative'] = (df['classification'] != 'mixed') & (df['language'] == 'mixed')
    df['unknown_false_negative'] = (df['classification'] != 'unknown') & (df['language'] == 'unknown')

    return df


def create_stats(results_df):
    grouped_df = results_df.groupby(['tolerance', 'min_words_in_language']).agg(
        total_count=('is_correct', 'count'),  # Total rows
        
        # Correct and incorrect classifications
        correct_count=('is_correct', 'sum'),
        wrong_count=('is_correct', lambda x: (~x).sum()),  

        # False Positives & False Negatives for each category
        fr_false_positive=('fr_false_positive', 'sum'),
        fr_false_negative=('fr_false_negative', 'sum'),
        en_false_positive=('en_false_positive', 'sum'),
        en_false_negative=('en_false_negative', 'sum'),
        mixed_false_positive=('mixed_false_positive', 'sum'),
        mixed_false_negative=('mixed_false_negative', 'sum'),
        unknown_false_positive=('unknown_false_positive', 'sum'),
        unknown_false_negative=('unknown_false_negative', 'sum')
    )

    # Avoid division by zero
    valid_mask = grouped_df['total_count'] > 0

    # Accuracy
    grouped_df['accuracy'] = np.where(valid_mask, grouped_df['correct_count'] / grouped_df['total_count'], 0)

    # Helper function for precision and recall
    def calc_precision(correct, false_pos):
        return np.where((correct + false_pos) > 0, correct / (correct + false_pos), 0)

    def calc_recall(correct, false_neg):
        return np.where((correct + false_neg) > 0, correct / (correct + false_neg), 0)

    def calc_f1(precision, recall):
        return np.where((precision + recall) > 0, 2 * (precision * recall) / (precision + recall), 0)

    # Precision & Recall for French
    grouped_df['fr_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['fr_false_positive'])
    grouped_df['fr_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['fr_false_negative'])

    # Precision & Recall for English
    grouped_df['en_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['en_false_positive'])
    grouped_df['en_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['en_false_negative'])

    # Precision & Recall for Mixed
    grouped_df['mixed_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['mixed_false_positive'])
    grouped_df['mixed_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['mixed_false_negative'])

    # Precision & Recall for Unknown
    grouped_df['unknown_precision'] = calc_precision(grouped_df['correct_count'], grouped_df['unknown_false_positive'])
    grouped_df['unknown_recall'] = calc_recall(grouped_df['correct_count'], grouped_df['unknown_false_negative'])

    # F1-scores
    grouped_df['fr_f1_score'] = calc_f1(grouped_df['fr_precision'], grouped_df['fr_recall'])
    grouped_df['en_f1_score'] = calc_f1(grouped_df['en_precision'], grouped_df['en_recall'])
    grouped_df['mixed_f1_score'] = calc_f1(grouped_df['mixed_precision'], grouped_df['mixed_recall'])
    grouped_df['unknown_f1_score'] = calc_f1(grouped_df['unknown_precision'], grouped_df['unknown_recall'])

    # Sum of statistics (optional, useful for ranking) 
    #  NOTE: only aggregate en and fr
    grouped_df['false_positive_sum'] = grouped_df[['fr_false_positive', 'en_false_positive']].sum(axis=1)
    grouped_df['false_negative_sum'] = grouped_df[['fr_false_negative', 'en_false_negative']].sum(axis=1)
    grouped_df['recall_avg'] = grouped_df[['fr_recall', 'en_recall']].mean(axis=1)
    grouped_df['precision_avg'] = grouped_df[['fr_precision', 'en_precision']].mean(axis=1)
    grouped_df['f1_score_avg'] = grouped_df[['fr_f1_score', 'en_f1_score']].mean(axis=1)
    grouped_df['all_stats_avg'] = grouped_df[
        ['accuracy', 'fr_precision', 'fr_recall', 'en_precision', 'en_recall', 'fr_f1_score', 'en_f1_score']
    ].mean(axis=1)

    return grouped_df


In [3]:
n_trials = min(len(sentences['fr']), len(sentences['en']))  # all fr, equal number of en
tolerances = [0, 1, 2, 3, 4, 5]  # NOTE: tolerance has been renamed to be max_incorrect_words
min_words_list = [1, 2, 3, 4, 5]

clf = LanguageClassifier()

df1 = evaluate_classifier(clf, sentences, n_trials, tolerances, min_words_list)
grouped_df1 = create_stats(df1)

In [4]:
grouped_df1[['false_positive_sum', 'false_negative_sum', 'recall_avg', 'precision_avg', 'f1_score_avg', 'all_stats_avg']]

Unnamed: 0_level_0,Unnamed: 1_level_0,false_positive_sum,false_negative_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
tolerance,min_words_in_language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,16,4643,0.966,1.0,0.983,0.976
0,2,0,5703,0.958,1.0,0.979,0.97
0,3,0,8167,0.939,1.0,0.968,0.957
0,4,0,12651,0.902,1.0,0.948,0.932
0,5,0,18414,0.851,1.0,0.919,0.897
1,1,22,531,0.996,1.0,0.998,0.997
1,2,5,1632,0.988,1.0,0.994,0.992
1,3,1,4222,0.969,1.0,0.984,0.978
1,4,0,8842,0.934,1.0,0.966,0.953
1,5,0,14929,0.883,1.0,0.938,0.919


In [5]:
grouped_df1.groupby('tolerance').mean().T

tolerance,0,1,2,3,4,5
total_count,70786.0,70786.0,70786.0,70786.0,70786.0,70786.0
correct_count,60870.4,64754.8,65005.2,64934.6,64960.4,64985.2
wrong_count,9915.6,6031.2,5780.8,5851.4,5825.6,5800.8
fr_false_positive,2.4,3.8,4.6,4.6,2.8,2.2
fr_false_negative,5099.0,2048.0,1872.2,1860.0,1858.0,1858.0
en_false_positive,0.8,1.8,2.4,2.4,2.4,2.4
en_false_negative,4816.6,3983.2,3908.6,3991.4,3967.6,3942.8
mixed_false_positive,4267.6,405.8,48.6,6.0,1.0,0.0
mixed_false_negative,0.0,0.0,0.0,0.0,0.0,0.0
unknown_false_positive,5644.8,5619.8,5725.2,5838.4,5819.4,5796.2


In [6]:
for n_words in min_words_list:
    print('min words', n_words)
    display(grouped_df1.xs(key=n_words, level=1).groupby('tolerance').sum().T.loc[['false_positive_sum', 'false_negative_sum']])

min words 1


tolerance,0,1,2,3,4,5
false_positive_sum,16.0,22.0,25.0,27.0,17.0,17.0
false_negative_sum,4643.0,531.0,336.0,337.0,307.0,302.0


min words 2


tolerance,0,1,2,3,4,5
false_positive_sum,0.0,5.0,5.0,6.0,7.0,4.0
false_negative_sum,5703.0,1632.0,1369.0,1398.0,1387.0,1384.0


min words 3


tolerance,0,1,2,3,4,5
false_positive_sum,0.0,1.0,3.0,2.0,2.0,2.0
false_negative_sum,8167.0,4222.0,3932.0,3989.0,3999.0,3970.0


min words 4


tolerance,0,1,2,3,4,5
false_positive_sum,0.0,0.0,1.0,0.0,0.0,0.0
false_negative_sum,12651.0,8842.0,8609.0,8693.0,8637.0,8615.0


min words 5


tolerance,0,1,2,3,4,5
false_positive_sum,0.0,0.0,1.0,0.0,0.0,0.0
false_negative_sum,18414.0,14929.0,14658.0,14840.0,14798.0,14733.0


In [7]:
grouped_df1.xs(key=1, level=0)

Unnamed: 0_level_0,total_count,correct_count,wrong_count,fr_false_positive,fr_false_negative,en_false_positive,en_false_negative,mixed_false_positive,mixed_false_negative,unknown_false_positive,unknown_false_negative,accuracy,fr_precision,fr_recall,en_precision,en_recall,mixed_precision,mixed_recall,unknown_precision,unknown_recall,fr_f1_score,en_f1_score,mixed_f1_score,unknown_f1_score,false_positive_sum,false_negative_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1,70786,70255,531,16,264,6,267,310,0,199,0,0.992,1.0,0.996,1.0,0.996,0.996,1.0,0.997,1.0,0.998,0.998,0.998,0.999,22,531,0.996,1.0,0.998,0.997
2,70786,69154,1632,3,467,2,1165,310,0,1317,0,0.977,1.0,0.993,1.0,0.983,0.996,1.0,0.981,1.0,0.997,0.992,0.998,0.991,5,1632,0.988,1.0,0.994,0.992
3,70786,66564,4222,0,1252,1,2970,422,0,3799,0,0.94,1.0,0.982,1.0,0.957,0.994,1.0,0.946,1.0,0.991,0.978,0.997,0.972,1,4222,0.969,1.0,0.984,0.978
4,70786,61944,8842,0,2973,0,5869,475,0,8367,0,0.875,1.0,0.954,1.0,0.913,0.992,1.0,0.881,1.0,0.977,0.955,0.996,0.937,0,8842,0.934,1.0,0.966,0.953
5,70786,55857,14929,0,5284,0,9645,512,0,14417,0,0.789,1.0,0.914,1.0,0.853,0.991,1.0,0.795,1.0,0.955,0.921,0.995,0.886,0,14929,0.883,1.0,0.938,0.919


In [8]:
false_positive_sentences = list(set(df1.loc[
    ((df1.fr_false_positive == True) | (df1.en_false_positive == True)), 
    'sentence'
].to_list()))

def show_classification(sentence):
    print(f"{sentence=}")
    for tolerance in tolerances:
        for n_words in min_words_list:
            classification = clf.classify(sentence, tolerance, n_words)
            if classification in ['en', 'fr'] and n_words > 1:
                print(f"{classification=}, {tolerance=}, {n_words=}")
                
for sentence in false_positive_sentences:
    show_classification(sentence)
    print()

sentence='national academies of sciences engineering and medicine'

sentence='principal investigators dfo eddy carmack retired'

sentence='sea otters and oil ecologic perspectives'

sentence='ices journal of marine sciences'
classification='fr', tolerance=1, n_words=2
classification='fr', tolerance=2, n_words=2
classification='fr', tolerance=3, n_words=2
classification='fr', tolerance=4, n_words=2
classification='fr', tolerance=5, n_words=2

sentence='widespread diminishing effects on calcium in freshwaters'

sentence='a mitochondrial superoxide theory for oxidative stress diseases and aging'

sentence='stock synthesis user manual version'

sentence='the quest for successful atlantic salmon restoration perspectives priorities and maxims'

sentence='f fertility solved for figure cvf coefficient of variation for f'

sentence='explain interbasin in res doc'

sentence='smallmouth bass and chain pickerel in the petite rivière lakes'
classification='fr', tolerance=1, n_words=2
classification

In [9]:
# en/fr n>1 errors avoided with:
#   n_words > 2 or tolerance = 0
#   n_words > 3 or tolerance = 0
#                  tolerance < 2
#   n_words > 3 or tolerance < 2
#   n_words > 2 or tolerance = 0

In [10]:
for tolerance in [0, 1]:
    print('tolerance', tolerance)
    display(grouped_df1.xs(key=tolerance, level=0).groupby('min_words_in_language').sum().T.loc[['false_positive_sum', 'false_negative_sum']])

tolerance 0


min_words_in_language,1,2,3,4,5
false_positive_sum,16.0,0.0,0.0,0.0,0.0
false_negative_sum,4643.0,5703.0,8167.0,12651.0,18414.0


tolerance 1


min_words_in_language,1,2,3,4,5
false_positive_sum,22.0,5.0,1.0,0.0,0.0
false_negative_sum,531.0,1632.0,4222.0,8842.0,14929.0


In [12]:
n_trials = min(len(sentences['fr']), len(sentences['en']))
possible_n_words = [2, 3, 4]   # no false positives require >4 to avoid
possible_tolerance = [0, 1]  # tolerance doesn't improve much past one

df1_v1 = evaluate_classifier(clf, sentences, n_trials, possible_tolerance, possible_n_words)
grouped_df1_v1 = create_stats(df1_v1)

In [21]:
# BEST SCORES: 
#  (0, 2) - perfect false negative, then best all stats 
#     (5666 false negative) <- highest quality data
#  (1, 2) - then best all stats, only 4 false positives (typical is between 0-7 as long as n_words > 1) 
#     (1618 false negative) <- most data (only ~%5.7 increase in (presumably lower quality data) data)
grouped_df1_v1[['false_positive_sum', 'false_negative_sum', 'recall_avg', 'precision_avg', 'f1_score_avg', 'all_stats_avg']].sort_values(['false_positive_sum', 'all_stats_avg'], ascending=[True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,false_positive_sum,false_negative_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
tolerance,min_words_in_language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2,0,5666,0.958,1.0,0.979,0.971
0,3,0,8103,0.939,1.0,0.969,0.957
1,4,0,8892,0.933,1.0,0.965,0.953
0,4,0,12576,0.903,1.0,0.949,0.932
1,3,1,4258,0.969,1.0,0.984,0.978
1,2,4,1618,0.988,1.0,0.994,0.992


In [35]:
grouped_df1[['false_positive_sum', 'false_negative_sum', 'recall_avg', 'precision_avg', 'f1_score_avg', 'all_stats_avg']].sort_values(['false_positive_sum', 'all_stats_avg'], ascending=[True, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,false_positive_sum,false_negative_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
tolerance,min_words_in_language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2,0,5703,0.958,1.0,0.979,0.97
0,3,0,8167,0.939,1.0,0.968,0.957
5,4,0,8615,0.936,1.0,0.967,0.955
4,4,0,8637,0.936,1.0,0.967,0.955
3,4,0,8693,0.935,1.0,0.966,0.954
1,4,0,8842,0.934,1.0,0.966,0.953
0,4,0,12651,0.902,1.0,0.948,0.932
5,5,0,14733,0.885,1.0,0.939,0.92
4,5,0,14798,0.884,1.0,0.938,0.92
3,5,0,14840,0.884,1.0,0.938,0.919


In [8]:
# max min_words best for minimizing false_positives (3, 4, 5)
grouped_df1.groupby('min_words_in_language').mean().T

min_words_in_language,1,2,3,4,5
total_count,70786.0,70786.0,70786.0,70786.0,70786.0
correct_count,69710.0,68640.5,66072.833,61444.833,55390.667
wrong_count,1076.0,2145.5,4713.167,9341.167,15395.333
fr_false_positive,14.333,2.167,0.167,0.167,0.167
fr_false_negative,670.667,873.667,1640.667,3340.167,5637.5
en_false_positive,6.333,2.333,1.5,0.0,0.0
en_false_negative,405.333,1271.833,3072.5,6001.0,9757.833
mixed_false_positive,796.667,796.667,804.5,788.333,754.667
mixed_false_negative,0.0,0.0,0.0,0.0,0.0
unknown_false_positive,258.667,1344.333,3907.0,8552.667,14640.5


In [9]:
# looks like 3-5 is the optimal tolerance range, but results are close
grouped_df1.groupby('tolerance').mean()[['all_stats_avg', 'f1_score_avg']]

Unnamed: 0_level_0,all_stats_avg,f1_score_avg
tolerance,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.946,0.96
1,0.968,0.976
2,0.969,0.977
3,0.969,0.977
4,0.969,0.977
5,0.969,0.977


In [10]:
# tolerance of 0 or 1 best 
grouped_df1.groupby('tolerance').mean()[['fr_recall', 'en_recall', 'recall_avg', 'false_positive_sum']]

Unnamed: 0_level_0,fr_recall,en_recall,recall_avg,false_positive_sum
tolerance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.921,0.925,0.923,3.2
1,0.968,0.941,0.954,5.6
2,0.97,0.942,0.956,7.0
3,0.971,0.941,0.956,7.0
4,0.971,0.941,0.956,5.2
5,0.971,0.941,0.956,4.6


In [11]:
# looks like 1 is the optimal tolerance range
grouped_df1.groupby('min_words_in_language').mean()[['all_stats_avg', 'f1_score_avg']]

Unnamed: 0_level_0,all_stats_avg,f1_score_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.994,0.996
2,0.989,0.992
3,0.976,0.982
4,0.951,0.963
5,0.916,0.935


In [12]:
# 1 is best
grouped_df1.groupby('min_words_in_language').mean()[['fr_recall', 'en_recall', 'recall_avg', 'false_positive_sum']]

Unnamed: 0_level_0,fr_recall,en_recall,recall_avg,false_positive_sum
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.99,0.994,0.992,20.667
2,0.987,0.982,0.985,4.5
3,0.976,0.955,0.966,1.667
4,0.948,0.911,0.93,0.167
5,0.908,0.85,0.879,0.167


In [13]:
# check min words only where tolerance == 1
grouped_df1.loc[1, ['accuracy', 'recall_avg', 'false_positive_sum', 'precision_avg', 'f1_score_avg', 'all_stats_avg']]

Unnamed: 0_level_0,accuracy,recall_avg,false_positive_sum,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.992,0.996,22,1.0,0.998,0.997
2,0.977,0.988,5,1.0,0.994,0.992
3,0.94,0.969,1,1.0,0.984,0.978
4,0.875,0.934,0,1.0,0.966,0.953
5,0.789,0.883,0,1.0,0.938,0.919


In [14]:
# check min words only where tolerance == 2
grouped_df1.loc[2, ['accuracy', 'recall_avg', 'false_positive_sum', 'precision_avg', 'f1_score_avg', 'all_stats_avg']]

Unnamed: 0_level_0,accuracy,recall_avg,false_positive_sum,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.995,0.998,25,1.0,0.999,0.998
2,0.981,0.99,5,1.0,0.995,0.993
3,0.944,0.972,3,1.0,0.986,0.98
4,0.878,0.936,1,1.0,0.967,0.955
5,0.793,0.886,1,1.0,0.939,0.92


# let's take a look at some mistakes

In [15]:
# first, let's drop everything that is not tolerance or min wors of 1

df1_v2 = df1[(df1.tolerance == 1) & (df1.min_words_in_language == 1)].reset_index(drop=True)

In [16]:
# how many errors vs correct?
(
    df1_v2.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0], 
    df1_v2.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0],
    df1_v2.loc[~df.is_correct, ['language', 'classification', 'sentence']].shape[0] / df.loc[df.is_correct, ['language', 'classification', 'sentence']].shape[0]
 )

NameError: name 'df' is not defined

In [14]:
df1_v2.loc[~df1_v2.is_correct, ['language', 'classification', 'sentence']]

Unnamed: 0,language,classification,sentence
151,en,mixed,we are also grateful to the staff of the réseau québécois durgences pour les mammifères marins rqumm c
365,en,mixed,learned deafness and chronic stress clark et al
369,en,unknown,the brooding feather star antedon sp
617,en,unknown,annotation photogrammetry mosaicing and relating e
659,en,unknown,iqr and upward to q
...,...,...,...
18974,fr,unknown,nageoires adipeuse ventrale pelvienne détiquettes p
19453,fr,mixed,en le relevé sest déroulé du août au septembre à bord du ngcc teleost et du ngcc john cabot
19833,fr,mixed,proportion année indice du frai na na figures figure
19843,fr,mixed,prises nominales déclarées en tonnes de pandalus borealis et de pandalus montagui dans la zone dévaluation est


In [15]:
df1_v2.loc[~df1_v2.is_correct, ['classification']].value_counts()

classification
mixed             93
unknown           61
fr                10
en                 2
Name: count, dtype: int64

# Generate Fake Sentences - Test the Classifier

In [16]:
def generate_non_alphabetic_sentence(length):
    characters = string.digits + string.punctuation + " " * 30 + "|" * 5
    sentence = "".join(random.choices(characters, k=length))
    sentence = sentence.replace(" ", " " * random.randint(2, 4))
    return sentence

def generate_fake_sentence(sentence_type):
    min_words, max_words = 10, 15
    n_words = random.choice(range(min_words, max_words + 1))
    
    if sentence_type == 'en':
        return faker_en.sentence(n_words)
    elif sentence_type == 'fr':
        return faker_fr.sentence(n_words)
    elif sentence_type == 'mixed':
        words_en = faker_en.sentence(n_words).split()
        words_fr = faker_fr.sentence(n_words).split()
        return random.choice([
            " ".join(words_en[:n_words // 2] + words_fr[n_words // 2:]), 
            " ".join(words_fr[:n_words // 2] + words_en[n_words // 2:])
        ])
    elif sentence_type == 'unknown':
        return random.choice([
            faker_es.sentence(n_words), 
            generate_non_alphabetic_sentence(n_words * 5)
        ])
    
    return None


n_trials = 10000
fake_sentences = dict()

for language in ['en', 'fr', 'mixed', 'unknown']:
    fake_sentences[language] = list()
    for _ in range(n_trials):
        fake_sentences[language].append(generate_fake_sentence(language))
        

In [17]:
n = 10000
tolerances = [0, 1, 2, 3, 4, 5]
min_words_list = [1, 2, 3, 4, 5]

clf = LanguageClassifier()

grouped_df2 = create_stats(evaluate_classifier(clf, fake_sentences, n, tolerances, min_words_list))

In [18]:
grouped_df2.groupby('tolerance').mean().T

tolerance,0,1,2,3,4,5
total_count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
correct_count,26552.8,25948.6,24258.4,23054.2,22678.0,22603.0
wrong_count,13447.2,14051.4,15741.6,16945.8,17322.0,17397.0
fr_false_positive,519.8,956.6,1220.0,1321.4,1337.4,1341.4
fr_false_negative,5372.2,4834.8,4803.0,4802.0,4802.0,4802.0
en_false_positive,1387.8,2760.4,3497.8,3753.2,3789.2,3791.2
en_false_negative,2886.6,2245.2,2218.2,2218.2,2218.2,2218.2
mixed_false_positive,1689.8,484.6,37.2,1.0,0.0,0.0
mixed_false_negative,4722.0,6505.0,8334.6,9543.8,9920.0,9995.0
unknown_false_positive,9849.8,9849.8,10986.6,11870.2,12195.4,12264.4


In [19]:
# zero tolerance seems best for fake sentences / clean data
grouped_df2[['accuracy', 'recall_avg', 'false_positive_sum', 'precision_avg', 'f1_score_avg', 'all_stats_avg']].groupby('tolerance').mean()

Unnamed: 0_level_0,accuracy,recall_avg,false_positive_sum,precision_avg,f1_score_avg,all_stats_avg
tolerance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.664,0.859,1907.6,0.97,0.907,0.876
1,0.649,0.876,3717.0,0.942,0.902,0.869
2,0.606,0.87,4717.8,0.922,0.888,0.853
3,0.576,0.864,5074.6,0.913,0.88,0.841
4,0.567,0.861,5126.6,0.911,0.877,0.838
5,0.565,0.861,5132.6,0.91,0.877,0.837


In [20]:
# 1 word minimum per sentence seems optimal for min_words
grouped_df2[['accuracy', 'recall_avg', 'false_positive_sum', 'precision_avg', 'f1_score_avg', 'all_stats_avg']].groupby('min_words_in_language').mean()

Unnamed: 0_level_0,accuracy,recall_avg,false_positive_sum,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.722,0.977,8685.333,0.871,0.92,0.894
2,0.707,0.944,6261.333,0.902,0.921,0.891
3,0.628,0.882,3908.0,0.929,0.902,0.865
4,0.529,0.803,1870.833,0.958,0.871,0.828
5,0.437,0.718,671.333,0.981,0.828,0.785


In [21]:
# check only where tolerance == 1
grouped_df2.loc[1, ['accuracy', 'false_positive_sum', 'recall_avg', 'precision_avg', 'f1_score_avg', 'all_stats_avg']]

Unnamed: 0_level_0,accuracy,false_positive_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.761,8063,0.981,0.885,0.93,0.908
2,0.746,5639,0.95,0.915,0.931,0.906
3,0.685,3044,0.894,0.949,0.919,0.887
4,0.579,1355,0.819,0.972,0.887,0.848
5,0.471,484,0.735,0.987,0.841,0.8


In [22]:
# check only where tolerance == 2
grouped_df2.loc[2, ['accuracy', 'false_positive_sum', 'recall_avg', 'precision_avg', 'f1_score_avg', 'all_stats_avg']]

Unnamed: 0_level_0,accuracy,false_positive_sum,recall_avg,precision_avg,f1_score_avg,all_stats_avg
min_words_in_language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.716,9449,0.982,0.861,0.916,0.89
2,0.701,7025,0.948,0.891,0.917,0.888
3,0.619,4430,0.885,0.92,0.9,0.861
4,0.542,1990,0.809,0.957,0.875,0.832
5,0.455,695,0.728,0.981,0.835,0.792
