In [1]:
import pandas as pd
import numpy as np
from langdetect import detect

## Lingua

In [2]:
# Read the tab-separated Lingua sample and discard its 'index' column in a
# single chained expression, then display the resulting frame.
titles = (
    pd.read_csv('stratified_references_sample_5k_lingua.tsv', sep='\t')
    .drop(columns=['index'])
)
titles

Unnamed: 0,bibitem_title,bibitem_string,uuid,language
0,An easy proof of the interior gradient bound f...,"N. J. Korevaar, An easy proof of the interior ...",4dd26b01-f41f-4095-9c55-9302c64e179a,ENGLISH
1,"Quantum black holes with charge, colour, and s...","D. M. Gingrich, “Quantum black holes with char...",0c94ecea-d596-45d2-b87f-6c08bb0a5bf7,ENGLISH
2,,"Bretz H P, Erdmann M, Schiffer P, Walz D and W...",a519032b-0d38-4aed-9ad3-17fcbd2b78e7,UNKNOWN
3,,"Reynolds, R.J. 1984, ApJ, 282, 191",04a8d47a-1505-43b1-b632-257941d1055d,UNKNOWN
4,,"Fong D., Justtanont K., Meixner M., Campbell M...",aed8b252-48bc-4d3c-a78c-e66fc684916e,UNKNOWN
...,...,...,...,...
4995,,"A. Djouadi, J. Kalinowski and P. Zerwas, Z. Ph...",292a7f24-5f83-4292-b20b-6dd98c1450e0,UNKNOWN
4996,,"V.G. Turaev and O.Y. Viro, “State Sum Invarian...",f22f9cd3-d065-4e9a-94b5-085d3334be7f,UNKNOWN
4997,¦A course in computational algebraic number th...,"H. Cohen, ¦A course in computational algebraic...",1550d374-506f-4dc4-b288-4aacdd54baed,ENGLISH
4998,Achieving 100% throughput in an input-queued s...,"N. McKeown, V. Anantharam, and J. Walrand. Ach...",2e869ead-41d5-4107-a741-80fb905d55ee,ENGLISH


In [3]:
# Read the annotation CSV and keep only the columns needed for the comparison,
# in the order listed here.
annotation_columns = ['title given', 'language', 'marked', 'bibitem_string', 'uuid']
manually = pd.read_csv('stratified_references_sample_5k_annot.csv')[annotation_columns]
manually

Unnamed: 0,title given,language,marked,bibitem_string,uuid
0,yes,en,no,"N. J. Korevaar, An easy proof of the interior ...",4dd26b01-f41f-4095-9c55-9302c64e179a
1,yes,en,no,"D. M. Gingrich, “Quantum black holes with char...",0c94ecea-d596-45d2-b87f-6c08bb0a5bf7
2,no,-,no,"Bretz H P, Erdmann M, Schiffer P, Walz D and W...",a519032b-0d38-4aed-9ad3-17fcbd2b78e7
3,no,-,no,"Reynolds, R.J. 1984, ApJ, 282, 191",04a8d47a-1505-43b1-b632-257941d1055d
4,no,-,no,"Fong D., Justtanont K., Meixner M., Campbell M...",aed8b252-48bc-4d3c-a78c-e66fc684916e
...,...,...,...,...,...
4995,no,-,no,"A. Djouadi, J. Kalinowski and P. Zerwas, Z. Ph...",292a7f24-5f83-4292-b20b-6dd98c1450e0
4996,yes,en,no,"V.G. Turaev and O.Y. Viro, “State Sum Invarian...",f22f9cd3-d065-4e9a-94b5-085d3334be7f
4997,yes,en,no,"H. Cohen, ¦A course in computational algebraic...",1550d374-506f-4dc4-b288-4aacdd54baed
4998,yes,en,no,"N. McKeown, V. Anantharam, and J. Walrand. Ach...",2e869ead-41d5-4107-a741-80fb905d55ee


In [4]:
# Join the Lingua labels with the annotations on the shared 'uuid' key.
# Both inputs carry 'language' and 'bibitem_string', so the merge suffixes
# them with _x (titles) and _y (manually): the duplicated reference string
# from the right side is dropped and the remaining columns get explicit names.
df_eval = (
    titles
    .merge(manually, how='inner', on='uuid')
    .drop(columns='bibitem_string_y')
    .rename(columns={
        'language_x': 'lang_lingua',
        'language_y': 'lang_manual',
        'bibitem_string_x': 'bibitem_string',
    })
)

def change_language(lang):
    """Convert an upper-case language name into its two-letter code.

    Parameters
    ----------
    lang : str or missing value
        A label such as 'ENGLISH' or 'GERMAN', or a missing value (NaN/None).

    Returns
    -------
    str or float
        The two-letter code for the seven recognised names, ``np.nan`` when
        ``lang`` is missing, and '-' for any other value (e.g. 'UNKNOWN').
    """
    code_by_name = {
        'ENGLISH': 'en',
        'GERMAN': 'de',
        'FRENCH': 'fr',
        'RUSSIAN': 'ru',
        'ITALIAN': 'it',
        'JAPANESE': 'ja',
        'CHINESE': 'zh',
    }
    code = code_by_name.get(lang)
    if code is not None:
        return code
    # Missing labels stay missing; every other unrecognised label becomes '-'.
    if pd.isna(lang):
        return np.nan
    return '-'
    
# Translate every label in the column with change_language. Series.map
# applies the function element-wise (missing values are passed to it as well)
# and keeps the frame's index, so no positional range/iloc loop is needed.
df_eval['lang_lingua'] = df_eval['lang_lingua'].map(change_language)
df_eval

Unnamed: 0,bibitem_title,bibitem_string,uuid,lang_lingua,title given,lang_manual,marked
0,An easy proof of the interior gradient bound f...,"N. J. Korevaar, An easy proof of the interior ...",4dd26b01-f41f-4095-9c55-9302c64e179a,en,yes,en,no
1,"Quantum black holes with charge, colour, and s...","D. M. Gingrich, “Quantum black holes with char...",0c94ecea-d596-45d2-b87f-6c08bb0a5bf7,en,yes,en,no
2,,"Bretz H P, Erdmann M, Schiffer P, Walz D and W...",a519032b-0d38-4aed-9ad3-17fcbd2b78e7,-,no,-,no
3,,"Reynolds, R.J. 1984, ApJ, 282, 191",04a8d47a-1505-43b1-b632-257941d1055d,-,no,-,no
4,,"Fong D., Justtanont K., Meixner M., Campbell M...",aed8b252-48bc-4d3c-a78c-e66fc684916e,-,no,-,no
...,...,...,...,...,...,...,...
4995,,"A. Djouadi, J. Kalinowski and P. Zerwas, Z. Ph...",292a7f24-5f83-4292-b20b-6dd98c1450e0,-,no,-,no
4996,,"V.G. Turaev and O.Y. Viro, “State Sum Invarian...",f22f9cd3-d065-4e9a-94b5-085d3334be7f,-,yes,en,no
4997,¦A course in computational algebraic number th...,"H. Cohen, ¦A course in computational algebraic...",1550d374-506f-4dc4-b288-4aacdd54baed,en,yes,en,no
4998,Achieving 100% throughput in an input-queued s...,"N. McKeown, V. Anantharam, and J. Walrand. Ach...",2e869ead-41d5-4107-a741-80fb905d55ee,en,yes,en,no


In [5]:
from sklearn.metrics import classification_report

# The manual labels are the reference (y_true) and the Lingua labels the
# prediction (y_pred). Missing labels on either side are replaced by '-'
# before scoring; zero_division=0 reports 0 for undefined scores.
y_true = df_eval.lang_manual.fillna('-')
y_pred = df_eval.lang_lingua.fillna('-')
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           -       0.77      0.99      0.87      2737
          de       0.48      0.44      0.46        27
          en       0.98      0.62      0.76      2188
          fr       0.40      0.61      0.48        33
          it       0.60      0.60      0.60         5
          ja       0.00      0.00      0.00         1
          ru       0.00      0.00      0.00         8
          zh       0.00      0.00      0.00         1

    accuracy                           0.82      5000
   macro avg       0.40      0.41      0.40      5000
weighted avg       0.86      0.82      0.82      5000



In [6]:
# False negatives per language: rows whose manual label is `lang` but whose
# Lingua label is something else, i.e. titles in that language that were
# missed. (A false positive would be the reverse: Lingua says `lang` while
# the manual label differs.) Only rows with marked == 'no' are counted.
for lang in ['fr', 'de', 'it', 'ja','ru','zh']:
    missed = (
        (df_eval.lang_manual == lang)
        & (df_eval.lang_lingua != lang)
        & (df_eval.marked == 'no')
    )
    fn = int(missed.sum())
    print(f'{lang}: {fn}')

fr: 12
de: 15
it: 2
ja: 0
ru: 2
zh: 0
