In [24]:
# -- fix path --
from pathlib import Path
import sys
sys.path.append(str(Path('..').resolve()))
from source.resources import *
from source.metrics import *
from source.helper import *
from source.preprocessor import *
from source.constants import *
import Levenshtein
import wordfreq
import torch
from string import punctuation
from nltk import word_tokenize
from functools import lru_cache
import unicodedata
from collections import Counter
import nltk
import magic
import platform
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# import spacy
# nlp = spacy.load("en_core_web_sm")

from joblib import Memory

from transformers import pipeline

# Directory the notebook runs from; run_experiments writes its `scores/` folder beneath it.
current_dir = Path('.')
# joblib cache rooted at CACHE_DIR; used below as the `@memory.cache()` decorator on eval_dataset.
memory = Memory(CACHE_DIR, verbose=False)



[nltk_data] Downloading package punkt to /home/kim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
@lru_cache(maxsize=5)
def get_stopwords(language):
    """Return the NLTK stopword list for ``language`` as a set.

    The result is memoized per language (up to 5 languages), so the same
    set object is handed back on repeated calls.

    Args:
        language: Language name passed straight to ``stopwords.words``
            (the notebook uses 'english', 'spanish' and 'portuguese').

    Returns:
        A ``set`` of the stopword strings for that language.
    """
    words = stopwords.words(language)
    return set(words)


@lru_cache(maxsize=1024)
def is_punctuation(word):
    """Return True when ``word`` contains nothing but punctuation characters.

    A character counts as punctuation when it is in ``string.punctuation``
    (ASCII punctuation and symbols) or when its Unicode general category is
    one of the ``P*`` classes. The Unicode check matters for the Spanish and
    Portuguese experiments, where tokens such as '¿', '¡', '«' or '—' are not
    part of ``string.punctuation``.

    The empty string is reported as punctuation (vacuous truth), so empty
    candidates are filtered out together with punctuation-only ones.

    Args:
        word: The token to inspect.

    Returns:
        ``True`` if every character of ``word`` is punctuation (or ``word``
        is empty), ``False`` otherwise.
    """
    return all(
        char in punctuation or unicodedata.category(char).startswith('P')
        for char in word
    )

def remove_punctuation(candidates):
    """Drop every candidate that ``is_punctuation`` flags.

    Args:
        candidates: Iterable of candidate strings.

    Returns:
        A new list with the remaining candidates in their original order.
    """
    kept = []
    for candidate in candidates:
        if is_punctuation(candidate):
            continue
        kept.append(candidate)
    return kept


def remove_stopwords(candidates, language):
    """Drop candidates whose lower-cased form is a stopword of ``language``.

    Args:
        candidates: Iterable of candidate strings.
        language: Language name forwarded to ``get_stopwords``.

    Returns:
        A new list with the non-stopword candidates in their original order
        and original casing.
    """
    stop_set = get_stopwords(language)
    kept = []
    for candidate in candidates:
        if candidate.lower() in stop_set:
            continue
        kept.append(candidate)
    return kept
    


In [26]:
# Pick the compute device once for the whole notebook.
# On macOS with an available MPS backend the string 'mps' wins; otherwise
# the first CUDA device is used when present, falling back to the CPU.
if platform.system() == 'Darwin' and torch.backends.mps.is_available():
    device = 'mps'  # Apple-silicon Macs
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

def replace(text, old, new):
    """Replace every case-insensitive occurrence of the literal ``old`` in ``text``.

    ``old`` is matched as plain text, not as a regular expression: it is
    escaped before compiling, so words containing regex metacharacters
    (e.g. 'c++', '(b)', 'a.b') neither raise nor match unintended text.
    ``new`` is likewise inserted verbatim rather than being interpreted as a
    replacement template (backslashes and group references are not expanded).

    Args:
        text: The string to search in.
        old: The literal substring to replace; matching ignores case.
        new: The literal string inserted for each match.

    Returns:
        ``text`` with all matches of ``old`` replaced by ``new``.
    """
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    # A callable replacement bypasses template processing of `new`.
    return pattern.sub(lambda _match: new, text)

def generate_candidates(model, text, top_k=10):
    """Ask ``model`` for ``top_k`` predictions and return their token strings.

    ``model`` is called as ``model(text, top_k=top_k)`` and must return
    dict-like items carrying a ``'token_str'`` entry. When the result is
    nested (more than one dimension once wrapped in a NumPy array), only the
    first row is used — presumably the predictions for the first mask token
    in ``text``; confirm against the pipeline in use.

    Args:
        model: Callable producing the predictions.
        text: Input text handed to ``model``.
        top_k: Number of predictions requested from ``model``.

    Returns:
        A list of the predicted token strings with surrounding whitespace
        stripped.
    """
    predictions = np.array(model(text, top_k=top_k))
    if predictions.ndim > 1:
        predictions = predictions[0]
    candidates = []
    for prediction in predictions:
        candidates.append(prediction['token_str'].strip())
    return candidates

@memory.cache()
def eval_dataset(model_name, dataset, language, top_k=10, device=device):
    """Generate masked-LM substitution candidates for a dataset and score them.

    For each row, the complex word is replaced by the tokenizer's mask token
    and the model input is built as
    ``"<original text> <sep_token> <masked text>"``. ``top_k + 50``
    predictions are requested so that enough candidates remain after
    punctuation and stopwords are filtered out; the first ``top_k`` survivors
    are kept. Results are memoized through the joblib ``memory`` cache.

    Args:
        model_name: Model identifier used for both the fill-mask model and
            its tokenizer.
        dataset: Dataset identifier passed to ``load_dataset``.
        language: Stopword language forwarded to ``remove_stopwords``.
        top_k: Number of candidates kept per row and the ``k`` used for the
            metrics.
        device: Device handed to the ``pipeline`` constructor.

    Returns:
        A ``(data, res)`` tuple: ``data`` is the loaded dataset with an added
        ``'mask_pred_candidates'`` column, and ``res`` is the value returned
        by ``precision_metrics_at_k`` for those predictions against
        ``data['list_candidates']``.
    """
    nlp_fill = pipeline('fill-mask', model=model_name, tokenizer=model_name, device=device)
    data = update_candidates(load_dataset(dataset))

    list_pred_candidates = []
    rows = zip(data['text'], data['complex_word'])
    for text, complex_word in tqdm(rows, total=len(data)):
        # Feed the untouched sentence followed by its masked copy.
        masked_text = replace(text, complex_word, nlp_fill.tokenizer.mask_token)
        masked_text = f'{text} {nlp_fill.tokenizer.sep_token} {masked_text}'

        # Over-generate, filter, then truncate back to top_k.
        raw_candidates = generate_candidates(nlp_fill, masked_text, top_k=top_k+50)
        filtered = remove_stopwords(remove_punctuation(raw_candidates), language)
        list_pred_candidates.append(filtered[:top_k])

    data['mask_pred_candidates'] = list_pred_candidates
    res = precision_metrics_at_k(data['mask_pred_candidates'], data['list_candidates'], k=top_k)
    return data, res

In [27]:
def run_experiments(models, dataset, language, top_k=50):
    """Evaluate several fill-mask models on one dataset and report their scores.

    For every model, predictions are stored as a CSV under
    ``RESOURCES_DIR / 'mask_pred_candidates'``. If that CSV already exists it
    is reloaded and the metrics are recomputed from it; otherwise
    ``eval_dataset`` is run (retrying on the CPU when the selected device
    raises ``NotImplementedError``) and the CSV is written.

    Both paths now record the same five values per model
    ``(model, ACC@1, potential, recall, precision)``. Previously the
    fresh-evaluation path omitted ACC@1, which broke the report loop and
    shifted the sort key onto a different metric.

    The ranking (sorted by potential, descending) is printed and a
    ``model / Potential / ACC@1`` table is written to
    ``current_dir / 'scores' / f'{dataset}_topk_{top_k}.csv'``.

    Args:
        models: Iterable of model identifiers.
        dataset: Dataset identifier forwarded to ``eval_dataset``; also used
            in output file names.
        language: Stopword language forwarded to ``eval_dataset``.
        top_k: Number of candidates per instance and the ``k`` of the metrics.

    Returns:
        None. Results are printed and written to disk.
    """
    results_dir = RESOURCES_DIR / 'mask_pred_candidates'
    results_dir.mkdir(parents=True, exist_ok=True)
    output_name = f'{dataset}_topk_{top_k}'

    results = []
    for model in models:
        # NOTE: '|' is not a valid file-name character on Windows; it is kept
        # so that prediction files written by earlier runs are still found.
        output_filepath = results_dir / f'{output_name}_{model.replace("/", "|")}.csv'
        if not output_filepath.exists():
            try:
                data, res = eval_dataset(model, dataset, language, top_k, device)
            except NotImplementedError:
                print(f'{model} is not implemented for mps, use cpu instead.')
                data, res = eval_dataset(model, dataset, language, top_k, device='cpu')

            # ACC@1 has to be computed on the real lists, i.e. before the
            # columns are JSON-encoded for storage below.
            acc1 = accuracy_at_1(data['mask_pred_candidates'], data['list_candidates'])

            # JSON-encode list-valued columns so they survive the CSV round trip.
            data['candidates'] = data['candidates'].apply(json.dumps)
            data['list_candidates'] = data['list_candidates'].apply(json.dumps)
            data['mask_pred_candidates'] = data['mask_pred_candidates'].apply(json.dumps)

            data.to_csv(output_filepath, index=False)
        else:
            data = pd.read_csv(output_filepath)
            data['list_candidates'] = data['list_candidates'].apply(json.loads)
            data['mask_pred_candidates'] = data['mask_pred_candidates'].apply(json.loads)
            res = precision_metrics_at_k(data['mask_pred_candidates'], data['list_candidates'], k=top_k)
            acc1 = accuracy_at_1(data['mask_pred_candidates'], data['list_candidates'])

        # Same tuple layout for both paths: (model, acc1, potential, recall, precision).
        results.append((model, acc1, res['potential'], res['recall'], res['precision']))

    print("\nSorted:", '='*80)
    print(f'Top k: {top_k}')
    # Index 2 of each tuple is the potential score.
    results = sorted(results, key=lambda x: x[2], reverse=True)

    scores_dir = current_dir / 'scores'
    scores_dir.mkdir(parents=True, exist_ok=True)

    print('ACC1\tPotential\tRecall\tPrecision')
    scores = []
    for model, acc1, potential, recall, precision in results:
        # Every metric is printed with four decimals (recall used to print six).
        print(f'{model:<50}: {acc1:.4f} {potential:.4f} {recall:.4f} {precision:.4f}')
        scores.append({'model':model, 'Potential':potential, 'ACC@1': acc1})
    pd.DataFrame(scores).to_csv(scores_dir / f'{output_name}.csv', index=False)

# English

In [28]:

# Models evaluated on Dataset.TSAR_EN with English stopword filtering.
models = [
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-base-cased',
    'bert-large-cased',
    'xlm-roberta-large',
    'albert-base-v2',
    'roberta-base',
    'roberta-large',
    'distilbert-base-uncased',
    # Disabled: 'bert-base-multilingual-cased',
]
# One full run (and one score table) per candidate-list size.
for top_k in [5, 10, 15, 20, 30, 40, 50]:
    run_experiments(models, Dataset.TSAR_EN, 'english', top_k=top_k)


Top k: 5
ACC1	Potential	Recall	Precision
roberta-base                                      : 0.5518 0.9404 0.218587 0.4088
bert-base-uncased                                 : 0.5518 0.8990 0.209554 0.3907
bert-large-uncased                                : 0.5699 0.8938 0.207627 0.3845
bert-large-cased                                  : 0.5596 0.8860 0.215043 0.3995
roberta-large                                     : 0.5881 0.8575 0.176404 0.3363
bert-base-cased                                   : 0.5311 0.8549 0.204418 0.3788
distilbert-base-uncased                           : 0.5389 0.8549 0.192577 0.3591
albert-base-v2                                    : 0.5207 0.8005 0.179119 0.3347
xlm-roberta-large                                 : 0.4301 0.7073 0.135140 0.2518

Top k: 10
ACC1	Potential	Recall	Precision
roberta-base                                      : 0.5518 0.9715 0.311185 0.2972
bert-large-uncased                                : 0.5699 0.9456 0.301236 0.2839
bert-large-ca

# Spanish

In [29]:

# Models evaluated on Dataset.TSAR_ES with Spanish stopword filtering.
models = [
    'PlanTL-GOB-ES/roberta-base-bne',
    'PlanTL-GOB-ES/roberta-large-bne',
    'dccuchile/bert-base-spanish-wwm-cased',  # BETO
    'dccuchile/bert-base-spanish-wwm-uncased',
    'dccuchile/distilbert-base-spanish-uncased',
    'dccuchile/albert-base-spanish',
    'dccuchile/albert-xxlarge-spanish',
    'xlm-roberta-large',
    'distilbert-base-multilingual-cased',
    'bert-base-multilingual-uncased',
    # Disabled:
    # 'flax-community/bertin-roberta-large-spanish',
    # 'bertin-project/bertin-roberta-base-spanish',
    # 'skimai/spanberta-base-cased',
]
# One full run (and one score table) per candidate-list size.
for top_k in [5, 10, 15, 20, 30, 40, 50]:
    run_experiments(models, Dataset.TSAR_ES, 'spanish', top_k=top_k)


Top k: 5
ACC1	Potential	Recall	Precision
PlanTL-GOB-ES/roberta-large-bne                   : 0.4357 0.7743 0.169460 0.2955
PlanTL-GOB-ES/roberta-base-bne                    : 0.3990 0.7717 0.160067 0.2761
dccuchile/bert-base-spanish-wwm-cased             : 0.3963 0.7375 0.154160 0.2719
dccuchile/albert-xxlarge-spanish                  : 0.3570 0.6745 0.135248 0.2346
dccuchile/albert-base-spanish                     : 0.2782 0.6168 0.127575 0.2184
xlm-roberta-large                                 : 0.3412 0.5879 0.105462 0.1874
dccuchile/distilbert-base-spanish-uncased         : 0.3255 0.5827 0.114422 0.1953
dccuchile/bert-base-spanish-wwm-uncased           : 0.3150 0.5696 0.111070 0.1895
bert-base-multilingual-uncased                    : 0.2205 0.4646 0.080137 0.1407
distilbert-base-multilingual-cased                : 0.1365 0.3176 0.049224 0.0861

Top k: 10
ACC1	Potential	Recall	Precision
PlanTL-GOB-ES/roberta-base-bne                    : 0.3990 0.8373 0.219590 0.1924
PlanTL-GOB-ES

# Portuguese

In [30]:


# Models evaluated on Dataset.TSAR_PT with Portuguese stopword filtering.
models = [
    'neuralmind/bert-base-portuguese-cased',
    'neuralmind/bert-large-portuguese-cased',
    'rdenadai/BR_BERTo',
    'josu/roberta-pt-br',
    'xlm-roberta-large',
    'xlm-roberta-base',
    'bert-base-multilingual-cased',
    # Disabled:
    # 'bert-base-multilingual-uncased',
    # 'facebook/xlm-roberta-xl',
    # 'flax-community/alberti-bert-base-multilingual-cased',
]
# One full run (and one score table) per candidate-list size.
for top_k in [5, 10, 15, 20, 30, 40, 50]:
    run_experiments(models, Dataset.TSAR_PT, 'portuguese', top_k=top_k)


Top k: 5
ACC1	Potential	Recall	Precision
neuralmind/bert-large-portuguese-cased            : 0.2850 0.7409 0.169373 0.2440
neuralmind/bert-base-portuguese-cased             : 0.2642 0.7358 0.172531 0.2518
xlm-roberta-large                                 : 0.3057 0.5725 0.119594 0.1762
xlm-roberta-base                                  : 0.2513 0.5104 0.101851 0.1534
rdenadai/BR_BERTo                                 : 0.1503 0.3834 0.065692 0.0974
josu/roberta-pt-br                                : 0.0337 0.3368 0.062257 0.0860
bert-base-multilingual-cased                      : 0.1528 0.3161 0.054858 0.0788

Top k: 10
ACC1	Potential	Recall	Precision
neuralmind/bert-large-portuguese-cased            : 0.2850 0.8394 0.240175 0.1767
neuralmind/bert-base-portuguese-cased             : 0.2642 0.8109 0.234283 0.1772
xlm-roberta-large                                 : 0.3057 0.6347 0.156321 0.1171
xlm-roberta-base                                  : 0.2513 0.5959 0.134424 0.1031
rdenadai/BR_B

# Other English datasets (LexMTurk, NNSeval, BenchLS)

In [31]:

# Models evaluated on the LexMTurk, NNSeval and BenchLS datasets with English
# stopword filtering.
models = [
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-base-cased',
    'bert-large-cased',
    'xlm-roberta-large',
    'albert-base-v2',
    'roberta-base',
    'roberta-large',
    'distilbert-base-uncased',
    # Disabled: 'bert-base-multilingual-cased',
]
# Every dataset is scored at every candidate-list size.
for dataset in [Dataset.LexMTurk, Dataset.NNSeval, Dataset.BenchLS]:
    for top_k in [5, 10, 15, 20, 30, 40, 50]:
        run_experiments(models, dataset, 'english', top_k=top_k)


Top k: 5
ACC1	Potential	Recall	Precision
bert-large-cased                                  : 0.1340 0.9280 0.218528 0.3748
bert-large-uncased                                : 0.1420 0.9260 0.211691 0.3684
bert-base-uncased                                 : 0.1080 0.9220 0.210670 0.3652
bert-base-cased                                   : 0.1400 0.9060 0.215749 0.3776
xlm-roberta-large                                 : 0.4460 0.8960 0.208994 0.3748
distilbert-base-uncased                           : 0.1360 0.8860 0.192959 0.3364
roberta-base                                      : 0.0820 0.8740 0.181617 0.3156
albert-base-v2                                    : 0.0700 0.8380 0.172271 0.2856
roberta-large                                     : 0.0820 0.7220 0.129208 0.2260

Top k: 10
ACC1	Potential	Recall	Precision
bert-large-cased                                  : 0.1340 0.9740 0.319136 0.2998
bert-base-uncased                                 : 0.1080 0.9720 0.319357 0.3006
bert-large-un