In [1]:
import pandas as pd
import numpy as np
import torch
import unicodedata
import wandb
from tqdm import tqdm

from sklearn.metrics import classification_report, accuracy_score
from tqdm.notebook import tqdm_notebook
from torch.utils.data import Dataset

# Prefer the GPU when one is available; later cells move the models and tensors to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Language codes in the order used by the trained classifiers:
# position i in this list is the label for class index i.
langs = [
    'ace', 'afr', 'als', 'amh', 'ang', 'ara', 'arg', 'arz', 'asm', 'ast',
    'ava', 'aym', 'azb', 'aze', 'bak', 'bar', 'bcl', 'be-tarask', 'bel', 'ben',
    'bho', 'bjn', 'bod', 'bos', 'bpy', 'bre', 'bul', 'bxr', 'cat', 'cbk',
    'cdo', 'ceb', 'ces', 'che', 'chr', 'chv', 'ckb', 'cor', 'cos', 'crh',
    'csb', 'cym', 'dan', 'deu', 'diq', 'div', 'dsb', 'dty', 'egl', 'ell',
    'eng', 'epo', 'est', 'eus', 'ext', 'fao', 'fas', 'fin', 'fra', 'frp',
    'fry', 'fur', 'gag', 'gla', 'gle', 'glg', 'glk', 'glv', 'grn', 'guj',
    'hak', 'hat', 'hau', 'hbs', 'heb', 'hif', 'hin', 'hrv', 'hsb', 'hun',
    'hye', 'ibo', 'ido', 'ile', 'ilo', 'ina', 'ind', 'isl', 'ita', 'jam',
    'jav', 'jbo', 'jpn', 'kaa', 'kab', 'kan', 'kat', 'kaz', 'kbd', 'khm',
    'kin', 'kir', 'koi', 'kok', 'kom', 'kor', 'krc', 'ksh', 'kur', 'lad',
    'lao', 'lat', 'lav', 'lez', 'lij', 'lim', 'lin', 'lit', 'lmo', 'lrc',
    'ltg', 'ltz', 'lug', 'lzh', 'mai', 'mal', 'map-bms', 'mar', 'mdf', 'mhr',
    'min', 'mkd', 'mlg', 'mlt', 'mon', 'mri', 'mrj', 'msa', 'mwl', 'mya',
    'myv', 'mzn', 'nan', 'nap', 'nav', 'nci', 'nds', 'nds-nl', 'nep', 'new',
    'nld', 'nno', 'nob', 'nrm', 'nso', 'oci', 'olo', 'ori', 'orm', 'oss',
    'pag', 'pam', 'pan', 'pap', 'pcd', 'pdc', 'pfl', 'pnb', 'pol', 'por',
    'pus', 'que', 'roa-tara', 'roh', 'ron', 'rue', 'rup', 'rus', 'sah', 'san',
    'scn', 'sco', 'sgs', 'sin', 'slk', 'slv', 'sme', 'sna', 'snd', 'som',
    'spa', 'sqi', 'srd', 'srn', 'srp', 'stq', 'sun', 'swa', 'swe', 'szl',
    'tam', 'tat', 'tcy', 'tel', 'tet', 'tgk', 'tgl', 'tha', 'ton', 'tsn',
    'tuk', 'tur', 'tyv', 'udm', 'uig', 'ukr', 'urd', 'uzb', 'vec', 'vep',
    'vie', 'vls', 'vol', 'vro', 'war', 'wln', 'wol', 'wuu', 'xho', 'xmf',
    'yid', 'yor', 'zea', 'zh-yue', 'zho',
]

# Lookup tables between language code and class index (both directions).
index_to_language = dict(enumerate(langs))
language_to_index = {code: idx for idx, code in index_to_language.items()}

## Load and preprocess data

In [3]:
# Read the external test set; the next cell expects it to provide `labels` and `text` columns.
df = pd.read_csv('../data/Language-Identification_test.csv')

In [4]:
# Two-letter codes found in the test set -> three-letter codes used in `langs`.
language_map = dict(
    ar="ara", bg="bul", de="deu", el="ell", en="eng",
    es="spa", fr="fra", hi="hin", it="ita", ja="jpn",
    nl="nld", pl="pol", pt="por", ru="rus", sw="swa",
    th="tha", tr="tur", ur="urd", vi="vie", zh="zho",
)

# Alphabetically sorted target codes; used further down to name the report rows.
langs_orderd = sorted(language_map.values())

# Translate the label values, then switch to the column names used in this notebook.
df = df.replace({'labels': language_map}).rename(
    columns={'labels': 'lang', 'text': 'sentence'}
)

In [5]:
# Show the relabelled frame as a sanity check before building the test inputs.
print(df)

     lang                                           sentence
0     nld                    Een man zingt en speelt gitaar.
1     nld  De technologisch geplaatste Nasdaq Composite I...
2     spa  Es muy resistente la parte trasera rígida y lo...
3     ita  "In tanti modi diversi, l'abilità artistica de...
4     ara  منحدر يواجه العديد من النقاشات المتجهه إزاء ال...
...   ...                                                ...
9995  zho                               史料很充分，对岸的很多观点与大陆迥异啊。
9996  tur  Örneğin, teşhis Yunanca bir kelimeden alındı (...
9997  vie  Nếu lite/light chỉ đơn giản là mô tả một đặc t...
9998  bul  Например, една щатска столица, която посетихме...
9999  pol                   Mam dla ciebie kilka propozycji:

[10000 rows x 2 columns]


In [6]:
# Class index of every row's language, in row order.
y_test_id = [language_to_index[code] for code in df["lang"]]

# Raw sentences only; the labels are paired with the embeddings in a later cell.
test = df["sentence"].tolist()

batch_size = 32

# No shuffling, so the embeddings stay aligned with `y_test_id`.
test_dl = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)

## Evaluation and Model Code 

In [7]:
class ClassifierHead(torch.nn.Module):
    """Feed-forward classification head.

    The network is a stack of ``Linear`` layers. Every layer except the last is
    followed by an activation and, when ``dropout_prob`` is non-zero, a
    ``Dropout`` layer. The last layer returns raw scores (no activation).

    Args:
        input_dim: Size of each input vector.
        output_dim: Number of output units.
        hidden_dims: Sizes of the hidden layers. ``None`` or an empty sequence
            yields a single ``Linear`` layer.
        activation: Activation placed after each hidden ``Linear`` layer.
            Either an already-built ``torch.nn.Module`` instance (reused as-is)
            or a zero-argument factory such as the ``torch.nn.ReLU`` class
            (called once per hidden layer).
        dropout_prob: Dropout probability after each hidden activation;
            ``0`` disables dropout.
    """

    def __init__(
        self, input_dim, output_dim, hidden_dims=None, activation=torch.nn.ReLU,
        dropout_prob=0.0
    ):
        super().__init__()
        hidden_dims = [] if hidden_dims is None else list(hidden_dims)

        # Save arguments
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dims = hidden_dims
        self.activation = activation
        self.dropout_prob = dropout_prob

        dims = [input_dim] + hidden_dims + [output_dim]
        last_layer = len(dims) - 2

        # Construct layers. The position of every Linear layer inside the
        # Sequential is kept stable so previously saved state dicts still load.
        layers = []
        for layer_idx, (i_dim, o_dim) in enumerate(zip(dims, dims[1:])):
            layers.append(torch.nn.Linear(i_dim, o_dim))
            if layer_idx == last_layer:
                break  # the output layer gets neither activation nor dropout

            layers.append(self._build_activation())
            if self.dropout_prob:
                layers.append(torch.nn.Dropout(self.dropout_prob))

        self.classifier = torch.nn.Sequential(*layers)

    def _build_activation(self):
        """Return a module for ``self.activation`` (instantiating a class/factory)."""
        if isinstance(self.activation, torch.nn.Module):
            return self.activation
        # A class such as torch.nn.ReLU cannot be placed in a Sequential directly;
        # it has to be instantiated first.
        return self.activation()

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the output scores."""
        return self.classifier(x)

In [8]:
def metadata_collector(sentences, device, features=(), normalize=False):
    """Count, per sentence, how many characters belong to each feature.

    Args:
        sentences: Sequence of strings.
        device: Device on which the resulting tensor is placed.
        features: Sequence of features. A character counts towards
            ``features[j]`` when its ``unicodedata.category`` is ``in`` that
            feature (for example the string ``"Ll"``).
        normalize: When true, each row is divided by its own total so that it
            holds fractions instead of counts. Rows without any counted
            character stay all-zero instead of becoming NaN.

    Returns:
        Tensor of shape ``(len(sentences), len(features))`` on ``device``.
    """
    n_features = len(features)

    # Accumulate in plain Python and build the tensor once; updating a
    # (possibly GPU-resident) tensor element per character is very slow.
    counts = []
    for paragraph in sentences:
        row = [0] * n_features
        for char in paragraph:
            cat = unicodedata.category(char)
            for idx, feature in enumerate(features):
                if cat in feature:
                    row[idx] += 1
        counts.append(row)

    data = (
        torch.tensor(counts, dtype=torch.get_default_dtype())
        .reshape(len(sentences), n_features)
        .to(device)
    )

    if not normalize:
        return data

    # Counts are whole numbers, so clamping the row total to >= 1 only affects
    # all-zero rows, which then stay zero instead of evaluating 0 / 0.
    totals = data.sum(dim=1, keepdim=True).clamp(min=1.0)
    return data / totals

In [9]:
def generate_bert_embeddings(
    model, tokenizer, data_loader, batch_size: int = 4
) -> torch.Tensor:
    """Embed every sentence yielded by ``data_loader`` with ``model``.

    For each batch the sentences are tokenized (padded and truncated), passed
    through the model, and one vector per layer is kept for every sentence:

    * index 0 of ``hidden_states``: the sum over all positions except the
      first, divided by ``sequence_length - 1``;
    * every other index: the vector at the first position ([CLS]).

    Args:
        model: Model whose output exposes ``hidden_states``; it is switched to
            eval mode.
        tokenizer: Callable that turns a list of strings into model inputs.
        data_loader: Iterable yielding batches of sentences.
        batch_size: Unused; batching is determined by ``data_loader``. Kept so
            existing calls keep working.

    Returns:
        Tensor on the CPU with the layers along dimension 0 and the sentences
        (in loader order) along dimension 1.
    """
    model.eval()

    # Send the inputs to wherever the model lives instead of depending on a
    # notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []
    with torch.no_grad():
        # Loop over the sentences in batches
        for sentences_batch in tqdm(data_loader):
            encoded_input = tokenizer(
                list(sentences_batch),
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(model_device)
            hidden_states = model(**encoded_input).hidden_states

            # Keep only the first position of every layer; stacking the full
            # hidden states first would hold all token vectors of all layers.
            per_layer = torch.stack([layer[:, 0, :] for layer in hidden_states])

            # Index 0: average of the token vectors without the first one.
            # NOTE(review): the divisor is the padded length, so padding
            # positions are part of this average. Left as-is so the features
            # stay the same as before - confirm against the training code.
            first_layer = hidden_states[0]
            per_layer[0] = (
                first_layer.sum(dim=1) - first_layer[:, 0, :]
            ) / (first_layer.shape[1] - 1)

            # Store the embeddings
            embeddings.append(per_layer.cpu())

    return torch.concat(embeddings, dim=1)

In [10]:
class EmbeddingsDataset(Dataset):
    """Dataset of (input vector, target) pairs.

    The input vector is the sentence embedding at the requested index,
    concatenated with that index's extra feature vector when
    ``features_embeddings`` is given.
    """

    def __init__(self, bert_embeddings, targets, features_embeddings=None):
        self.targets = targets
        self.bert_embeddings = bert_embeddings
        self.features_embeddings = features_embeddings

    def __len__(self):
        # One item per target.
        return len(self.targets)

    def __getitem__(self, idx):
        extra = self.features_embeddings
        if extra is None:
            parts = (self.bert_embeddings[idx],)
        else:
            parts = (self.bert_embeddings[idx], extra[idx])
        return torch.cat(parts), self.targets[idx]

In [11]:
def evaluate(model, dl, loss_fn):
    """Run ``model`` over every batch of ``dl`` and collect loss and predictions.

    The model is put in eval mode for the duration of the call (so layers such
    as dropout are inactive) and its previous mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to class scores.
        dl: Iterable of ``(x, y)`` batches.
        loss_fn: Callable ``loss_fn(y_hat, y)`` returning a scalar tensor.

    Returns:
        Tuple ``(data_loss, acc, y_pred, y_true)``: the sum of the per-batch
        loss values, the fraction of correct predictions, the predicted class
        indices and the true class indices (note: predictions come first).
    """
    data_loss = 0
    y_true = []
    y_pred = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in tqdm_notebook(dl):
                # Calling the module (not .forward) also runs registered hooks.
                y_hat = model(x)

                data_loss += loss_fn(y_hat, y).item()

                y_true.extend(y.tolist())
                y_pred.extend(y_hat.argmax(dim=1).tolist())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # Calculate accuracy
    acc = np.mean(np.array(y_true) == np.array(y_pred))

    return data_loss, acc, y_pred, y_true

## Load in the Model

In [140]:
# Id of the Weights & Biases run to evaluate: the last part of the run path
# shown on wandb.ai. Change this to take the model from another run.
RUN_ID = "1g90wn05"

path = "nils2/dl4nlp/runs/"
run_path = path + RUN_ID

# Look the run up and keep its configuration for the cells below.
api = wandb.Api()
run = api.run(run_path)
run_args = run.config

print("The following experiment has been selected:", run.name)

The following experiment has been selected: mbert-bbs32-unic


In [141]:
# Show the selected run's configuration; the cells below read their settings from it.
print(run_args)

{'seed': 13331, 'data_dir': 'data/wili-2018-split', 'features': ['Feature.UNICODE_CATEGORY'], 'momentum': 0.9, 'optimizer': 'Optimizer.SGD', 'activation': 'Activation.ReLU', 'batch_size': 1024, 'hidden_dims': [], 'dropout_prob': 0, 'learning_rate': 0.2, 'embeddings_dir': 'embeddings', 'bert_batch_size': 32, 'bert_model_name': 'bert-base-multilingual-cased', 'experiment_name': 'mbert-bbs32-unic', 'embeddings_layer': 12, 'unicode_categories': ['Ll', 'Zs', 'Lu', 'Po', 'Pd', 'Lo', 'Mn', 'Ps', 'Pe', 'Mc']}


In [142]:
# Load the encoder and tokenizer named in the selected run's configuration.
from transformers import AutoModel, AutoTokenizer

model_name = run_args['bert_model_name']

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hidden states are requested because the embedding function reads them.
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
model = model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [143]:
# Build the classifier inputs for the new dataset with the run parameters.

# Honour the run's `normalize` setting instead of computing it and then ignoring it.
# Runs whose config has no such key keep this notebook's previous behaviour
# (the feature counts were always normalized).
# NOTE(review): confirm that runs without the key were trained on normalized counts.
normalize = bool(run_args['normalize']) if 'normalize' in run_args else True
if 'normalize' in run_args and normalize:
    print("Normalization flag was used this run.")

# BERT_DIM ends up as the full classifier input width:
# number of extra features (if any) + width of the sentence embedding.
if run_args['features']:
    print("Features were detected in this run.")
    features = run_args['unicode_categories']
    BERT_DIM = len(features)
    features_embeddings = metadata_collector(test, device, features, normalize=normalize)
else:
    features_embeddings = None
    BERT_DIM = 0

# Get the embeddings from the same layer.
bert_embeddings = generate_bert_embeddings(model, tokenizer, test_dl, batch_size)[run_args['embeddings_layer']]
BERT_DIM += bert_embeddings.shape[-1]

# Every sentence must have exactly one label, otherwise inputs and targets are misaligned.
assert bert_embeddings.shape[0] == len(y_test_id), "embeddings and labels differ in length"

test_loader = torch.utils.data.DataLoader(
    EmbeddingsDataset(
        bert_embeddings.to(device),
        # Build the integer labels directly instead of going through a float tensor.
        torch.tensor(y_test_id, dtype=torch.long, device=device),
        features_embeddings,
    ),
    batch_size=batch_size,
    shuffle=False,
)

Features were detected in this run.


100%|█████████████████████████████████████████| 313/313 [00:18<00:00, 17.23it/s]


In [144]:
# Download the parameters that the selected run saved on WandB.
best_model = wandb.restore('best_model.pth', run_path=run_path, replace=True)

# Rebuild the head with the architecture recorded in the run's config so the
# saved weights fit runs that used hidden layers or dropout as well.
hidden_dims = list(run_args['hidden_dims'] or []) if 'hidden_dims' in run_args else []
dropout_prob = (run_args['dropout_prob'] or 0.0) if 'dropout_prob' in run_args else 0.0
head_kwargs = {'hidden_dims': hidden_dims, 'dropout_prob': dropout_prob}
if hidden_dims and 'activation' in run_args:
    # assumes the config stores the activation as "Activation.<torch.nn class name>"
    # (e.g. 'Activation.ReLU') - TODO confirm against the training code
    activation_name = str(run_args['activation']).split('.')[-1]
    head_kwargs['activation'] = getattr(torch.nn, activation_name)()

LMclassifier = ClassifierHead(BERT_DIM, len(langs), **head_kwargs).to(device)
LMclassifier.eval()  # inference only: keeps dropout (if any) switched off

# `best_model.name` is the local path of the downloaded file.
# NOTE: torch.load unpickles the file; only load checkpoints from runs you trust.
LMclassifier.load_state_dict(torch.load(best_model.name, map_location=device))

<All keys matched successfully>

## Results

In [145]:
# Run the restored classifier over the test embeddings.
# `test_loss` is the sum of the per-batch loss values, not an average.
test_loss, test_accuracy, y_pred, y_true = evaluate(LMclassifier, test_loader, torch.nn.CrossEntropyLoss())

  0%|          | 0/313 [00:00<?, ?it/s]

In [146]:
# Name the report rows from the label ids themselves, so the names cannot drift
# out of step with the order returned by np.unique.
report_labels = np.unique(y_true)
report_names = [index_to_language[int(label)] for label in report_labels]
print(classification_report(y_true, y_pred, labels=report_labels, target_names=report_names, digits=4))

              precision    recall  f1-score   support

         ara     1.0000    0.8540    0.9213       500
         bul     0.9979    0.9640    0.9807       500
         deu     1.0000    0.2440    0.3923       500
         ell     1.0000    0.9940    0.9970       500
         eng     0.9861    0.1420    0.2483       500
         fra     1.0000    0.1980    0.3306       500
         hin     1.0000    0.8820    0.9373       500
         ita     1.0000    0.5500    0.7097       500
         jpn     1.0000    0.9960    0.9980       500
         nld     1.0000    0.7340    0.8466       500
         pol     1.0000    0.9120    0.9540       500
         por     1.0000    0.8800    0.9362       500
         rus     0.9929    0.8340    0.9065       500
         spa     1.0000    0.1520    0.2639       500
         swa     0.9956    0.8960    0.9432       500
         tha     1.0000    0.9440    0.9712       500
         tur     1.0000    0.6880    0.8152       500
         urd     0.9957    

In [147]:
# Overall fraction of test sentences whose language was predicted correctly.
print(accuracy_score(y_true, y_pred))

0.6976


## Print the model's failure modes

In [None]:
# List every test paragraph the classifier got wrong, together with the
# predicted and the actual language.
for i, (pred, true) in enumerate(zip(y_pred, y_true)):
    if pred == true:
        continue

    pred_lang = index_to_language[pred]
    true_lang = index_to_language[true]
    print(f"Paragraph {i} was classified as '{pred_lang}' but is actually '{true_lang}'")
    print(test[i])
    print()