# **Language Detector Training**

This notebook trains and tests two models on two different datasets: the first is a high-resource language detection dataset, and the second is a low-resource language detection dataset.


In [None]:
# Mount Google Drive inside the Colab runtime so that files stored under
# /content/drive (e.g. saved model weights) can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages used by the high-resource part of the notebook.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install -q datasets transformers[sentencepiece] langid watermark
!pip install accelerate -U
# !nvidia-smi
# Record the versions of the main libraries that were active for this run.
%load_ext watermark
%watermark -p torch,datasets,sklearn,transformers,langid

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langid (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [None]:
import time
from pathlib import Path

import langid
import torch
from datasets import load_dataset
# from sklearn.metrics import f1_score, precision, recall, accuracy_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    pipeline,
    Trainer,
    TrainingArguments
)

from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
import torch.nn.functional as F

from tqdm import tqdm

gdrive_dir = Path('./')

### Get and split the high resources dataset
I used the `papluca/language-identification` dataset, which contains 90k samples in 20 different high-resource languages.

In [None]:
# Fetch the corpus from the Hugging Face Hub and bind each predefined split
# to its own name for the cells below.
dataset = load_dataset("papluca/language-identification")
ds_train, ds_valid, ds_test = (
    dataset[split_key] for split_key in ('train', 'validation', 'test')
)

# Report the number of samples per split.
print(f"Train / valid / test samples: {len(ds_train)} / {len(ds_valid)} / {len(ds_test)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.69M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train / valid / test samples: 70000 / 10000 / 10000


## Tokenization

I truncate every sentence longer than 128 tokens to the 128-token limit.
Then I map each label to its class index and print some statistics.

In [None]:
# Name of the pretrained checkpoint whose tokenizer is used to encode the text.
model_ckpt = "xlm-roberta-base"
# Module-level tokenizer; `tokenize_text` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
def tokenize_text(sequence, max_length=128):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Every sample is truncated and padded to exactly ``max_length`` tokens.
    Padding to a fixed length (instead of "longest in this call") makes the
    row length independent of how ``Dataset.map`` groups samples, so rows from
    different map batches can always be stacked into one tensor later.

    Args:
        sequence: Mapping with a ``"text"`` entry (a string or a list of strings).
        max_length: Fixed number of tokens per encoded sample.

    Returns:
        The tokenizer output for the given text.
    """
    return tokenizer(
        sequence["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )

In [None]:
def encode_labels(example):
    """Replace the string label of one sample with its integer class id.

    The lookup uses the module-level ``label2id`` table. The sample is
    modified in place and also returned, as ``Dataset.map`` expects.
    """
    label_code = example["labels"]
    example["labels"] = label2id[label_code]
    return example

In [None]:
# Language code -> integer class id. The id of a code is its position in the
# tuple below, so the order must not be changed.
label2id = {
    code: class_id
    for class_id, code in enumerate((
        'ja', 'nl', 'ar', 'pl', 'de',
        'it', 'pt', 'tr', 'es', 'hi',
        'el', 'ur', 'bg', 'en', 'fr',
        'zh', 'ru', 'th', 'sw', 'vi',
    ))
}

In [None]:
# Reverse lookup table: integer class id -> language code.
id2label = dict(zip(label2id.values(), label2id.keys()))
id2label

{0: 'ja',
 1: 'nl',
 2: 'ar',
 3: 'pl',
 4: 'de',
 5: 'it',
 6: 'pt',
 7: 'tr',
 8: 'es',
 9: 'hi',
 10: 'el',
 11: 'ur',
 12: 'bg',
 13: 'en',
 14: 'fr',
 15: 'zh',
 16: 'ru',
 17: 'th',
 18: 'sw',
 19: 'vi'}

In [None]:
# Tokenize the three splits in batched mode (train first, then validation, then test).
tok_train, tok_valid, tok_test = (
    split.map(tokenize_text, batched=True)
    for split in (ds_train, ds_valid, ds_test)
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Convert the string labels of every split to integer class ids, one sample at a time.
tok_train, tok_valid, tok_test = (
    split.map(encode_labels, batched=False)
    for split in (tok_train, tok_valid, tok_test)
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from statistics import mean, stdev

# Number of token ids stored for each training sample.
_len = list(map(len, tok_train['input_ids']))

avg_len = mean(_len)
std_len = stdev(_len)
min_len = min(_len)
max_len = max(_len)

# Summary of the token-length distribution of the training split.
print('-'*10 + ' Corpus statistics ' + '-'*10)
print(f'\nAvg. length: {avg_len:.1f} (std. {std_len:.1f})')
print('Min. length:', min_len)
print('Max. length:', max_len)

---------- Corpus statistics ----------

Avg. length: 128.0 (std. 0.0)
Min. length: 128
Max. length: 128


In [None]:
def compute_metrics(pred, labels):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Predicted class ids, one per sample.
        labels: Ground-truth class ids, aligned with ``pred``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (support-weighted F1).
    """
    # Imported here because the notebook's top-level sklearn.metrics import of
    # these two functions is commented out.
    from sklearn.metrics import accuracy_score, f1_score

    return {
        "accuracy": accuracy_score(labels, pred),
        "f1": f1_score(labels, pred, average="weighted"),
    }

# **High resources**

## Model
I used an XLM-RoBERTa transformer to obtain the sentence embedding and a `Classifier` module with two linear layers to classify the sentence.

In [None]:
class Classifier(nn.Module):
    """Classification head: linear layer -> dropout -> linear projection to class logits.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        dropout: Dropout probability applied to the output of the first linear layer.
        hidden_size: Width of the input embedding and of the first linear layer.
            Defaults to 768, the value that was previously hard-coded.
    """

    def __init__(self, num_classes, dropout=0.1, hidden_size=768):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the class logits for embeddings ``x`` whose last dimension is ``hidden_size``."""
        hidden = self.dropout(self.dense(x))
        return self.out_proj(hidden)

In [None]:
class TextClassifier(nn.Module):
    """Pretrained encoder followed by a ``Classifier`` head.

    Args:
        language_model: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_classes: Number of target classes.
        dropout: Dropout probability forwarded to the classification head
            (previously accepted but silently ignored).
    """

    def __init__(self, language_model, num_classes, dropout=0.1):
        super().__init__()

        self.roberta = AutoModel.from_pretrained(language_model, add_pooling_layer=False)
        self.classifier = Classifier(num_classes, dropout=dropout)

        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask, labels = None, compute_predictions = False, compute_loss = False):
        """Run the encoder and the head on one batch.

        Args:
            input_ids: Token ids of the batch.
            attention_mask: Attention mask matching ``input_ids``.
            labels: Optional target class ids, only used to compute the loss.
            compute_predictions: When True, add the argmax class under ``"predictions"``.
            compute_loss: When True and ``labels`` is given, add the loss under ``"loss"``.

        Returns:
            dict that always contains ``"logits"`` and, on request,
            ``"predictions"`` and ``"loss"``.
        """
        encoder_output = self.roberta(input_ids = input_ids, attention_mask = attention_mask)
        # The hidden state at sequence position 0 is used as the sentence representation.
        pooled_output = encoder_output.last_hidden_state[:, 0, :]

        logits = self.classifier(pooled_output)
        output = {"logits": logits}

        if compute_predictions:
            output["predictions"] = logits.argmax(dim=-1)

        if compute_loss and labels is not None:
            output["loss"] = self.compute_loss(logits, labels)

        return output

    def compute_loss(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Cross-entropy between ``logits`` and ``labels``; targets equal to -100 are ignored."""
        return F.cross_entropy(
            logits.view(-1, self.num_classes),
            labels.view(-1).to(torch.long),
            ignore_index=-100,
        )

## Hyperparameters

In [None]:
#hyperparameters
class hypers:
    """Run settings for the high-resource experiment, read as plain class attributes."""

    # Checkpoint saving / loading switches and their target paths.
    save_model, save_model_path = False, ""
    load_model, load_model_path = False, ""

    # Model dimensions.
    input_size = 768
    embedding_dim = 768
    num_classes = 20

    # Optimisation and logging.
    learning_rate = 0.001
    epochs = 1
    batch_size = 16
    print_step = 100  # the training loop refreshes its progress-bar loss every `print_step` batches

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'


## Dataloader

In [None]:
def collate_fn(data):
    """Turn a list of per-sample dicts into one dict of per-field lists.

    Only the four fields used downstream are kept; every other key of the
    samples is dropped. Values are left as Python lists (no tensor conversion).
    """
    kept_fields = ('labels', 'text', 'input_ids', 'attention_mask')
    return {field: [sample[field] for sample in data] for field in kept_fields}

In [None]:
# Settings shared by the three loaders: same batch size, dataset order kept,
# and samples merged by `collate_fn`.
# NOTE(review): the training loader is not shuffled either; this is only harmless
# if the training split is already in random order - confirm.
_shared_loader_args = dict(
    batch_size=hypers.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

dataloaderTest = DataLoader(tok_test, **_shared_loader_args)

dataloaderTrain = DataLoader(tok_train, **_shared_loader_args)

dataloaderValid = DataLoader(tok_valid, **_shared_loader_args)




## Train

In [None]:
# Build the classifier, restore previously saved weights and create the optimizer.
language_model = "xlm-roberta-base"
model_weights_path = "/content/drive/MyDrive/Magistrale/Natural Language Processing/NLP-HW-BONUS/model_params.pth"

my_text_classifier = TextClassifier(language_model, 20).to(hypers.device)
# map_location remaps the stored tensors onto the device in use, so the checkpoint
# also loads on a CPU-only runtime when it was saved from GPU tensors.
my_text_classifier.load_state_dict(torch.load(model_weights_path, map_location=hypers.device))

optimizer = torch.optim.Adam(my_text_classifier.parameters(), lr=hypers.learning_rate)

Some weights of XLMRobertaModel were not initialized from the model checkpoint at papluca/xlm-roberta-base-language-detection and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop: iterates over `dataloaderTrain` for `hypers.epochs` epochs and
# stores the mean loss of every epoch in `losses`.
from tqdm import tqdm
my_text_classifier.train()

num_batches = len(dataloaderTrain)
losses = []  # one entry per epoch: the average batch loss of that epoch

bar_tqdm_epoch = tqdm(range(hypers.epochs), total = hypers.epochs, position = 1, leave = True)
for epoch in bar_tqdm_epoch:
  loss_mean = 0  # running sum of the batch losses; divided by num_batches after the epoch

  bar_tqdm = tqdm(enumerate(dataloaderTrain), total = num_batches, position = 0, leave = False)
  for i, data in bar_tqdm:

    optimizer.zero_grad()
    torch.cuda.empty_cache()

    text = data['text']
    labels = data['labels']

    # `collate_fn` yields plain Python lists, so they are converted to tensors
    # and moved to the target device here. The two boolean entries are forwarded
    # as keyword arguments to the model's forward().
    batch = {
        'input_ids': torch.tensor(data['input_ids']).to(hypers.device),
        'attention_mask': torch.tensor(data['attention_mask']).to(hypers.device),
        'labels':  torch.tensor(data['labels']).to(hypers.device),
        'compute_loss': True,
        'compute_predictions': False
    }

    outputs = my_text_classifier(**batch)
    loss = outputs["loss"]
    loss.backward()
    optimizer.step()

    loss_mean += loss.item()

    # Refresh the loss shown in the progress bar only every `print_step` batches.
    if i % hypers.print_step == 0:
      bar_tqdm.set_postfix({'loss': loss.item()})

    # if i == 0:
    #   break

  losses.append(loss_mean / num_batches)
  bar_tqdm_epoch.set_postfix({'loss': losses[-1]})

In [None]:
# #Evaluation on Validation set
# my_text_classifier.eval()
# for i, data in enumerate(dataloaderValid):
#   text = data['text']
#   labels = data['labels']

#   batch = {
#       'input_ids': torch.tensor(data['input_ids']).to(hypers.device),
#       'attention_mask': torch.tensor(data['attention_mask']).to(hypers.device)
#   }

#   with torch.no_grad():
#     outputs = my_text_classifier(**batch)
#   logits = outputs['logits']
#   print(logits.shape)

#   if i == 0:
#     break

torch.Size([16, 20])


## Test

In [None]:
# Rebuild the classifier and restore the previously trained parameters for testing.
language_model = "xlm-roberta-base"

model_weights_path = "/content/drive/MyDrive/Magistrale/Natural Language Processing/NLP-HW-BONUS/model_params.pth"
my_text_classifier = TextClassifier(language_model, 20).to(hypers.device)
# map_location remaps the stored tensors onto the device in use, so the checkpoint
# also loads on a CPU-only runtime when it was saved from GPU tensors.
my_text_classifier.load_state_dict(torch.load(model_weights_path, map_location=hypers.device))


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

<All keys matched successfully>

In [None]:
# Smoke test on the test split: runs the model on the FIRST batch only (see the
# `break` at the end) and prints its predictions, labels and loss.
my_text_classifier.eval()

num_batches = len(dataloaderTest)
bar_tqdm = tqdm(enumerate(dataloaderTest), total = num_batches, position = 0, leave = True)

for i, data in bar_tqdm:

  # Convert the collated Python lists to tensors on the target device; the two
  # boolean entries are forwarded as keyword arguments to the model's forward().
  batch = {
        'input_ids': torch.tensor(data['input_ids']).to(hypers.device),
        'attention_mask': torch.tensor(data['attention_mask']).to(hypers.device),
        'labels':  torch.tensor(data['labels']).to(hypers.device),
        'compute_loss': True,
        'compute_predictions': True
  }


  # No gradients are needed for evaluation.
  with torch.no_grad():
    outputs = my_text_classifier(**batch)

  print(outputs['predictions'])
  print(batch['labels'])
  print(outputs['loss'])

  del batch

  # Stop after the first batch: no metric over the whole test split is computed here.
  if i == 0:
    break

  0%|          | 0/625 [00:00<?, ?it/s]

tensor([ 1,  1,  8,  5,  2, 16,  7,  1, 14,  8, 18, 10,  8,  3,  2,  0],
       device='cuda:0')
tensor([ 1,  1,  8,  5,  2, 16,  7,  1, 14,  8,  5, 10,  8,  3,  2,  0],
       device='cuda:0')


  0%|          | 0/625 [00:01<?, ?it/s]

tensor(0.3457, device='cuda:0')





# **Low Resources**
I created a custom dataset for low-resource language detection: a collection of sentences retrieved from Koran and Bible translations.
The dataset comprises 38 different languages.

## Import

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Install the extra packages used by the low-resource part of the notebook.
# NOTE(review): beautifulsoup4 is imported below (`from bs4 import BeautifulSoup`) but its
# install line is commented out, so it must already be present in the runtime - confirm.
# !pip install beautifulsoup4
!pip install -U sentence-transformers
!pip install langcodes
!pip install language_data
!pip install -q pytorch-lightning
# gdown is the only package pinned to an exact version here.
!pip install gdown==4.6.0

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/132.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m112.6/132.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1
Collecting language_data
  Downloading language_data-1.1-py3-none-any.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting marisa-trie<0.8.0,>=0.7.7 (from language_data)
  Downloading

In [None]:
# Download the pre-built CSV datasets from Google Drive by file id.
# Per the recorded output the three files are, in order:
# nlp_hw4_dataset_corano_reduced.csv, nlp_hw4_dataset_complete.csv and language_dataset.csv.
!gdown 1PftiUKsWN_gOwAWMcUtXUWgACzytZTqH
!gdown 1L1FnX0FYxFUrI3nkICHd7EFOT8wFB00W
!gdown 1kE8fLb2_296FWQ1ZH1Oq2RNsQT293ilX

Downloading...
From: https://drive.google.com/uc?id=1PftiUKsWN_gOwAWMcUtXUWgACzytZTqH
To: /content/nlp_hw4_dataset_corano_reduced.csv
100% 14.7M/14.7M [00:00<00:00, 125MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1L1FnX0FYxFUrI3nkICHd7EFOT8wFB00W
To: /content/nlp_hw4_dataset_complete.csv
100% 26.5M/26.5M [00:00<00:00, 118MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kE8fLb2_296FWQ1ZH1Oq2RNsQT293ilX
To: /content/language_dataset.csv
100% 18.2M/18.2M [00:00<00:00, 62.0MB/s]


In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
from langcodes import Language
from torch.utils.data import Dataset, DataLoader, random_split

import pytorch_lightning as pl
from pytorch_lightning import Trainer
import torchmetrics
from sklearn.metrics import classification_report

import torch.nn.functional as F

SEED: int = 42
pl.seed_everything(SEED)
# import wandb
# from pytorch_lightning.loggers import WandbLogger

from typing import Any, Tuple
from typing import Dict, Iterator, List, Union, Optional

from tqdm import tqdm

import zipfile

import gzip
import shutil

import xml.etree.ElementTree as ET

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    pipeline,
    Trainer,
    TrainingArguments
)

from torch import nn

INFO:lightning_fabric.utilities.seed:Seed set to 42


## Dataset creation
In this section I create the custom low-resource language detection dataset.

In [None]:
def extract_sentences(body):
    '''
    Extract the target-language sentences from the body of a fetched sample page.

    The children of ``body`` are rendered to text, the part after the second
    '<p>' marker is kept, and every line of the form ``(trg)=<id> <sentence>``
    contributes one sentence: the first space-separated token after '=' is
    dropped and '<br/>' markers are removed.

    Args:
        body: Iterable of page nodes (e.g. ``soup.body``) whose text rendering
            contains the aligned sample lines.

    Returns:
        list[str]: the extracted sentences, in page order.

    Raises:
        IndexError: if the rendered body contains fewer than two '<p>' markers.
    '''
    markup = str(list(body)).split('<p>')[2]
    markup = re.sub(r'<hr/>', '', markup)

    sentences = []
    for line in markup.split('\n'):
        # Split on the FIRST '=' only, so a sentence that itself contains '=' is
        # kept whole; lines without '=' are skipped instead of raising.
        tag, separator, remainder = line.partition('=')
        if tag != '(trg)' or not separator:
            continue
        # Drop the leading id token, keep the rest of the line as the sentence.
        words = remainder.split(' ')[1:]
        sentences.append(' '.join(words).replace('<br/>', ''))
    return sentences

In [None]:
# All the languages used, with their codes and display names.
# Some languages were removed from the initial dataset because they had too few samples
# or were already included in the high-resource dataset.

# Tanzil corpus: language codes and, at the same positions, their English names.
language_list = ['es',	'fa',	'fr',	'ha',	'hi',	'id',	'it',	'ja',	'ko',	'ku','ml','ms','nl','no','pl'	,'pt'	,'ro','ru',	'sd',	'so',	'sq',	'sv',	'sw',	'ta',	'tg','th',	'tr',	'tt',	'ug',	'ur',	'uz',	'zh']
language_name = ['Spanish', 'Persian', 'French', 'Hausa', 'Hindi', 'Indonesian', 'Italian', 'Japanese', 'Korean', 'Kurdish', 'Malayalam', 'Malay', 'Dutch', 'Norwegian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Sindhi', 'Somali', 'Albanian', 'Swedish', 'Swahili', 'Tamil', 'Tajik', 'Thai', 'Turkish', 'Tatar', 'Uyghur', 'Urdu', 'Uzbek', 'Chinese']

# NOTE(review): the two names below look swapped relative to the pair above -
# `language_name_removed` holds codes while `language_list_removed` holds names.
language_name_removed = ['fa','ha','hi','ja','ko','ku','ml','ru','sd','so','sw','ta','tg','th','tt','ug','uz','zh']
language_list_removed = ['Persian','Hausa','Hindi','Japanese','Korean','Kurdish','Malayalam','Russian','Sindhi','Somali','Swahili','Tamil','Tajik','Thai','Tatar','Uyghur','Uzbek','Chinese']



# bible-uedin corpus: language codes and, at the same positions, their English names.
bible_language_symbol = ['it', 'fi', 'et', 'eu', 'gd', 'hr', 'hu', 'hy', 'is', 'la', 'lt', 'lv', 'mi', 'mr', 'my', 'ne', 'pes', 'pot', 'sk', 'sl', 'sn', 'sr', 'ss', 'syr', 'te', 'tl', 'tmh', 'uk', 'usp', 'vi', 'wal', 'wo', 'xh', 'zu']
bible_language_name = ['Italian', 'Finnish', 'Estonian', 'Basque', 'Scottish Gaelic', 'Croatian', 'Hungarian', 'Armenian', 'Icelandic', 'Latin', 'Lithuanian', 'Latvian', 'Māori', 'Marathi', 'Burmese', 'Nepali', 'Iranian Persian', 'Potawatomi', 'Slovak', 'Slovenian', 'Shona', 'Serbian', 'Swati', 'Syriac', 'Telugu', 'Filipino', 'Tamashek', 'Ukrainian', 'Uspanteco', 'Vietnamese', 'Wolaytta', 'Wolof', 'Xhosa', 'Zulu']



# for l_sym in language_list:
#   language_name.append(Language.get(l_sym).language_name())
# print(language_name)

In [None]:
# Initialise the corpus DataFrame with a single hand-written Italian sample;
# the scraping cells below append their rows to it.
df = pd.DataFrame([{
    'idx': 0,
    'sentence': 'Ciao a tutti io mi chiamo Lorenzo.',
    'language': 'Italian',
    'language_symbol': 'it',
}])

df.head()

Unnamed: 0,idx,sentence,language,language_symbol
0,0,Ciao a tutti io mi chiamo Lorenzo.,Italian,it


### samples

In [None]:
# Scrape the Tanzil sample page of every language and append its sentences to `df`.
for i, sym in enumerate(language_list):
  url = f'https://opus.nlpl.eu/Tanzil/v1/en-{sym}_sample.html'
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  body = soup.body

  try:
    sentences = extract_sentences(body)
  except Exception:
    # Best effort: report the page that could not be parsed and move on.
    # `Exception` (not a bare except) lets KeyboardInterrupt still stop the cell.
    print(f'language: {language_name[i]}, url: {url}, body: {body}')
    continue
  # print(sentences)
  # The sentence at position 7 is discarded.
  # NOTE(review): the reason is not visible in the notebook - confirm it is still needed.
  # The guard avoids an IndexError on pages with fewer than 8 sentences.
  if len(sentences) > 7:
    sentences.pop(7)

  # Build the rows for this language, continuing the running `idx` counter.
  new_entries = pd.DataFrame({
      'idx': range(len(df), len(df) + len(sentences)),
      'sentence': sentences,
      'language': [language_name[i]] * len(sentences),
      'language_symbol': [sym] * len(sentences)
  })

  # Append them to the existing DataFrame.
  df = pd.concat([df, new_entries], ignore_index=True)

In [None]:
# Scrape the bible-uedin sample page of every language and append its sentences to `df`.
for i, sym in enumerate(bible_language_symbol):
  url = f'https://opus.nlpl.eu/bible-uedin/v1/en-{sym}_sample.html'
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  body = soup.body

  try:
    sentences = extract_sentences(body)
  except Exception:
    # Best effort: report the page that could not be parsed and move on.
    # `Exception` (not a bare except) lets KeyboardInterrupt still stop the cell.
    print(f'language: {bible_language_name[i]}, url: {url}, body: {body}')
    continue
  # print(sentences)

  # Build the rows for this language, continuing the running `idx` counter.
  new_entries = pd.DataFrame({
      'idx': range(len(df), len(df) + len(sentences)),
      'sentence': sentences,
      'language': [bible_language_name[i]] * len(sentences),
      'language_symbol': [sym] * len(sentences)
  })

  # Append them to the existing DataFrame.
  df = pd.concat([df, new_entries], ignore_index=True)

### xmls

In [None]:
# Download the Tanzil TMX file of every Tanzil language and append up to 10000
# sentences per language to `df`.
# The loop runs over `language_list` (the Tanzil codes): both the URL and the
# `language_name[i]` lookup below belong to that list, whereas iterating the
# bible codes mislabelled the rows and ran past the end of `language_name`.
for i, sym in tqdm(enumerate(language_list), total=len(language_list)):

  url = f'https://opus.nlpl.eu/download.php?f=Tanzil/v1/tmx/en-{sym}.tmx.gz'
  response = requests.get(url)
  response.raise_for_status()  # fail on the HTTP error itself rather than on a corrupt archive
  with open('shit.gz', 'wb') as f:
    f.write(response.content)

  with gzip.open('shit.gz', 'rb') as file_in:

    with open('shit.tmx', 'wb') as file_out:
            # Copy the decompressed content into the destination file
            shutil.copyfileobj(file_in, file_out)

  tree = ET.parse('shit.tmx')
  root = tree.getroot()

  # Keep the segments whose xml:lang attribute equals the current language code
  sentences = [tuv.find('seg').text for tu in root.findall('.//tu') for tuv in tu.findall('tuv') if tuv.get('{http://www.w3.org/XML/1998/namespace}lang') == sym]
  sentences = sentences[:10000]

  # Build the rows for this language, continuing the running `idx` counter
  new_entries = pd.DataFrame({
      'idx': range(len(df), len(df) + len(sentences)),
      'sentence': sentences,
      'language': [language_name[i]] * len(sentences),
      'language_symbol': [sym] * len(sentences)
  })

  # Append them to the existing DataFrame
  df = pd.concat([df, new_entries], ignore_index=True)

In [None]:


import io

# Download the bible-uedin Moses archive of every language and append up to
# 10000 sentences per language to `df`.
# The archive is read directly from the response: the previous version saved it
# as 'shit.zip' but then parsed the stale 'shit.gz' left by the Tanzil cell and
# labelled the rows with Tanzil language names.
# NOTE(review): the URL uses '{sym}-en' while the txt cell below uses 'en-{sym}' -
# confirm which ordering the server expects for each language.
for i, sym in tqdm(enumerate(bible_language_symbol), total=len(bible_language_symbol)):

  url = f'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/{sym}-en.txt.zip'
  response = requests.get(url)
  response.raise_for_status()

  with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
    # The plain-text file of the target language is the member whose name ends with '.<code>'.
    member = next((name for name in archive.namelist() if name.endswith(f'.{sym}')), None)
    if member is None:
      raise FileNotFoundError(f'no member ending in .{sym} found in {url}')
    with archive.open(member) as raw:
      sentences = [line.rstrip('\n') for line in io.TextIOWrapper(raw, encoding='utf-8')]

  sentences = sentences[:10000]

  # Build the rows for this language, continuing the running `idx` counter
  new_entries = pd.DataFrame({
      'idx': range(len(df), len(df) + len(sentences)),
      'sentence': sentences,
      'language': [bible_language_name[i]] * len(sentences),
      'language_symbol': [sym] * len(sentences)
  })

  # Append them to the existing DataFrame
  df = pd.concat([df, new_entries], ignore_index=True)

### txt

In [None]:
import requests
import zipfile
import io

# Code -> display name, so a row is labelled by its own language code and not by
# its position in `bible_removed` (which need not line up with `bible_language_name`).
bible_symbol_to_name = dict(zip(bible_language_symbol, bible_language_name))

# NOTE(review): `bible_removed` is not defined in the visible part of the notebook;
# presumably a filtered copy of `bible_language_symbol` - confirm.
for i, sym in tqdm(enumerate(bible_removed), total=len(bible_removed)):
  # URL of the ZIP archive
  url = f'https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-{sym}.txt.zip'

  # Fetch the archive
  response = requests.get(url)
  response.raise_for_status()  # raise if the request did not succeed

  # Wrap the downloaded bytes in an in-memory file object
  file_zip = io.BytesIO(response.content)

  # Extract the archive
  with zipfile.ZipFile(file_zip, 'r') as zip_ref:
      zip_ref.extractall('/content/shit')

  # Read from the directory the archive was extracted to (the relative 'shit/...'
  # path only worked when the working directory was /content); UTF-8 is explicit
  # because the text is multilingual.
  with open(f'/content/shit/bible-uedin.en-{sym}.{sym}', 'r', encoding='utf-8') as f:
    lines = f.readlines()

  sentences = [s.rstrip('\n') for s in lines]
  sentences = sentences[:2000]

  # Fall back to the positional lookup only for a code missing from the table.
  language_label = bible_symbol_to_name.get(sym) or bible_language_name[i]

  # Build the rows for this language, continuing the running `idx` counter
  new_entries = pd.DataFrame({
      'idx': range(len(df), len(df) + len(sentences)),
      'sentence': sentences,
      'language': [language_label] * len(sentences),
      'language_symbol': [sym] * len(sentences)
  })

  # Append them to the existing DataFrame
  df = pd.concat([df, new_entries], ignore_index=True)

100%|██████████| 32/32 [01:20<00:00,  2.52s/it]


In [None]:
# Drop every row whose language code is 'gd'.
df = df.loc[df['language_symbol'].ne('gd')]


In [None]:
# Renumber the rows after the filtering above.
# NOTE(review): without drop=True the previous index is kept as a new 'index' column,
# and running this cell twice fails because that column already exists.
df = df.reset_index()

### Results

In [None]:
# df.head()
# def sample_n_per_group(subframe):
#     return subframe.sample(n=min(len(subframe), 2000), random_state=1)

# # take only 2000 samples for each language
# df_sampled = df.groupby('language').apply(sample_n_per_group).reset_index(drop=True)
# df = df_sampled
# df.to_csv('nlp_hw4_dataset_complete.csv')


## Dataset import and split
In this section I import a pre-built dataset and split it into training, validation and test sets.

In [None]:
# Load the pre-built dataset downloaded by the gdown cell above.
# This rebinds `df`, replacing any frame built by the scraping cells.
df = pd.read_csv('language_dataset.csv')

In [None]:
# Shuffle the rows once with a fixed seed, then cut the frame into three
# consecutive slices: 60% train, 20% validation, the remainder test.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2

train_size = int(train_ratio * len(df))
val_size = int(val_ratio * len(df))

# The test slice takes every remaining row, so `test_ratio` is informational only.
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:train_size + val_size].reset_index(drop=True)
test_df = df.iloc[train_size + val_size:].reset_index(drop=True)

len(train_df),len(val_df),len(test_df)

(45600, 15200, 15200)

In [None]:
# Distinct language codes of the dataset, in order of first appearance.
languages = list(df['language_symbol'].unique())
num_languages = len(languages)

# Code -> class index and its inverse; a code's index is its position in `languages`.
lan2idx = {lan: idx for idx, lan in enumerate(languages)}
idx2lan = {idx: lan for lan, idx in lan2idx.items()}
print(lan2idx)
print(idx2lan)
print(num_languages)

# Code -> English display name. Built with a comprehension so that no loop
# variable overwrites the module-level `language_name` list used by the
# scraping cells (the previous loop rebound it to a single string).
sym_to_languages = {sym: Language.get(sym).language_name() for sym in lan2idx}

{'zh': 0, 'so': 1, 'mr': 2, 'ja': 3, 'hy': 4, 'uk': 5, 'xh': 6, 'ha': 7, 'te': 8, 'sw': 9, 'la': 10, 'syr': 11, 'tmh': 12, 'sn': 13, 'uz': 14, 'tg': 15, 'mi': 16, 'hu': 17, 'tt': 18, 'ta': 19, 'zu': 20, 'is': 21, 'usp': 22, 'eu': 23, 'et': 24, 'sd': 25, 'my': 26, 'th': 27, 'ko': 28, 'tl': 29, 'wal': 30, 'ss': 31, 'ru': 32, 'hi': 33, 'ml': 34, 'pot': 35, 'ne': 36, 'ug': 37}
{0: 'zh', 1: 'so', 2: 'mr', 3: 'ja', 4: 'hy', 5: 'uk', 6: 'xh', 7: 'ha', 8: 'te', 9: 'sw', 10: 'la', 11: 'syr', 12: 'tmh', 13: 'sn', 14: 'uz', 15: 'tg', 16: 'mi', 17: 'hu', 18: 'tt', 19: 'ta', 20: 'zu', 21: 'is', 22: 'usp', 23: 'eu', 24: 'et', 25: 'sd', 26: 'my', 27: 'th', 28: 'ko', 29: 'tl', 30: 'wal', 31: 'ss', 32: 'ru', 33: 'hi', 34: 'ml', 35: 'pot', 36: 'ne', 37: 'ug'}
38


## Models
In this section I define the model, which is composed of `distiluse-base-multilingual-cased-v2` followed by a classifier.

### distiluse-base-multilingual-cased-v2 + Classifier

In [None]:
import torch
import torch.nn.functional as F

class TextClassifier(pl.LightningModule):
    """Sentence-embedding encoder followed by a three-layer classification head.

    Args:
        hypers: Settings object; ``hypers.batch_size`` is read and passed to ``self.log``.
        num_labels: Number of target classes.
        fine_tune_lm: When False, the parameters of the sentence encoder are frozen.
    """

    def __init__(self, hypers, num_labels: int, fine_tune_lm: bool = True, *args, **kwargs) -> None:
        super().__init__()
        self.num_labels = num_labels
        self.hypers = hypers
        self.batch_size = self.hypers.batch_size

        # Filled by `test_step`, read back through `get_preds_targets`.
        self.preds = []
        self.targets = []

        self.accuracy = torchmetrics.Accuracy(task = 'multiclass', num_classes=num_labels)

        # layers
        self.transformer_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
        # self.transformer_model = SentenceTransformer('sentence-transformers/LaBSE')
        if not fine_tune_lm:
            for param in self.transformer_model.parameters():
                param.requires_grad = False
        self.dropout = torch.nn.Dropout(0.2)
        self.relu = torch.nn.ReLU()
        # The head consumes 512-dimensional sentence embeddings.
        self.fc1 = torch.nn.Linear(512, 512, bias=True)
        self.fc2 = torch.nn.Linear(512, 128, bias=True)
        self.classifier = torch.nn.Linear(128, num_labels, bias=True)

    def forward(
        self,
        sentences = None,
        labels: torch.Tensor = None,
        compute_predictions: bool = False,
        compute_loss: bool = True,
        *args,
        **kwargs,
    ) -> dict:
        """Embed ``sentences`` and classify them.

        Args:
            sentences: Batch of raw sentences passed to the encoder's ``encode``.
            labels: Optional target class ids, only used to compute the loss.
            compute_predictions: When True, add the argmax class under ``"predictions"``.
            compute_loss: When True and ``labels`` is given, add the loss under ``"loss"``.

        Returns:
            dict that always contains ``"logits"`` and, on request,
            ``"predictions"`` and ``"loss"``.
        """
        # NOTE(review): the embeddings are wrapped in a brand-new tensor, so no
        # gradient can flow back into the encoder even when fine_tune_lm=True - confirm intent.
        embeddings = self.transformer_model.encode(sentences)  # batch_size x 512
        # Use the module's own device rather than hypers.device, so the tensor always
        # lands where the head's weights are, whatever accelerator the trainer picked.
        embeddings = torch.tensor(embeddings).to(self.device)
        embeddings = self.dropout(embeddings)

        hidden = self.dropout(self.relu(self.fc1(embeddings)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        logits = self.classifier(hidden)

        output = {"logits": logits}

        if compute_predictions:
            output["predictions"] = logits.argmax(dim=-1)

        if compute_loss and labels is not None:
            output["loss"] = self.compute_loss(logits, labels)

        return output

    def _shared_step(self, batch, stage: str):
        """Run one batch, log ``<stage>_loss`` / ``<stage>_acc`` and return (logits, labels, loss)."""
        logits = self(batch['sentence'])['logits']
        # Place the labels on the same device as the logits.
        labels = torch.as_tensor(batch['labels'], device=logits.device)

        loss = self.compute_loss(logits, labels)
        accuracy = self.accuracy(logits, labels)
        self.log(f'{stage}_loss', loss, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
        self.log(f'{stage}_acc', accuracy, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
        return logits, labels, loss

    def training_step(self, batch, batch_idx):
        """Return the training loss of one batch; loss and accuracy are logged per epoch."""
        _, _, loss = self._shared_step(batch, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and accuracy of one batch."""
        self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        """Log the test loss and accuracy of one batch and store its logits and labels."""
        logits, labels, _ = self._shared_step(batch, 'test')

        # One entry per sample: `preds` receives the per-sample logits rows (not
        # argmax class ids), `targets` the matching labels.
        self.preds += list(logits)
        self.targets += list(labels)

    def compute_loss(
        self, logits: torch.Tensor, labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute the loss of the model.
        Args:
            logits (`torch.Tensor`):
                The logits of the model.
            labels (`torch.Tensor`):
                The labels of the model.
        Returns:
            obj:`torch.Tensor`: The cross-entropy loss; targets equal to -100 are ignored.
        """
        return F.cross_entropy(
            logits.view(-1, self.num_labels),
            labels.view(-1).to(torch.long),
            ignore_index=-100,
        )

    def configure_optimizers(self):
        """Return the optimizer used for training (Adam with a fixed learning rate)."""
        # The optimizer and its parameters can be customised here.
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def get_preds_targets(self):
        """Return the per-sample logits and labels accumulated by ``test_step``."""
        return self.preds, self.targets

In [None]:
# SMALLER VERSION
# import torch
# import torch.nn.functional as F

# class TextClassifier(pl.LightningModule):
#     def __init__(self, hypers, num_labels: int, fine_tune_lm: bool = True, *args, **kwargs) -> None:
#         super().__init__()
#         self.num_labels = num_labels
#         self.hypers = hypers
#         self.batch_size = self.hypers.batch_size

#         self.preds = []
#         self.targets = []

#         self.accuracy = torchmetrics.Accuracy(task = 'multiclass', num_classes=num_labels)

#         # layers
#         self.transformer_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
#         if not fine_tune_lm:
#             for param in self.transformer_model.parameters():
#                 param.requires_grad = False
#         self.dropout = torch.nn.Dropout(0.2)
#         self.relu = torch.nn.ReLU()
#         # self.fc1 = torch.nn.Linear(768, 512, bias=True)
#         # self.fc2 = torch.nn.Linear(512, 128, bias=True)
#         self.classifier = torch.nn.Linear(512, num_labels, bias=True)

#     def forward(
#         self,
#         sentences = None,
#         labels: torch.Tensor = None,
#         compute_predictions: bool = False,
#         compute_loss: bool = True,
#         *args,
#         **kwargs,
#     ) -> torch.Tensor:

#         # tokenized = tokenizer(sentences, return_tensors="pt",
#         #                             padding='longest',
#         #                             is_split_into_words=True)
#         # input_ids = tokenized['input_ids']
#         # attention_mask = tokenized['attention_mask']

#         # transformers_outputs = self.transformer_model.encode(input_ids = input_ids, attention_mask = attention_mask) # batch_size X 768

#         transformers_outputs = self.transformer_model.encode(sentences) # batch_size X 768
#         transformers_outputs = torch.tensor(transformers_outputs).to(self.hypers.device)
#         transformers_outputs = self.dropout(transformers_outputs)

#         # transformers_outputs = self.dropout(self.relu(self.fc1(transformers_outputs)))
#         # transformers_outputs = self.dropout(self.relu(self.fc2(transformers_outputs)))
#         logits = self.classifier(transformers_outputs)

#         output = {"logits": logits}

#         if compute_predictions:
#             predictions = logits.argmax(dim=-1)
#             output["predictions"] = predictions

#         if compute_loss and labels is not None:
#             output["loss"] = self.compute_loss(logits, labels)

#         return output

#     def training_step(self, batch, batch_idx):
#         sentence = batch['sentence']
#         labels = torch.tensor(batch['labels']).to(self.hypers.device)

#         output = self(sentence)
#         predictions = output['logits']

#         loss = self.compute_loss(predictions, labels)
#         accuracy = self.accuracy(predictions, labels)
#         self.log('train_loss', loss, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         self.log('train_acc', accuracy, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         sentence = batch['sentence']
#         labels = torch.tensor(batch['labels']).to(self.hypers.device)

#         output = self(sentence)
#         predictions = output['logits']

#         loss = self.compute_loss(predictions, labels)
#         accuracy = self.accuracy(predictions, labels)
#         self.log('val_loss', loss, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         self.log('val_acc', accuracy, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         # return loss

#     def test_step(self, batch, batch_idx):
#         sentence = batch['sentence']
#         labels = torch.tensor(batch['labels']).to(self.hypers.device)

#         output = self(sentence)
#         predictions = output['logits']

#         self.preds += list(predictions)
#         self.targets += list(labels)

#         loss = self.compute_loss(predictions, labels)
#         accuracy = self.accuracy(predictions, labels)
#         self.log('test_loss', loss, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         self.log('test_acc', accuracy, on_step=False, on_epoch=True, logger=True, batch_size=self.batch_size)
#         # return loss

#     def compute_loss(
#         self, logits: torch.Tensor, labels: torch.Tensor
#     ) -> torch.Tensor:
#         """
#         Compute the loss of the model.
#         Args:
#             logits (`torch.Tensor`):
#                 The logits of the model.
#             labels (`torch.Tensor`):
#                 The labels of the model.
#         Returns:
#             obj:`torch.Tensor`: The loss of the model.
#         """
#         return F.cross_entropy(
#             logits.view(-1, self.num_labels),
#             labels.view(-1).to(torch.long),
#             ignore_index=-100,
#         )

#     def configure_optimizers(self):
#         # You can customize the optimizer and its parameters here
#         return torch.optim.Adam(self.parameters(), lr=1e-3)

#     def get_preds_targets(self):
#       return self.preds, self.targets

## Dataset

In [None]:
class LanDataset(Dataset):
    """
    Map-style dataset of sentences labelled with a language symbol.

    Each row of the input dataframe becomes a dict holding the stripped
    sentence ('sentence'), its integer class index ('label') and the original
    language symbol ('str_label').
    Args:
        df:
            Dataframe with a 'sentence' column of strings and a
            'language_symbol' column.
        vocab_languages:
            Mapping from language symbol to class index.
    Raises:
        KeyError: if a language symbol found in `df` is missing from
            `vocab_languages`.
    """

    def __init__(self,
                 df,
                 vocab_languages):
        self.df = df
        self.vocab_languages = vocab_languages
        # Walk the two needed columns directly: same items as the former
        # `iterrows()` loop, without building a Series object for every row.
        self.data = [
            {
                'sentence': text.strip(),
                'label': int(self.vocab_languages[label]),
                'str_label': label,
            }
            for text, label in zip(df['sentence'], df['language_symbol'])
        ]

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample dict stored at position `index`."""
        return self.data[index]

In [None]:
# Quick check of the dataset class. NOTE: despite its name, `test` wraps the validation dataframe.
test = LanDataset(df=val_df, vocab_languages=lan2idx)

In [None]:
# Display one encoded sample to check the item structure (sentence, label, str_label).
test[15]

{'sentence': 'Մարդն ասաց նրան. «Ի՞նչ է քո անունը»: Նա պատասխանեց նրան՝ Յակոբ:',
 'label': 4,
 'str_label': 'hy'}

## Datamodule
In this section I define the `LightningDataModule` subclass that PyTorch Lightning uses to build the train, validation, and test dataloaders.

### Lan Datamodule

In [None]:
class LanDatamodule(pl.LightningDataModule):
    """
    Lightning datamodule serving the language-detection splits.

    The datasets are built from the notebook-level dataframes `train_df`,
    `val_df` and `test_df`, which must already exist when `setup` runs.
    Args:
        batch_size (`int`):
            Number of samples per batch, used by every dataloader.
        lan2idx (`Dict`):
            Mapping from language symbol to class index, forwarded to
            `LanDataset`.
    """

    def __init__(
        self,
        batch_size : int,
        lan2idx: Dict
        ) -> None:
        super().__init__()

        self.batch_size = batch_size
        self.lan2idx = lan2idx

    def setup(self, stage: Optional[str] = None) -> None:
        """
        Build the datasets needed by the requested stage.

        'fit' builds the train and validation sets, 'validate' only the
        validation set, 'test' only the test set; `None` builds all of them.
        """
        if stage in (None, 'fit'):
            self.train_dataset = LanDataset(train_df, self.lan2idx)
        if stage in (None, 'fit', 'validate'):
            self.validation_dataset = LanDataset(val_df, self.lan2idx)
        if stage in (None, 'test'):
            self.test_dataset = LanDataset(test_df, self.lan2idx)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Shuffled training batches; the last incomplete batch is dropped."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, num_workers=2, collate_fn=self.prepare_batch)

    def val_dataloader(self, *args, **kwargs) -> DataLoader:
        """Validation batches in dataset order."""
        # drop_last=False: evaluation must cover every sample, including the
        # final incomplete batch, otherwise the metrics ignore part of the split.
        return DataLoader(self.validation_dataset, batch_size=self.batch_size, shuffle=False, drop_last=False, num_workers=2, collate_fn=self.prepare_batch)

    def test_dataloader(self, *args, **kwargs) -> DataLoader:
        """Test batches in dataset order."""
        # drop_last=False for the same reason as in `val_dataloader`.
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, drop_last=False, num_workers=2, collate_fn=self.prepare_batch)

    def prepare_batch(self, batch):
        """
        the collate_fn function, prepares the batch by giving a dictionary that will be the input of the forward function of the model

        Args:
            batch: List of items produced by the dataset, each with a
                'sentence' and a 'label' entry.
        Returns:
            Dict with 'sentence' (list of the item sentences) and 'labels'
            (list of the item labels), in batch order.
        """
        prepared_batch = {}

        prepared_batch['sentence'] = [item['sentence'] for item in batch]

        prepared_batch['labels'] = [item['label'] for item in batch]

        return prepared_batch

## Hyperparameters

In [None]:
#hyperparameters
class hypers:
    """Plain namespace of settings, read as class attributes (e.g. `hypers.batch_size`)."""

    # model saving / loading flags and paths (their use is not visible in this cell)
    save_model, save_model_path = False, ''
    load_model, load_model_path = True, ''

    # sizes
    embedding_dim = input_size = 768

    # optimisation and loop settings
    learning_rate = 1e-3
    epochs = 5
    batch_size = 8
    print_step = 100

    # run on the GPU whenever CUDA is available
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

## Init Trainer

In [None]:
# Stop training when 'val_acc' (higher is better) has not improved for 5 consecutive checks.
early_stopping = pl.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=5, verbose=True)

In [None]:
# Train for at most 50 epochs; the early-stopping callback may end the run sooner.
# NOTE(review): `hypers.epochs` (5) is not used here — confirm which value is intended.
trainer = pl.Trainer(callbacks=[early_stopping], max_epochs=50)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Classifier with one output per language in `lan2idx`, built with fine_tune_lm=True.
mymodel = TextClassifier(hypers=hypers, num_labels=len(lan2idx), fine_tune_lm=True)

# Datamodule using the batch size configured in `hypers`.
lan_dm = LanDatamodule(batch_size=hypers.batch_size, lan2idx=lan2idx)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

### Train

In [None]:
# Fit the classifier on the train/validation splits provided by the datamodule.
trainer.fit(mymodel, datamodule=lan_dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type                | Params
----------------------------------------------------------
0 | accuracy          | MulticlassAccuracy  | 0     
1 | transformer_model | SentenceTransformer | 135 M 
2 | dropout           | Dropout             | 0     
3 | relu              | ReLU                | 0     
4 | fc1               | Linear              | 262 K 
5 | fc2               | Linear              | 65.7 K
6 | classifier        | Linear              | 4.9 K 
----------------------------------------------------------
135 M     Trainable params
0         Non-trainable params
135 M     Total params
541.844   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved. New best score: 0.838


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.051 >= min_delta = 0.0. New best score: 0.889


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.010 >= min_delta = 0.0. New best score: 0.898


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.011 >= min_delta = 0.0. New best score: 0.909


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.008 >= min_delta = 0.0. New best score: 0.917


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.003 >= min_delta = 0.0. New best score: 0.920


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.003 >= min_delta = 0.0. New best score: 0.923


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.002 >= min_delta = 0.0. New best score: 0.925


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.002 >= min_delta = 0.0. New best score: 0.927


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.001 >= min_delta = 0.0. New best score: 0.928


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.000 >= min_delta = 0.0. New best score: 0.928


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.003 >= min_delta = 0.0. New best score: 0.932


Validation: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
# NOTE(review): Drive is already imported and mounted in the first cell of the
# notebook; this repeated mount can probably be removed — confirm before deleting.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Save the trainer checkpoint to Google Drive.
# Checkpoint names used by the previously commented-out calls (same folder):
# labse_0.ckpt, low_roberta_0.ckpt, distiluse_0.ckpt, distiluse_filtered_0.ckpt
trainer.save_checkpoint(
    '/content/drive/MyDrive/AI/NLP_HWBONUS/distiluse_bible_corano_filt_5.ckpt'
)

In [None]:
# Rebuild the classifier from a saved checkpoint, with fine_tune_lm=False.
# NOTE(review): this loads '..._filt_0.ckpt' while the save cell writes '..._filt_5.ckpt' — confirm the intended checkpoint.
mymodel = TextClassifier.load_from_checkpoint(
    checkpoint_path='/content/drive/MyDrive/AI/NLP_HWBONUS/distiluse_bible_corano_filt_0.ckpt',
    hypers=hypers,
    num_labels=len(lan2idx),
    fine_tune_lm=False,
)


### Test

In [None]:
# Evaluate the loaded classifier on the test split provided by the datamodule.
trainer.test(mymodel, datamodule=lan_dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.23820599913597107, 'test_acc': 0.9267105460166931}]

## Report

In [None]:
# Logits and gold labels accumulated by the model's `test_step`.
preds, targets = mymodel.get_preds_targets()
# Highest-scoring class index for every sample.
preds = [logits.argmax() for logits in preds]
preds_cpu = [class_idx.cpu().numpy() for class_idx in preds]
targets_cpu = [gold.cpu().numpy() for gold in targets]

# class index -> language symbol -> language name, listed in class-index order
int_vocab = {idx: symbol for symbol, idx in lan2idx.items()}
class_labels = [sym_to_languages[int_vocab[idx]] for idx in range(len(int_vocab))]

report = classification_report(targets_cpu, preds_cpu, target_names=class_labels)
print(report)

              precision    recall  f1-score   support

  Potawatomi       0.99      1.00      1.00       348
      Telugu       1.00      1.00      1.00       382
       Xhosa       0.80      0.92      0.86       373
       Swati       0.93      0.79      0.85       419
       Tajik       0.89      0.99      0.93       430
      Syriac       1.00      1.00      1.00       380
      Nepali       0.96      0.96      0.96       408
    Wolaytta       0.97      0.93      0.95       406
     Chinese       0.80      0.82      0.81       422
     Marathi       0.89      0.92      0.90       409
       Tamil       0.99      0.99      0.99       430
        Zulu       0.91      0.78      0.84       407
    Estonian       0.95      0.88      0.91       396
     Swahili       0.78      0.93      0.85       401
      Sindhi       0.96      0.98      0.97       401
      Korean       0.72      0.91      0.81       398
       Latin       0.99      1.00      1.00       375
    Japanese       0.84    