In [None]:
!pip install torch
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 29.1 MB/s 
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-2.1.0.tar.gz (21 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.9 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 49.7 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting sentencepiece==0.1.95


In [None]:
import os

import numpy as np
import pandas as pd
import torch
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Attach Google Drive to the Colab file system so the trained model can be stored there
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)


Mounted at /content/gdrive


In [None]:
# Read the JSON-lines train and eval files and keep only the text and language columns
wanted_columns = ["text", "lang"]

df_train = pd.read_json("./classifier_data_train.json", lines=True)
df_train = df_train[wanted_columns]

df_test = pd.read_json("./classifier_data_eval.json", lines=True)
df_test = df_test[wanted_columns]

print(f"Original overall dataset length: {len(df_train)+len(df_test)}")

Original overall dataset length: 43958


In [None]:
# Stack train and test on a fresh 0..n-1 index, report the exact duplicate rows and keep
# only the first occurrence of each (34 dupls in train_set, 10 in test_set)
df_total = pd.concat([df_train, df_test], ignore_index=True)

# True for every row that repeats an earlier row
is_repeat = df_total.duplicated(keep="first")
dupls = df_total[is_repeat]
print(f"Total number of duplicates: {len(dupls)}")

df_total = df_total[~is_repeat]
print(f"Overall dataset length after duplicate removal: {len(df_total)}")

Total number of duplicates: 44
Overall dataset length after duplicate removal: 43914


In [None]:
# Split dataset in train, dev, test and shuffle.
#
# df_total is only read here, never reassigned, so re-running this cell always
# produces the same three splits instead of shrinking df_total further each time.
HOLDOUT_SIZE = 4871  # number of rows in the test split and in the dev split
SPLIT_SEED = 1000    # random_state used for every shuffle below

# Test: the last HOLDOUT_SIZE rows of the de-duplicated data, shuffled.
df_test = (
    df_total.iloc[-HOLDOUT_SIZE:]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# Everything before the test rows is shuffled once; dev is then cut from the
# tail of that shuffled frame and train is what remains in front of it.
df_rest_shuffled = (
    df_total.iloc[:-HOLDOUT_SIZE]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

df_dev = (
    df_rest_shuffled.iloc[-HOLDOUT_SIZE:]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
df_train = df_rest_shuffled.iloc[:-HOLDOUT_SIZE]

# The three splits must partition the de-duplicated dataset exactly.
assert len(df_train) + len(df_dev) + len(df_test) == len(df_total), "split sizes do not add up"
print(f"Train: {len(df_train)}, Dev: {len(df_dev)}, Test: {len(df_test)}")

Train: 34172, Dev: 4871, Test: 4871


In [None]:
# Save in csv format.
# The target directory is given explicitly (not relative to the current working
# directory) so the files always end up where the corpus loader looks for them.
csv_dir = '/content/data'  # must match `data_folder` used when building the corpus
os.makedirs(csv_dir, exist_ok=True)  # the folder does not exist on a fresh runtime

for split_name, split_df in (("train", df_train), ("dev", df_dev), ("test", df_test)):
    split_df.to_csv(os.path.join(csv_dir, f"{split_name}.csv"), index=False)


In [None]:
# Name of the label type the classifier is trained to predict
label_type = 'language_identification'

# Directory containing train.csv, dev.csv and test.csv
data_folder = '/content/data'

# CSV layout: column 0 holds the text, column 1 the language label
column_name_map = {0: "text", 1: "label_lang"}

# Build the corpus from the comma-separated files in data_folder;
# skip_header=True because each CSV starts with a header row
corpus: Corpus = CSVClassificationCorpus(
    data_folder=data_folder,
    column_name_map=column_name_map,
    label_type=label_type,
    delimiter=',',
    skip_header=True,
)

2023-01-02 15:55:52,458 Reading data from /content/data
2023-01-02 15:55:52,460 Train: /content/data/train.csv
2023-01-02 15:55:52,463 Dev: /content/data/dev.csv
2023-01-02 15:55:52,465 Test: /content/data/test.csv


In [None]:
# Build the dictionary of labels of type `label_type` found in the corpus
label_dict = corpus.make_label_dictionary(
    label_type=label_type,
)


2023-01-02 15:56:01,284 Computing label dictionary. Progress:


34172it [00:41, 830.15it/s] 

2023-01-02 15:56:42,496 Dictionary created for label 'language_identification' with 9 values: en (seen 17630 times), de (seen 10128 times), da (seen 5873 times), fr (seen 243 times), it (seen 203 times), unknown (seen 91 times), hu (seen 3 times), sv (seen 1 times)





In [None]:
# Document embeddings backed by the pretrained 'xlm-roberta-base' transformer,
# created with fine_tune=True
document_embeddings = TransformerDocumentEmbeddings('xlm-roberta-base', fine_tune=True)

# Text classifier on top of the embeddings, using the label dictionary built from the corpus
classifier = TextClassifier(
    document_embeddings,
    label_type=label_type,
    label_dictionary=label_dict,
)

# Trainer that couples the classifier with the train/dev/test corpus
trainer = ModelTrainer(classifier, corpus)

# Output folder on the mounted Google Drive for everything the trainer writes
model_dir = '/content/gdrive/MyDrive/models/language_identification_letters'

# Settings forwarded unchanged to ModelTrainer.fine_tune
fine_tune_options = {
    'learning_rate': 5e-5,
    'mini_batch_size': 8,
    'max_epochs': 10,
    'optimizer': torch.optim.AdamW,
    'embeddings_storage_mode': 'none',
    'checkpoint': True,
    'write_weights': True,
    'use_final_model_for_eval': False,
}

# Kept as the last expression of the cell so the returned result dictionary is displayed
trainer.fine_tune(model_dir, **fine_tune_options)

2023-01-02 15:59:49,017 ----------------------------------------------------------------------------------------------------
2023-01-02 15:59:49,024 Model: "TextClassifier(
  (decoder): Linear(in_features=768, out_features=9, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (locked_dropout): LockedDropout(p=0.0)
  (word_dropout): WordDropout(p=0.0)
  (loss_function): CrossEntropyLoss()
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0): XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self

100%|██████████| 609/609 [00:41<00:00, 14.68it/s]

2023-01-02 16:23:15,286 Evaluating as a multi-label problem: False





2023-01-02 16:23:15,333 DEV : loss 0.004855041857808828 - f1-score (micro avg)  0.9961
2023-01-02 16:23:20,626 BAD EPOCHS (no improvement): 4
2023-01-02 16:23:35,684 saving best model
2023-01-02 16:23:52,472 ----------------------------------------------------------------------------------------------------
2023-01-02 16:26:07,647 epoch 2 - iter 427/4272 - loss 0.00567982 - samples/sec: 26.22 - lr: 0.000049
2023-01-02 16:28:24,379 epoch 2 - iter 854/4272 - loss 0.00522102 - samples/sec: 26.62 - lr: 0.000049
2023-01-02 16:30:40,445 epoch 2 - iter 1281/4272 - loss 0.00493946 - samples/sec: 26.75 - lr: 0.000048
2023-01-02 16:32:57,602 epoch 2 - iter 1708/4272 - loss 0.00506788 - samples/sec: 26.54 - lr: 0.000048
2023-01-02 16:35:15,283 epoch 2 - iter 2135/4272 - loss 0.00480833 - samples/sec: 26.42 - lr: 0.000047
2023-01-02 16:37:31,332 epoch 2 - iter 2562/4272 - loss 0.00474412 - samples/sec: 26.68 - lr: 0.000047
2023-01-02 16:39:47,020 epoch 2 - iter 2989/4272 - loss 0.00471931 - sample

100%|██████████| 609/609 [00:41<00:00, 14.62it/s]

2023-01-02 16:47:20,527 Evaluating as a multi-label problem: False





2023-01-02 16:47:20,578 DEV : loss 0.0031003127805888653 - f1-score (micro avg)  0.9961
2023-01-02 16:47:25,568 BAD EPOCHS (no improvement): 4
2023-01-02 16:47:39,885 ----------------------------------------------------------------------------------------------------
2023-01-02 16:49:52,583 epoch 3 - iter 427/4272 - loss 0.00380427 - samples/sec: 26.64 - lr: 0.000044
2023-01-02 16:52:09,082 epoch 3 - iter 854/4272 - loss 0.00433140 - samples/sec: 26.68 - lr: 0.000043
2023-01-02 16:54:25,539 epoch 3 - iter 1281/4272 - loss 0.00350408 - samples/sec: 26.59 - lr: 0.000043
2023-01-02 16:56:41,076 epoch 3 - iter 1708/4272 - loss 0.00406211 - samples/sec: 26.93 - lr: 0.000042
2023-01-02 16:58:57,469 epoch 3 - iter 2135/4272 - loss 0.00425862 - samples/sec: 26.67 - lr: 0.000042
2023-01-02 17:01:13,948 epoch 3 - iter 2562/4272 - loss 0.00440340 - samples/sec: 26.59 - lr: 0.000041
2023-01-02 17:03:29,441 epoch 3 - iter 2989/4272 - loss 0.00468224 - samples/sec: 26.83 - lr: 0.000041
2023-01-02 17

100%|██████████| 609/609 [00:42<00:00, 14.34it/s]

2023-01-02 17:11:03,843 Evaluating as a multi-label problem: False
2023-01-02 17:11:03,890 DEV : loss 0.004632778000086546 - f1-score (micro avg)  0.9895





2023-01-02 17:11:08,990 BAD EPOCHS (no improvement): 4
2023-01-02 17:11:23,609 ----------------------------------------------------------------------------------------------------
2023-01-02 17:13:35,676 epoch 4 - iter 427/4272 - loss 0.00485971 - samples/sec: 26.71 - lr: 0.000038
2023-01-02 17:15:52,674 epoch 4 - iter 854/4272 - loss 0.00420765 - samples/sec: 26.72 - lr: 0.000038
2023-01-02 17:18:09,672 epoch 4 - iter 1281/4272 - loss 0.00449899 - samples/sec: 26.57 - lr: 0.000037
2023-01-02 17:20:25,528 epoch 4 - iter 1708/4272 - loss 0.00473229 - samples/sec: 26.74 - lr: 0.000037
2023-01-02 17:22:41,011 epoch 4 - iter 2135/4272 - loss 0.00457907 - samples/sec: 26.79 - lr: 0.000036
2023-01-02 17:24:57,553 epoch 4 - iter 2562/4272 - loss 0.00486137 - samples/sec: 26.65 - lr: 0.000036
2023-01-02 17:27:12,779 epoch 4 - iter 2989/4272 - loss 0.00490120 - samples/sec: 26.86 - lr: 0.000035
2023-01-02 17:29:28,191 epoch 4 - iter 3416/4272 - loss 0.00493467 - samples/sec: 26.88 - lr: 0.00003

100%|██████████| 609/609 [00:41<00:00, 14.75it/s]

2023-01-02 17:34:45,924 Evaluating as a multi-label problem: False
2023-01-02 17:34:45,972 DEV : loss 0.005278266966342926 - f1-score (micro avg)  0.9908





2023-01-02 17:34:50,921 BAD EPOCHS (no improvement): 4
2023-01-02 17:35:06,011 ----------------------------------------------------------------------------------------------------
2023-01-02 17:37:19,269 epoch 5 - iter 427/4272 - loss 0.00230700 - samples/sec: 26.50 - lr: 0.000033
2023-01-02 17:39:35,131 epoch 5 - iter 854/4272 - loss 0.00487898 - samples/sec: 26.79 - lr: 0.000032
2023-01-02 17:41:48,974 epoch 5 - iter 1281/4272 - loss 0.00590718 - samples/sec: 27.21 - lr: 0.000032
2023-01-02 17:44:04,035 epoch 5 - iter 1708/4272 - loss 0.00583948 - samples/sec: 26.88 - lr: 0.000031
2023-01-02 17:46:20,538 epoch 5 - iter 2135/4272 - loss 0.00555001 - samples/sec: 26.64 - lr: 0.000031
2023-01-02 17:48:37,575 epoch 5 - iter 2562/4272 - loss 0.00490773 - samples/sec: 26.55 - lr: 0.000030
2023-01-02 17:50:53,571 epoch 5 - iter 2989/4272 - loss 0.00457035 - samples/sec: 26.76 - lr: 0.000029
2023-01-02 17:53:09,019 epoch 5 - iter 3416/4272 - loss 0.00488828 - samples/sec: 26.80 - lr: 0.00002

100%|██████████| 609/609 [00:41<00:00, 14.56it/s]

2023-01-02 17:58:29,037 Evaluating as a multi-label problem: False





2023-01-02 17:58:29,079 DEV : loss 0.0043687643483281136 - f1-score (micro avg)  0.991
2023-01-02 17:58:34,067 BAD EPOCHS (no improvement): 4
2023-01-02 17:58:48,408 ----------------------------------------------------------------------------------------------------
2023-01-02 18:01:00,624 epoch 6 - iter 427/4272 - loss 0.00415968 - samples/sec: 26.76 - lr: 0.000027
2023-01-02 18:03:16,546 epoch 6 - iter 854/4272 - loss 0.00666168 - samples/sec: 26.81 - lr: 0.000027
2023-01-02 18:05:32,732 epoch 6 - iter 1281/4272 - loss 0.00627924 - samples/sec: 26.61 - lr: 0.000026
2023-01-02 18:07:47,993 epoch 6 - iter 1708/4272 - loss 0.00747138 - samples/sec: 26.92 - lr: 0.000026
2023-01-02 18:10:05,087 epoch 6 - iter 2135/4272 - loss 0.00739465 - samples/sec: 26.48 - lr: 0.000025
2023-01-02 18:12:19,370 epoch 6 - iter 2562/4272 - loss 0.00728222 - samples/sec: 27.07 - lr: 0.000024
2023-01-02 18:14:36,331 epoch 6 - iter 2989/4272 - loss 0.00720937 - samples/sec: 26.56 - lr: 0.000024
2023-01-02 18:

100%|██████████| 609/609 [00:41<00:00, 14.73it/s]

2023-01-02 18:22:11,588 Evaluating as a multi-label problem: False





2023-01-02 18:22:11,664 DEV : loss 0.00709431292489171 - f1-score (micro avg)  0.9897
2023-01-02 18:22:17,902 BAD EPOCHS (no improvement): 4
2023-01-02 18:22:32,828 ----------------------------------------------------------------------------------------------------
2023-01-02 18:24:45,587 epoch 7 - iter 427/4272 - loss 0.00646092 - samples/sec: 26.41 - lr: 0.000022
2023-01-02 18:27:02,209 epoch 7 - iter 854/4272 - loss 0.00647616 - samples/sec: 26.82 - lr: 0.000021
2023-01-02 18:27:53,046 ----------------------------------------------------------------------------------------------------
2023-01-02 18:27:53,051 Exiting from training early.
2023-01-02 18:27:53,054 Saving model ...
2023-01-02 18:28:07,902 Done.
2023-01-02 18:28:07,904 ----------------------------------------------------------------------------------------------------
2023-01-02 18:28:07,911 loading file /content/gdrive/MyDrive/models/language_identification_letters/best-model.pt


100%|██████████| 609/609 [00:44<00:00, 13.61it/s]

2023-01-02 18:29:23,522 Evaluating as a multi-label problem: False





2023-01-02 18:29:23,621 0.9951	0.9951	0.9951	0.9951
2023-01-02 18:29:23,622 
Results:
- F-score (micro) 0.9951
- F-score (macro) 0.8062
- Accuracy 0.9951

By class:
              precision    recall  f1-score   support

          en     0.9984    0.9972    0.9978      2517
          de     0.9965    0.9993    0.9979      1436
          da     0.9941    1.0000    0.9970       836
          fr     0.8780    1.0000    0.9351        36
          it     0.8571    0.9677    0.9091        31
     unknown     0.0000    0.0000    0.0000        15

    accuracy                         0.9951      4871
   macro avg     0.7874    0.8274    0.8062      4871
weighted avg     0.9922    0.9951    0.9936      4871

2023-01-02 18:29:23,625 ----------------------------------------------------------------------------------------------------


{'test_score': 0.9950728803120509,
 'dev_score_history': [0.9960993635803737,
  0.9960993635803737,
  0.9895298706631082,
  0.9907616505850955,
  0.99096694723876,
  0.9897351673167727],
 'train_loss_history': [0.016210260567105218,
  0.004237746983002985,
  0.005485479443716876,
  0.00465479899320337,
  0.005137028935071099,
  0.0073376248485700814],
 'dev_loss_history': [0.004855041857808828,
  0.0031003127805888653,
  0.004632778000086546,
  0.005278266966342926,
  0.0043687643483281136,
  0.00709431292489171]}