In [1]:
!pip install torch
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 15.4 MB/s 
Collecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting transformers>=4.0.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 55.4 MB/s 
[?25hCollecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.2 MB/s 
[?25hCollecting conllu>=4.0
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
[?25hCollecting sentencepiece==0.1.95
  Downloading sentencepiece-0.1.95-cp38-cp3

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/gdrive. The training cell writes its model
# output under /content/gdrive/MyDrive, so the drive has to be mounted first.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
# Read the train and eval JSON-lines files, keeping only the text and author columns
keep_cols = ["text", "author"]
df_train = pd.read_json("./classifier_data_train.json", lines=True)[keep_cols]
df_test = pd.read_json("./classifier_data_eval.json", lines=True)[keep_cols]

# Report how many rows were loaded in total before any cleaning
n_rows_loaded = len(df_train) + len(df_test)
print(f"Original overall dataset length: {n_rows_loaded}")

Original overall dataset length: 43958


In [None]:
# Merge both sets into one frame and drop exact duplicate rows, keeping the
# first occurrence of each (34 duplicates in the train set, 10 in the test set)
df_total = pd.concat([df_train, df_test]).reset_index(drop=True)

# Boolean mask marking every repeated row after its first appearance
is_duplicate = df_total.duplicated(keep="first")
dupls = df_total[is_duplicate]
print(f"Total number of duplicates: {len(dupls)}")

df_total = df_total[~is_duplicate]
print(f"Overall dataset length after duplicate removal: {len(df_total)}")

Total number of duplicates: 44
Overall dataset length after duplicate removal: 43914


In [None]:
# Split the de-duplicated data into train / dev / test and shuffle each part.
# All slices are positional (iloc) and `df_total` itself is left untouched, so
# the cell can be re-run without shrinking the data a second time.
N_HOLDOUT = 4871   # number of rows in the dev set and in the test set
SPLIT_SEED = 1000  # random_state shared by every shuffle below

# Test set: the last N_HOLDOUT rows of the de-duplicated frame, shuffled.
# NOTE(review): the eval file was concatenated last, so these rows presumably
# all stem from it — confirm against the eval file's length.
df_test = (
    df_total.iloc[-N_HOLDOUT:]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# Everything in front of the test rows is shuffled once ...
df_pool = (
    df_total.iloc[:-N_HOLDOUT]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# ... its last N_HOLDOUT rows become the (re-shuffled) dev set ...
df_dev = (
    df_pool.iloc[-N_HOLDOUT:]
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# ... and the remaining rows are the training set.
df_train = df_pool.iloc[:-N_HOLDOUT]

# The three parts must cover the de-duplicated data exactly once
assert len(df_train) + len(df_dev) + len(df_test) == len(df_total), "splits do not cover the dataset"
print(f"Train: {len(df_train)}, Dev: {len(df_dev)}, Test: {len(df_test)}")

Train: 34172, Dev: 4871, Test: 4871


In [None]:
# Save the three splits as CSV inside /content/data, the folder the corpus
# loader reads train.csv, dev.csv and test.csv from. Writing them to the working
# directory instead leaves that folder empty on a fresh runtime.
split_dir = Path("/content/data")
split_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders

df_train.to_csv(split_dir / "train.csv", index=False)
df_dev.to_csv(split_dir / "dev.csv", index=False)
df_test.to_csv(split_dir / "test.csv", index=False)


In [None]:
# Name of the label layer the classifier is trained to predict
label_type = 'author_classification'

# Directory that holds train.csv, dev.csv and test.csv
data_folder = '/content/data'

# Column 0 carries the text, column 1 the author label
column_name_map = {0: "text", 1: "label_author"}

# Build the corpus from the comma-separated files; each file starts with a
# header row, which is skipped
corpus: Corpus = CSVClassificationCorpus(
    data_folder=data_folder,
    column_name_map=column_name_map,
    label_type=label_type,
    delimiter=',',
    skip_header=True,
)

2023-01-02 18:41:23,439 Reading data from /content/data
2023-01-02 18:41:23,442 Train: /content/data/train.csv
2023-01-02 18:41:23,443 Dev: /content/data/dev.csv
2023-01-02 18:41:23,444 Test: /content/data/test.csv


In [None]:
# Build the label dictionary for the 'author_classification' label type from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)


2023-01-02 18:41:27,621 Computing label dictionary. Progress:


34172it [00:45, 743.99it/s]

2023-01-02 18:42:13,593 Dictionary created for label 'author_classification' with 8 values: Virginia Woolf (seen 13325 times), Henrik Ibsen (seen 6274 times), James Joyce (seen 4758 times), Wilhelm Busch (seen 4386 times), Franz Kafka (seen 1950 times), Friedrich Schiller (seen 1847 times), Johann Wolfgang von Goethe (seen 1632 times)





In [None]:
# Transformer backbone for the document embeddings; fine_tune=True is passed so
# the transformer is trained together with the classifier
document_embeddings = TransformerDocumentEmbeddings('xlm-roberta-base', fine_tune=True)

# Text classifier built on the document embeddings and the label dictionary
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    label_type=label_type,
)

# Trainer that pairs the classifier with the corpus
trainer = ModelTrainer(classifier, corpus)

# Run fine-tuning; the first argument is the output path on the mounted Google Drive
trainer.fine_tune(
    '/content/gdrive/MyDrive/models/author_classification_letters',
    optimizer=torch.optim.AdamW,
    learning_rate=5e-5,
    mini_batch_size=8,
    max_epochs=10,
    embeddings_storage_mode='none',
    checkpoint=True,
    write_weights=True,
    use_final_model_for_eval=False,
)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

2023-01-02 18:43:06,486 ----------------------------------------------------------------------------------------------------
2023-01-02 18:43:06,493 Model: "TextClassifier(
  (decoder): Linear(in_features=768, out_features=8, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (locked_dropout): LockedDropout(p=0.0)
  (word_dropout): WordDropout(p=0.0)
  (loss_function): CrossEntropyLoss()
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0): XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self

100%|██████████| 609/609 [00:43<00:00, 14.02it/s]

2023-01-02 19:06:58,405 Evaluating as a multi-label problem: False
2023-01-02 19:06:58,451 DEV : loss 0.022618360817432404 - f1-score (micro avg)  0.9431





2023-01-02 19:07:03,847 BAD EPOCHS (no improvement): 4
2023-01-02 19:07:18,475 saving best model
2023-01-02 19:07:33,966 ----------------------------------------------------------------------------------------------------
2023-01-02 19:09:51,312 epoch 2 - iter 427/4272 - loss 0.02986288 - samples/sec: 25.81 - lr: 0.000049
2023-01-02 19:12:10,228 epoch 2 - iter 854/4272 - loss 0.02989740 - samples/sec: 26.16 - lr: 0.000049
2023-01-02 19:14:28,252 epoch 2 - iter 1281/4272 - loss 0.02953275 - samples/sec: 26.37 - lr: 0.000048
2023-01-02 19:16:47,140 epoch 2 - iter 1708/4272 - loss 0.02915938 - samples/sec: 26.20 - lr: 0.000048
2023-01-02 19:19:05,049 epoch 2 - iter 2135/4272 - loss 0.02827047 - samples/sec: 26.34 - lr: 0.000047
2023-01-02 19:21:22,599 epoch 2 - iter 2562/4272 - loss 0.02863396 - samples/sec: 26.41 - lr: 0.000047
2023-01-02 19:23:39,969 epoch 2 - iter 2989/4272 - loss 0.02802061 - samples/sec: 26.54 - lr: 0.000046
2023-01-02 19:25:58,922 epoch 2 - iter 3416/4272 - loss 0.0

100%|██████████| 609/609 [00:43<00:00, 14.02it/s]

2023-01-02 19:31:25,848 Evaluating as a multi-label problem: False
2023-01-02 19:31:25,886 DEV : loss 0.021992607042193413 - f1-score (micro avg)  0.9413





2023-01-02 19:31:30,945 BAD EPOCHS (no improvement): 4
2023-01-02 19:31:45,941 ----------------------------------------------------------------------------------------------------
2023-01-02 19:34:01,397 epoch 3 - iter 427/4272 - loss 0.01848477 - samples/sec: 26.06 - lr: 0.000044
2023-01-02 19:36:20,001 epoch 3 - iter 854/4272 - loss 0.01981283 - samples/sec: 26.26 - lr: 0.000043
2023-01-02 19:38:37,773 epoch 3 - iter 1281/4272 - loss 0.01962434 - samples/sec: 26.31 - lr: 0.000043
2023-01-02 19:40:56,128 epoch 3 - iter 1708/4272 - loss 0.01949512 - samples/sec: 26.25 - lr: 0.000042
2023-01-02 19:43:14,795 epoch 3 - iter 2135/4272 - loss 0.01959222 - samples/sec: 26.24 - lr: 0.000042
2023-01-02 19:45:31,420 epoch 3 - iter 2562/4272 - loss 0.01963661 - samples/sec: 26.59 - lr: 0.000041
2023-01-02 19:47:51,233 epoch 3 - iter 2989/4272 - loss 0.01931898 - samples/sec: 26.00 - lr: 0.000041
2023-01-02 19:50:10,822 epoch 3 - iter 3416/4272 - loss 0.01905721 - samples/sec: 26.09 - lr: 0.00004

100%|██████████| 609/609 [00:43<00:00, 13.93it/s]

2023-01-02 19:55:37,439 Evaluating as a multi-label problem: False
2023-01-02 19:55:37,478 DEV : loss 0.022349374368786812 - f1-score (micro avg)  0.9476





2023-01-02 19:55:42,353 BAD EPOCHS (no improvement): 4
2023-01-02 19:55:57,764 saving best model
2023-01-02 19:56:13,679 ----------------------------------------------------------------------------------------------------
2023-01-02 19:58:28,059 epoch 4 - iter 427/4272 - loss 0.02725962 - samples/sec: 26.33 - lr: 0.000038
2023-01-02 20:00:45,831 epoch 4 - iter 854/4272 - loss 0.02663342 - samples/sec: 26.37 - lr: 0.000038
2023-01-02 20:03:04,106 epoch 4 - iter 1281/4272 - loss 0.02437123 - samples/sec: 26.18 - lr: 0.000037
2023-01-02 20:05:21,959 epoch 4 - iter 1708/4272 - loss 0.02204759 - samples/sec: 26.36 - lr: 0.000037
2023-01-02 20:07:39,816 epoch 4 - iter 2135/4272 - loss 0.02109155 - samples/sec: 26.37 - lr: 0.000036
2023-01-02 20:09:58,981 epoch 4 - iter 2562/4272 - loss 0.02004438 - samples/sec: 26.05 - lr: 0.000036
2023-01-02 20:12:20,020 epoch 4 - iter 2989/4272 - loss 0.01950064 - samples/sec: 25.74 - lr: 0.000035
2023-01-02 20:14:38,423 epoch 4 - iter 3416/4272 - loss 0.0

100%|██████████| 609/609 [00:43<00:00, 13.91it/s]

2023-01-02 20:20:03,994 Evaluating as a multi-label problem: False
2023-01-02 20:20:04,034 DEV : loss 0.02040998823940754 - f1-score (micro avg)  0.9509





2023-01-02 20:20:09,018 BAD EPOCHS (no improvement): 4
2023-01-02 20:20:23,719 saving best model
2023-01-02 20:20:39,374 ----------------------------------------------------------------------------------------------------
2023-01-02 20:22:54,819 epoch 5 - iter 427/4272 - loss 0.01236977 - samples/sec: 26.09 - lr: 0.000033
2023-01-02 20:25:11,555 epoch 5 - iter 854/4272 - loss 0.01346137 - samples/sec: 26.61 - lr: 0.000032
2023-01-02 20:27:29,737 epoch 5 - iter 1281/4272 - loss 0.01233652 - samples/sec: 26.30 - lr: 0.000032
