## Imports


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import gc
import torch

gc.collect()

torch.cuda.empty_cache()

In [3]:
from datasets import Dataset
import pandas as pd
import numpy as np

from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate

In [4]:
data_dir = "/kaggle/input/longlistops/"

## Importing Data

In [5]:
train_df = pd.read_csv(data_dir + "basic_train.tsv", sep="\t", nrows=3000)
train_df = pd.DataFrame(train_df)

val_df = pd.read_csv(data_dir + "basic_val.tsv", sep="\t")
val_df = pd.DataFrame(val_df)

test_df = pd.read_csv(data_dir + "basic_test.tsv", sep="\t")
test_df = pd.DataFrame(test_df)


train_dataset = Dataset.from_pandas(train_df, split="train")
val_dataset = Dataset.from_pandas(val_df, split="val")
test_dataset = Dataset.from_pandas(test_df, split="test")



In [6]:
from transformers import AutoTokenizer

model_name = "allenai/longformer-base-4096"
batch_size = 1
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples, func_tokenizer):
    return func_tokenizer(examples["Source"], truncation=True, max_length=2048)

tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_train_dataset = tokenized_train_dataset.rename_column("Source", "text")
tokenized_train_dataset = tokenized_train_dataset.rename_column("Target", "label")

tokenized_val_dataset = tokenized_val_dataset.rename_column("Source", "text")
tokenized_val_dataset = tokenized_val_dataset.rename_column("Target", "label")

tokenized_test_dataset = tokenized_test_dataset.rename_column("Source", "text")
tokenized_test_dataset = tokenized_test_dataset.rename_column("Target", "label")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=10,
)

training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=10,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        report_to='none'    
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return {"acc": acc}

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

trainer.train()

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Acc
1,2.2438,2.270059,{'accuracy': 0.1715}
2,2.2825,2.26945,{'accuracy': 0.1595}
3,2.2752,2.265049,{'accuracy': 0.1595}
4,2.236,2.272488,{'accuracy': 0.1595}
5,2.2278,2.269869,{'accuracy': 0.1595}
6,2.2591,2.261579,{'accuracy': 0.1715}
7,2.2236,2.273327,{'accuracy': 0.1595}
8,2.2587,2.262258,{'accuracy': 0.1595}
9,2.2524,2.260611,{'accuracy': 0.1595}
10,2.2542,2.261911,{'accuracy': 0.1595}


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


TrainOutput(global_step=30000, training_loss=2.2557617635091147, metrics={'train_runtime': 18914.3723, 'train_samples_per_second': 1.586, 'train_steps_per_second': 1.586, 'total_flos': 3.848911988995572e+16, 'train_loss': 2.2557617635091147, 'epoch': 10.0})

In [9]:
trainer.evaluate(tokenized_test_dataset)


{'eval_loss': 2.241058826446533,
 'eval_acc': {'accuracy': 0.1725},
 'eval_runtime': 277.4814,
 'eval_samples_per_second': 7.208,
 'eval_steps_per_second': 7.208,
 'epoch': 10.0}