## Imports


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
from datasets import Dataset
import pandas as pd
import numpy as np

from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate

In [3]:
data_dir = "/kaggle/input/longlistops/"

## Importing Data

In [4]:
train_df = pd.read_csv(data_dir + "basic_train.tsv", sep="\t")
train_df = pd.DataFrame(train_df)

val_df = pd.read_csv(data_dir + "basic_val.tsv", sep="\t")
val_df = pd.DataFrame(val_df)

test_df = pd.read_csv(data_dir + "basic_test.tsv", sep="\t")
test_df = pd.DataFrame(test_df)


train_dataset = Dataset.from_pandas(train_df, split="train")
val_dataset = Dataset.from_pandas(val_df, split="val")
test_dataset = Dataset.from_pandas(test_df, split="test")

print(train_df.shape)

(96000, 2)


In [5]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def preprocess_function(examples, func_tokenizer):
    return func_tokenizer(examples["Source"], truncation=True)

tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_train_dataset = tokenized_train_dataset.rename_column("Source", "text")
tokenized_train_dataset = tokenized_train_dataset.rename_column("Target", "label")

tokenized_val_dataset = tokenized_val_dataset.rename_column("Source", "text")
tokenized_val_dataset = tokenized_val_dataset.rename_column("Target", "label")

tokenized_test_dataset = tokenized_test_dataset.rename_column("Source", "text")
tokenized_test_dataset = tokenized_test_dataset.rename_column("Target", "label")

Map:   0%|          | 0/96000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=10,
)

training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=3,
        num_train_epochs=5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        report_to="none"
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return {"my_accuracy": acc}

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,My Accuracy
1,1.5353,1.507102,{'accuracy': 0.411}
2,1.4983,1.472628,{'accuracy': 0.418}
3,1.4701,1.45236,{'accuracy': 0.432}
4,1.4444,1.445397,{'accuracy': 0.4275}
5,1.4486,1.436991,{'accuracy': 0.429}


TrainOutput(global_step=7500, training_loss=1.5001018961588541, metrics={'train_runtime': 12645.9579, 'train_samples_per_second': 37.957, 'train_steps_per_second': 0.593, 'total_flos': 6.3593422848e+16, 'train_loss': 1.5001018961588541, 'epoch': 5.0})

In [8]:
trainer.evaluate(tokenized_test_dataset)

{'eval_loss': 1.4227956533432007,
 'eval_my_accuracy': {'accuracy': 0.4375},
 'eval_runtime': 16.502,
 'eval_samples_per_second': 121.198,
 'eval_steps_per_second': 1.939,
 'epoch': 5.0}