## Imports


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
from datasets import Dataset
import pandas as pd
import numpy as np

from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate

In [3]:
data_dir = "/kaggle/input/longlistops/"

## Importing Data

In [4]:
train_df = pd.read_csv(data_dir + "basic_train.tsv", sep="\t", nrows=5000)
train_df = pd.DataFrame(train_df)

val_df = pd.read_csv(data_dir + "basic_val.tsv", sep="\t")
val_df = pd.DataFrame(val_df)

test_df = pd.read_csv(data_dir + "basic_test.tsv", sep="\t")
test_df = pd.DataFrame(test_df)


train_dataset = Dataset.from_pandas(train_df, split="train")
val_dataset = Dataset.from_pandas(val_df, split="val")
test_dataset = Dataset.from_pandas(test_df, split="test")



In [5]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer(test_df.iloc[0]['Source'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5922 > 512). Running this sequence through the model will result in indexing errors


{'input_ids': [101, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 19960, 1021, 1007, 1023, 1007, 1017, 1007, 1015, 1007, 1023, 1007, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 4098, 1017, 1007, 1018, 1007, 1023, 1007, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 8117, 1015, 1007, 1017, 1007, 1016, 1007, 1017, 1007, 1023, 1007, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 19960, 1006, 1006, 1006, 1006, 1006, 1031, 4098, 1022, 1007, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 8117, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 8117, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 15488, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 19960, 1022, 1007, 1019, 1007, 1015, 1007, 1018, 1007, 1022, 1007, 1023, 1007, 1016, 1007, 1033, 1007, 1007, 1014, 1007, 1018, 1007, 1021, 1007, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1031, 15488, 1023, 1007, 1020, 1007, 1022, 100

In [6]:
def preprocess_function(examples, func_tokenizer):
    return func_tokenizer(examples["Source"], truncation=True)

tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        # num_proc=20,
        fn_kwargs={'func_tokenizer': tokenizer}
)

tokenized_train_dataset = tokenized_train_dataset.rename_column("Source", "text")
tokenized_train_dataset = tokenized_train_dataset.rename_column("Target", "label")

tokenized_val_dataset = tokenized_val_dataset.rename_column("Source", "text")
tokenized_val_dataset = tokenized_val_dataset.rename_column("Target", "label")

tokenized_test_dataset = tokenized_test_dataset.rename_column("Source", "text")
tokenized_test_dataset = tokenized_test_dataset.rename_column("Target", "label")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=10,
)

training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=10,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        report_to="none"
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return {"acc": acc}

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Acc
1,No log,2.014358,{'accuracy': 0.2995}
2,No log,1.743839,{'accuracy': 0.353}
3,No log,1.711038,{'accuracy': 0.358}
4,No log,1.691696,{'accuracy': 0.347}
5,No log,1.673671,{'accuracy': 0.365}
6,No log,1.656882,{'accuracy': 0.36}
7,1.784900,1.646209,{'accuracy': 0.3585}
8,1.784900,1.638484,{'accuracy': 0.359}
9,1.784900,1.636253,{'accuracy': 0.3665}
10,1.784900,1.633981,{'accuracy': 0.3675}


TrainOutput(global_step=790, training_loss=1.7178703211530855, metrics={'train_runtime': 1503.1632, 'train_samples_per_second': 33.263, 'train_steps_per_second': 0.526, 'total_flos': 6624314880000000.0, 'train_loss': 1.7178703211530855, 'epoch': 10.0})

In [8]:
trainer.evaluate(tokenized_test_dataset)

{'eval_loss': 1.6036309003829956,
 'eval_acc': {'accuracy': 0.3835},
 'eval_runtime': 16.7161,
 'eval_samples_per_second': 119.645,
 'eval_steps_per_second': 1.914,
 'epoch': 10.0}