In [1]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pip install datasets evaluate transformers[sentencepiece]
%pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [3]:
import os

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch.nn as nn
from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

In [4]:
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [5]:
INPUT_DIR = "./"
OUTPUT_DIR = "./output/xlm-roberta"
CHECKPOINT = "symanto/xlm-roberta-base-snli-mnli-anli-xnli"

In [6]:
# Load data
data = load_dataset(
    "csv",
    data_files=os.path.join(INPUT_DIR, "train.csv"),
    trust_remote_code=True,
    split="train",
)
data = data.train_test_split(0.2)

data = DatasetDict(
    {
        "train": data["train"],
        "val": data["test"],
        "test": load_dataset(
            "csv",
            data_files=os.path.join(INPUT_DIR, "test.csv"),
            trust_remote_code=True,
            split="train",
        ),
    }
)

data

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label'],
        num_rows: 9696
    })
    val: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label'],
        num_rows: 2424
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language'],
        num_rows: 5195
    })
})

In [7]:
df = data["train"].to_pandas()
df.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,97e2230006,Un bateau qui s'était approché depuis le rivag...,L'Arabella est un bateau à l'extérieur rouge.,fr,French,0
1,87c90a6e7d,"Dilbilgisi ve cazibe, tarihsel olarak aynı söz...",Kaiser L'nin herşeyini çaldığında 1910'larda k...,tr,Turkish,1
2,5538208b11,طبيعي بما يكفي، إذا، بدأ هذا الانغماس في الحرب...,كانت سماء كندا أكثر خطورة.,ar,Arabic,2
3,de5b153e83,Beyond the facade there are cavernous empty ro...,The rooms past the facade are cluttered with f...,en,English,2
4,801fb29857,Krugman's column will henceforth be known as T...,Krugman writes novels.,en,English,2


In [8]:
df[["premise", "hypothesis", "lang_abv", "language"]].describe()

Unnamed: 0,premise,hypothesis,lang_abv,language
count,9696,9696,9696,9696
unique,7079,9695,15,15
top,well uh what do you think about taxes do you t...,I am not sure.,en,English
freq,3,2,5455,5455


In [9]:
df[["lang_abv", "language"]].value_counts()

lang_abv  language  
en        English       5455
zh        Chinese        331
ar        Arabic         328
fr        French         319
sw        Swahili        316
ru        Russian        309
el        Greek          308
hi        Hindi          306
ur        Urdu           302
th        Thai           295
tr        Turkish        289
vi        Vietnamese     289
es        Spanish        287
de        German         286
bg        Bulgarian      276
Name: count, dtype: int64

In [10]:
df.isna().sum()

id            0
premise       0
hypothesis    0
lang_abv      0
language      0
label         0
dtype: int64

In [49]:
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, truncation=True)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=3)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


class CNNCls(nn.Module):
    def __init__(self):
        super(CNNCls, self).__init__()
        self.conv1 = nn.Conv1d(
            in_channels=768, out_channels=256, kernel_size=3, padding=1
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(256, 3)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = self.relu(x)
        x = x.permute(0, 2, 1)
        x = x.mean(dim=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x


class LSTMCls(nn.Module):
    def __init__(self):
        super(LSTMCls, self).__init__()
        self.lstm = nn.LSTM(
            input_size=768, hidden_size=256, num_layers=1, batch_first=True
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(256, 3)

    def forward(self, x):
        # LSTM返回两个元组，(output, (h_n, c_n))
        # output: (batch_size, seq_length, num_directions * hidden_size)
        # h_n: (num_layers * num_directions, batch_size, hidden_size)
        # c_n: (num_layers * num_directions, batch_size, hidden_size)
        output, (h_n, c_n) = self.lstm(x)
        x = self.relu(output[:, -1, :])  # 取序列的最后一个输出
        x = self.dropout(x)
        x = self.fc(x)
        return x


class GRUCls(nn.Module):
    def __init__(self):
        super(GRUCls, self).__init__()
        self.gru = nn.GRU(
            input_size=768, hidden_size=256, num_layers=1, batch_first=True
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(256, 3)

    def forward(self, x):
        # GRU的输出 (output, h_n)
        output, h_n = self.gru(x)  # output: (batch_size, seq_length, hidden_size)
        x = self.relu(output[:, -1, :])  # 取序列的最后一个输出
        x = self.dropout(x)
        x = self.fc(x)
        return x


class RNNCls(nn.Module):
    def __init__(self):
        super(RNNCls, self).__init__()
        self.rnn = nn.RNN(
            input_size=768, hidden_size=256, num_layers=1, batch_first=True
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(256, 3)

    def forward(self, x):
        # RNN的输出包括 (output, h_n)
        output, h_n = self.rnn(x)  # output: (batch_size, seq_length, hidden_size)
        x = self.relu(output[:, -1, :])  # 取序列的最后一个输出
        x = self.dropout(x)
        x = self.fc(x)
        return x


class MultiheadAttentionCls(nn.Module):
    def __init__(self):
        super(MultiheadAttentionCls, self).__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=768, num_heads=8, batch_first=True
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(768, 3)

    def forward(self, x):
        # 假设x的形状是(batch_size, seq_length, embed_dim)
        # 多头注意力需要输入key, query, value，这里我们使用同一个x作为三者
        attn_output, _ = self.attention(x, x, x)
        # 取多头注意力输出的平均或者最后一个时间步的输出
        x = self.relu(attn_output.mean(dim=1))
        x = self.dropout(x)
        x = self.fc(x)
        return x


model.classifier = MultiheadAttentionCls()


model



XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [50]:
def preprocess_data(batch):
    batch["text"] = [
        x1 + f" {tokenizer.sep_token} " + x2
        for x1, x2 in zip(batch["premise"], batch["hypothesis"])
    ]
    return tokenizer(batch["text"])


data = data.map(preprocess_data, batched=True)
data

Map:   0%|          | 0/9696 [00:00<?, ? examples/s]

Map:   0%|          | 0/2424 [00:00<?, ? examples/s]

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 9696
    })
    val: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2424
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5195
    })
})

In [51]:
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=labels)


train_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "results"),
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    report_to=[],
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
    save_steps=200,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.005,
    num_train_epochs=5,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=3,
    save_only_model=True,
)

train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=200,
evaluation_strategy=steps,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_

In [52]:
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=data["train"],
    eval_dataset=data["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(3)],
)

trainer.train()

torch.Size([16, 69, 768])


Step,Training Loss,Validation Loss


torch.Size([16, 94, 768])
torch.Size([16, 85, 768])
torch.Size([16, 103, 768])
torch.Size([16, 113, 768])
torch.Size([16, 88, 768])
torch.Size([16, 83, 768])
torch.Size([16, 73, 768])
torch.Size([16, 77, 768])
torch.Size([16, 60, 768])
torch.Size([16, 85, 768])
torch.Size([16, 97, 768])
torch.Size([16, 74, 768])
torch.Size([16, 77, 768])
torch.Size([16, 74, 768])
torch.Size([16, 82, 768])
torch.Size([16, 75, 768])
torch.Size([16, 87, 768])
torch.Size([16, 82, 768])
torch.Size([16, 74, 768])
torch.Size([16, 69, 768])
torch.Size([16, 76, 768])
torch.Size([16, 98, 768])
torch.Size([16, 74, 768])
torch.Size([16, 76, 768])
torch.Size([16, 96, 768])
torch.Size([16, 85, 768])
torch.Size([16, 77, 768])
torch.Size([16, 56, 768])
torch.Size([16, 70, 768])
torch.Size([16, 74, 768])
torch.Size([16, 81, 768])
torch.Size([16, 60, 768])
torch.Size([16, 52, 768])
torch.Size([16, 93, 768])
torch.Size([16, 69, 768])
torch.Size([16, 138, 768])
torch.Size([16, 83, 768])
torch.Size([16, 78, 768])
torch.Siz

KeyboardInterrupt: 

In [None]:
log = trainer.state.log_history
log

In [None]:
steps = []
train_loss = []
eval_loss = []
eval_acc = []

for i in range(0, len(log) - 1, 2):
    train_log = log[i]
    eval_log = log[i + 1]
    steps.append(train_log["step"])
    train_loss.append(train_log["loss"])
    eval_loss.append(eval_log["eval_loss"])
    eval_acc.append(eval_log["eval_accuracy"])

plt.figure(figsize=(10, 5))


plt.subplot(1, 2, 1)
plt.title("xlm-roberta loss vs step")
plt.xlabel("# steps")
plt.ylabel("loss")
plt.plot(steps, train_loss, label="training loss")
plt.plot(steps, eval_loss, label="eval loss")
plt.legend()

plt.subplot(1, 2, 2)
plt.title("xlm-roberta acc vs step")
plt.xlabel("# steps")
plt.ylabel("acc")
plt.plot(steps, eval_acc, label="eval acc")
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
preds = trainer.predict(data["test"])
preds

In [None]:
pd.DataFrame(
    {"id": data["test"]["id"], "prediction": np.argmax(preds.predictions, axis=1)}
).to_csv(os.path.join(OUTPUT_DIR, "submission.csv"), header=True, index=False)