In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
from underthesea import word_tokenize

from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import datasets
device = "cuda" if torch.cuda.is_available() else "cpu"


import re

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [3]:
class config:
    train_path = 'data/train.csv'
    model_name = 'vinai/phobert-base-v2'
    max_length = 256
    batch_size = 64
    num_workers = 32

In [4]:
df = pd.read_csv('data/train.csv')
df = df[df.evidence.notnull()]

In [5]:
label2id = {'SUPPORTED': 0, 'REFUTED': 1}
id2label = {0: 'SUPPORTED', 1:'REFUTED'}

In [6]:
df['verdict_label'] = df.verdict.apply(lambda x: label2id[x])

In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2, id2label=id2label, label2id=label2id).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
df_train, df_val = train_test_split(df, test_size=0.05, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
train_dataset = datasets.Dataset.from_pandas(df_train)
val_dataset = datasets.Dataset.from_pandas(df_val)

In [9]:
def PreprocessDataset(examples):    
    inputs =  tokenizer(
            text=examples['top_tfdif'],
            text_pair=examples['claim_tokenizer'],
            max_length=config.max_length,
            padding='max_length',
            truncation='only_first',
            return_tensors='pt',
            
        )
    labels = examples['verdict_label']
    inputs.update({'labels': labels})
    return inputs

In [10]:
train_datasets = train_dataset.map(PreprocessDataset, batched=True, batch_size=64,remove_columns=train_dataset.column_names)
valid_datasets = val_dataset.map(PreprocessDataset, batched=True,  batch_size=64,remove_columns=train_dataset.column_names)

Map:   0%|          | 0/23733 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')  # Có thể thay 'weighted' bằng 'micro', 'macro', hoặc None tùy vào yêu cầu của bạn
    
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }


In [13]:
training_args = TrainingArguments(
    output_dir='models/dou/model1_v1',          # output directory
    num_train_epochs=15,              # total number of training epochs
    learning_rate=1e-5,              # learning rate
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    # gradient_accumulation_steps=2,   # Number of updates steps to accumulate before performing a backward/update pass.
    warmup_steps=250,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=250,
    eval_steps=250,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    fp16=True,
    metric_for_best_model='f1_score',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1 Score
250,0.6454,0.496405,0.7648,0.7648
500,0.337,0.294904,0.8936,0.8936
750,0.2109,0.287199,0.8952,0.8952
1000,0.1486,0.27023,0.904,0.904
1250,0.1121,0.275114,0.9096,0.9096
1500,0.0874,0.277577,0.9112,0.9112
1750,0.0653,0.284355,0.924,0.924
2000,0.0552,0.291232,0.928,0.928
2250,0.0481,0.312831,0.9248,0.9248
2500,0.0426,0.331174,0.9208,0.9208




TrainOutput(global_step=2790, training_loss=0.1606944425986232, metrics={'train_runtime': 1473.3213, 'train_samples_per_second': 241.628, 'train_steps_per_second': 1.894, 'total_flos': 4.68331100764416e+16, 'train_loss': 0.1606944425986232, 'epoch': 15.0})