In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
from underthesea import word_tokenize

from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import datasets
device = "cuda" if torch.cuda.is_available() else "cpu"


import re

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [2]:
class config:
    train_path = 'data/train.csv'
    model_name = '/media/bbsw/Data1/Hung-ws/lazy/v2/models/dou/model1_v1/checkpoint-500'
    max_length = 256
    batch_size = 64
    num_workers = 32

In [3]:
df = pd.read_csv('data/train.csv')

In [4]:
label2id = {'SUPPORTED': 0, 'NEI': 1}
id2label = {0: 'SUPPORTED', 1: 'NEI'}

In [5]:
df['verdict_label'] = df['verdict'].apply(lambda x: 1 if x == 'NEI' else 0)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2, id2label=id2label, label2id=label2id).to(device)

In [7]:
df_train, df_val = train_test_split(df, test_size=0.05, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
train_dataset = datasets.Dataset.from_pandas(df_train)
val_dataset = datasets.Dataset.from_pandas(df_val)

In [8]:
def PreprocessDataset(examples):    
    inputs =  tokenizer(
            text=examples['top_tfdif'],
            text_pair=examples['claim_tokenizer'],
            max_length=config.max_length,
            padding='max_length',
            truncation='only_first',
            return_tensors='pt',
            
        )
    labels = examples['verdict_label']
    inputs.update({'labels': labels})
    return inputs

In [9]:
train_datasets = train_dataset.map(PreprocessDataset, batched=True, batch_size=64,remove_columns=train_dataset.column_names)
valid_datasets = val_dataset.map(PreprocessDataset, batched=True,  batch_size=64,remove_columns=train_dataset.column_names)

Map:   0%|          | 0/35423 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')  # Có thể thay 'weighted' bằng 'micro', 'macro', hoặc None tùy vào yêu cầu của bạn
    
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }


In [12]:
training_args = TrainingArguments(
    output_dir='models/dou/model1_v2',          # output directory
    num_train_epochs=15,              # total number of training epochs
    learning_rate=1e-5,              # learning rate
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    # gradient_accumulation_steps=2,   # Number of updates steps to accumulate before performing a backward/update pass.
    warmup_steps=250,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=250,
    eval_steps=250,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    fp16=True,
    metric_for_best_model='f1_score',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1 Score
250,0.5593,0.326245,0.875067,0.875067
500,0.2791,0.231517,0.908847,0.908847
750,0.2148,0.214125,0.916354,0.916354
1000,0.1661,0.208443,0.91689,0.91689
1250,0.1352,0.20397,0.928686,0.928686
1500,0.1168,0.215173,0.925469,0.925469
1750,0.0904,0.217293,0.934048,0.934048
2000,0.0757,0.238713,0.934584,0.934584
2250,0.0616,0.243153,0.933512,0.933512
2500,0.0539,0.243001,0.939946,0.939946




In [None]:
trainer.save_model('models/dou/model/model1')