In [1]:
import os
import json
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments

In [2]:
# set the file path
train_path = '/Users/hoshea/Documents/pythonProject/NLP/train.jsonl'
test_path =  '/Users/hoshea/Documents/pythonProject/NLP/eval.jsonl'

# read the file into a pandas DataFrame
# df_train = pd.read_json(train_path,lines = True)
# df_test = pd.read_json(test_path,lines = True)

In [3]:
#pip install 'transformers[torch]' accelerate -U

In [4]:
#pip install sentencepiece

In [5]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

data_files = {"train": train_path}
dataset = load_dataset('json', data_files=data_files)

# split the data
def train_test_split(dataset, test_size=0.2):
    train_test = dataset['train'].train_test_split(test_size=test_size)
    return DatasetDict({
        'train': train_test['train'],
        'test': train_test['test']
    })
dataset = train_test_split(dataset)

# import tokenizer
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    questions = examples["Question"]
    choices = [[examples["Alternative1"][i], examples["Alternative2"][i]] for i in range(len(questions))]
    labels = [label - 1 for label in examples["Answer"]]  # Â∞Ü1/2ËΩ¨Êç¢‰∏∫0/1

    for label in labels:
        if label not in [0, 1]:
            raise ValueError(f"Unexpected label value: {label}")

    contexts = []
    for question, choice_pair in zip(questions, choices):
        for choice in choice_pair:
            contexts.append((question, choice))

    tokenized_examples = tokenizer(*zip(*contexts), truncation=True, padding='max_length', max_length=128)

    input_ids = [tokenized_examples['input_ids'][i:i+2] for i in range(0, len(tokenized_examples['input_ids']), 2)]
    attention_mask = [tokenized_examples['attention_mask'][i:i+2] for i in range(0, len(tokenized_examples['attention_mask']), 2)]

    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

# preprocess
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

# load model
model = AutoModelForMultipleChoice.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/11929 [00:00<?, ? examples/s]

Map:   0%|          | 0/2983 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Âè™‰øùÁïôÊúÄÊñ∞ÁöÑÊ£ÄÊü•ÁÇπ
    load_best_model_at_end=True,
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Áî±‰∫éÂ§öÈÄâ‰ªªÂä°ÂÜÖÂ≠òÊ∂àËÄóËæÉÂ§ßÔºåÈÄÇÂΩìÂáèÂ∞èbatch size
    num_train_epochs=3,
    weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
)

trainer.train()

# save the best model
trainer.save_model("./best_model")



Epoch,Training Loss,Validation Loss
1,0.6897,0.721564
2,0.4887,1.084918
3,0.1699,1.421945


In [7]:
import os
import json
import pandas as pd
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def load_eval_data(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    data = [json.loads(line) for line in lines]
    return data

eval_data = load_eval_data(test_path)

eval_df = pd.DataFrame(eval_data)

# import tokenizer and model
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained("./best_model")  # ‰ΩøÁî®‰øùÂ≠òÂ•ΩÁöÑÊ®°ÂûãË∑ØÂæÑ

# Preprocess for evaluation
def preprocess_function(examples):
    questions = examples["Question"]
    choices = [[examples["Alternative1"][i], examples["Alternative2"][i]] for i in range(len(questions))]

    contexts = []
    for question, choice_pair in zip(questions, choices):
        for choice in choice_pair:
            contexts.append((question, choice))

    # Tokenize
    tokenized_examples = tokenizer(*zip(*contexts), truncation=True, padding='max_length', max_length=128)

    input_ids = [tokenized_examples['input_ids'][i:i+2] for i in range(0, len(tokenized_examples['input_ids']), 2)]
    attention_mask = [tokenized_examples['attention_mask'][i:i+2] for i in range(0, len(tokenized_examples['attention_mask']), 2)]

    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
    }

# Build dataset for evaluation
eval_dataset = Dataset.from_pandas(eval_df)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_df.columns.tolist())

trainer = Trainer(model=model, tokenizer=tokenizer)

predictions = trainer.predict(eval_dataset)

predicted_logits = np.array(predictions.predictions)
predicted_labels = np.argmax(predicted_logits, axis=1).tolist()


output = pd.DataFrame({
    'ID': eval_df['Id'],
    'Target': [label + 1 for label in predicted_labels]  # Â∞ÜÊ†áÁ≠æ‰ªé0/1ËΩ¨Êç¢Âõû1/2
})

# save the result
output.to_csv('eval_predictions.csv', index=False)

print("Predictions saved to eval_predictions.csv")




Map:   0%|          | 0/4261 [00:00<?, ? examples/s]

Predictions saved to eval_predictions.csv
