In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence

# Prefer the GPU when PyTorch can see one; otherwise run everything on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Directory that receives the train/validation/test CSV exports.
dataDir = './assignment-2/'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dataDir, exist_ok=True)


# The raw CSV has no header row, so every line is read as data.
data = pd.read_csv(
    '5000TravelQuestionsDataset.csv',
    encoding='ISO-8859-1',
    header=None,
)

# Name the three positional columns and show them for verification.
data.columns = ['Question', 'Coarse', 'Fine']
print("Dataframe columns:", data.columns)

# Drop the fine-grained label; the coarse label becomes 'Category'.
data = data[['Question', 'Coarse']].rename(columns={'Coarse': 'Category'})

# Keep only rows whose label is one of the seven coarse categories.
allowedCategories = ['TTD', 'TGU', 'ACM', 'TRS', 'WTH', 'FOD', 'ENT']
data = data.loc[data['Category'].isin(allowedCategories)].reset_index(drop=True)

# Report the dataset size after filtering.
print(f"Total data size after filtering: {len(data)}")

# Hold out 1000 stratified rows; everything else is training data.
trainData, tempData = train_test_split(
    data,
    test_size=1000,
    stratify=data['Category'],
    random_state=42,
)
# Of the held-out rows, 700 become the test set and the rest validation.
valData, testData = train_test_split(
    tempData,
    test_size=700,
    stratify=tempData['Category'],
    random_state=42,
)

for splitLabel, splitFrame in (
    ("Training", trainData),
    ("Validation", valData),
    ("Test", testData),
):
    print(f"{splitLabel} set size: {len(splitFrame)}")

# Convert dataset to instruction format
def convertToInstructionFormat(df):
    """Return an instruction-style view of a question/category frame.

    Args:
        df: DataFrame with 'Question' and 'Category' columns. It is not
            modified.

    Returns:
        A new DataFrame with the same index and exactly three columns:
        'instruction' (the same fixed task description on every row),
        'input' (the question text) and 'output' (the category label).
    """
    instructionText = 'Please determine the category (TTD, TGU, ACM, TRS, WTH, FOD, ENT) based on the content of the following question:'
    # assign() works on a copy, so the caller's frame stays untouched.
    withFields = df.assign(
        instruction=instructionText,
        input=df['Question'],
        output=df['Category'],
    )
    return withFields[['instruction', 'input', 'output']]

# Reshape each split into instruction/input/output columns.
trainDataConverted, valDataConverted, testDataConverted = (
    convertToInstructionFormat(splitFrame)
    for splitFrame in (trainData, valData, testData)
)

# Save the converted splits as CSV files under dataDir.
for csvName, convertedFrame in (
    ('train_data.csv', trainDataConverted),
    ('val_data.csv', valDataConverted),
    ('test_data.csv', testDataConverted),
):
    convertedFrame.to_csv(os.path.join(dataDir, csvName), index=False)

print("Data has been saved to the specified directory.")

# Model Selection



# Load the pretrained tokenizer and causal LM, then move the model to the
# selected device.
modelName = 'facebook/opt-350m'
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForCausalLM.from_pretrained(modelName)
model = model.to(device)

# Report a few basic facts about the loaded checkpoint.
for factName, factValue in (
    ("Tokenizer", tokenizer.name_or_path),
    ("Vocabulary size", len(tokenizer)),
    ("Model parameters", model.num_parameters()),
    ("Model max length", model.config.max_position_embeddings),
):
    print(f"{factName}: {factValue}")



# Build the prompt template
def generatePrompt(question, examples=None):
    """Build the classification prompt for one question.

    Args:
        question: Text of the question to classify.
        examples: Optional iterable of few-shot demonstrations, each a
            mapping with 'input' (question) and 'output' (category) keys.
            None or an empty sequence yields a zero-shot prompt.

    Returns:
        The task description, one "Question/Answer" pair per demonstration,
        and finally the target question followed by an open "Answer:".
    """
    # None replaces the former mutable default ([]), which would have been
    # shared between calls.
    demonstrations = () if examples is None else examples
    header = "Please determine the category (TTD, TGU, ACM, TRS, WTH, FOD, ENT) based on the content of the following question:\n\n"
    shots = [
        f"Question: {ex['input']}\nAnswer: {ex['output']}\n\n"
        for ex in demonstrations
    ]
    return header + "".join(shots) + f"Question: {question}\nAnswer:"

# Zero-shot Testing

def zeroShotTest(testData):
    """Measure zero-shot classification accuracy of the global model.

    Each question is wrapped by generatePrompt (no demonstrations), the
    model generates up to 10 new tokens, and the first whitespace-delimited
    word of the continuation is compared with the gold label.

    Args:
        testData: DataFrame with 'input' (question) and 'output' (gold
            category) columns.

    Returns:
        Fraction of rows predicted correctly, or 0.0 when testData has no
        rows.
    """
    # Nothing to evaluate: avoid dividing by zero below.
    if len(testData) == 0:
        return 0.0

    correct = 0
    total = 0
    for question, goldLabel in zip(testData['input'], testData['output']):
        prompt = generatePrompt(question)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        # NOTE: sampling is enabled, so repeated runs can report slightly
        # different accuracies.
        outputs = model.generate(
            **inputs, max_new_tokens=10, temperature=0.7, top_p=0.9, do_sample=True
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        promptLength = inputs['input_ids'].shape[1]
        response = tokenizer.decode(outputs[0][promptLength:], skip_special_tokens=True)
        # The continuation can be empty or pure whitespace (e.g. immediate
        # EOS); count that as a wrong answer instead of raising IndexError.
        words = response.split()
        predictedCategory = words[0] if words else ''
        if predictedCategory == goldLabel:
            correct += 1
        total += 1
        if total % 100 == 0:
            print(f"Processed {total} samples...")
    return correct / total

# Few-shot Testing

def fewShotTest(testData, k):
    """Measure k-shot classification accuracy of the global model.

    k demonstrations are sampled once (unseeded) from trainDataConverted
    and reused in the prompt of every test question. The first
    whitespace-delimited word of the generated continuation is compared
    with the gold label.

    Args:
        testData: DataFrame with 'input' (question) and 'output' (gold
            category) columns.
        k: Number of demonstrations to draw from the training split.

    Returns:
        Fraction of rows predicted correctly, or 0.0 when testData has no
        rows.
    """
    # Nothing to evaluate: avoid dividing by zero below.
    if len(testData) == 0:
        return 0.0

    correct = 0
    total = 0
    examples = trainDataConverted.sample(k).to_dict('records')
    for question, goldLabel in zip(testData['input'], testData['output']):
        prompt = generatePrompt(question, examples)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        # NOTE: sampling is enabled, so repeated runs can report slightly
        # different accuracies.
        outputs = model.generate(
            **inputs, max_new_tokens=10, temperature=0.7, top_p=0.9, do_sample=True
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        promptLength = inputs['input_ids'].shape[1]
        response = tokenizer.decode(outputs[0][promptLength:], skip_special_tokens=True)
        # The continuation can be empty or pure whitespace (e.g. immediate
        # EOS); count that as a wrong answer instead of raising IndexError.
        words = response.split()
        predictedCategory = words[0] if words else ''
        if predictedCategory == goldLabel:
            correct += 1
        total += 1
        if total % 100 == 0:
            print(f"Processed {total} samples...")
    return correct / total

# Compute and report accuracy

# Baseline without any demonstrations in the prompt.
print("Starting zero-shot testing...")
zeroShotAccuracy = zeroShotTest(testDataConverted)
print(f"Zero-shot Accuracy: {zeroShotAccuracy * 100:.2f}%")

# Same evaluation with 1 and then 3 demonstrations, in that order.
fewShotAccuracies = {}
for shotCount in (1, 3):
    print(f"Starting {shotCount}-shot testing...")
    fewShotAccuracies[shotCount] = fewShotTest(testDataConverted, k=shotCount)
    print(f"{shotCount}-shot Accuracy: {fewShotAccuracies[shotCount] * 100:.2f}%")

oneShotAccuracy = fewShotAccuracies[1]
threeShotAccuracy = fewShotAccuracies[3]

# Supervised Fine-Tuning SFT

# Prepare fine-tuning data
# The splits carry a shuffled pandas index; preserve_index=False keeps it from
# being stored as a stray '__index_level_0__' column in the datasets.
trainDataset = Dataset.from_pandas(trainDataConverted, preserve_index=False)
valDataset = Dataset.from_pandas(valDataConverted, preserve_index=False)

# Define the data processing function
def preprocessFunction(examples):
    """Tokenize a batch of instruction examples for causal-LM fine-tuning.

    Args:
        examples: Batch dict with parallel 'instruction', 'input' and
            'output' lists of strings.

    Returns:
        Dict with 'input_ids' and 'labels', each a list of unpadded token-id
        lists of at most 512 tokens. Prompt positions in 'labels' are set to
        -100 (the index PyTorch's cross-entropy loss ignores by default), so
        only the answer tokens contribute to the loss.
    """
    maxLength = 512

    # NOTE(review): this prompt uses a single newline after the instruction
    # and no special tokens, while generatePrompt() uses a blank line and the
    # evaluation code tokenizes with default special tokens. Confirm whether
    # the two formats should be aligned.
    prompts = [
        instruction + '\nQuestion: ' + inp + '\nAnswer:'
        for instruction, inp in zip(examples['instruction'], examples['input'])
    ]
    # The EOS marker teaches the model to stop right after the label.
    answers = [out + tokenizer.eos_token for out in examples['output']]

    tokenizedInputs = tokenizer(
        prompts,
        add_special_tokens=False,
        truncation=True,
        max_length=maxLength,
        padding=False,
    )
    tokenizedAnswers = tokenizer(
        answers,
        add_special_tokens=False,
        truncation=True,
        max_length=10,
        padding=False,
    )

    inputIds = []
    labels = []
    for promptIds, answerIds in zip(tokenizedInputs['input_ids'], tokenizedAnswers['input_ids']):
        # Trim the prompt, never the answer: cutting the concatenation at
        # maxLength would drop the only supervised tokens of a long example
        # and leave it with all labels set to -100.
        promptIds = promptIds[:maxLength - len(answerIds)]

        inputIds.append(promptIds + answerIds)
        labels.append([-100] * len(promptIds) + answerIds)

    # Sequences stay unpadded; padding happens per batch in the collator.
    return {'input_ids': inputIds, 'labels': labels}

# Process datasets
# Tokenize both splits in batches, dropping the raw text columns so that only
# the fields returned by preprocessFunction remain.
trainDataset, valDataset = (
    rawSplit.map(
        preprocessFunction,
        batched=True,
        remove_columns=rawSplit.column_names,
    )
    for rawSplit in (trainDataset, valDataset)
)

# Define a custom data collator
def dataCollator(features):
    """Pad a list of tokenized examples into one batch of tensors.

    Args:
        features: List of dicts, each with 'input_ids' and 'labels' lists of
            token ids (as returned by preprocessFunction).

    Returns:
        Dict of long tensors shaped (batch, longest sequence in the batch):
        'input_ids' right-padded with the tokenizer's pad id, 'labels'
        right-padded with -100, and 'attention_mask' with 1 on real tokens
        and 0 on padding.
    """
    inputIds = [torch.tensor(f['input_ids'], dtype=torch.long) for f in features]
    labels = [torch.tensor(f['labels'], dtype=torch.long) for f in features]

    # Build the mask from the true lengths, not by comparing ids with the pad
    # id: a real token that happens to share the pad id (e.g. when pad and
    # EOS are the same token) must still be attended to.
    attentionMask = pad_sequence(
        [torch.ones(len(ids), dtype=torch.long) for ids in inputIds],
        batch_first=True,
        padding_value=0,
    )

    # Pad sequences on the right up to the longest example in the batch.
    inputIds = pad_sequence(inputIds, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {'input_ids': inputIds, 'labels': labels, 'attention_mask': attentionMask}

# Set training parameters
perDeviceBatchSize = 8
useHalfPrecision = torch.cuda.is_available()  # Use fp16 if available

trainingArgs = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir='./results',
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=1,  # You can increase the number of training epochs as needed
    learning_rate=5e-5,
    per_device_train_batch_size=perDeviceBatchSize,
    per_device_eval_batch_size=perDeviceBatchSize,
    fp16=useHalfPrecision,
    # Evaluate and save once per epoch; log every 100 steps.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=100,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=trainingArgs,
    # Batches are padded by the custom collator defined in this file.
    data_collator=dataCollator,
    train_dataset=trainDataset,
    eval_dataset=valDataset,
)

# Run fine-tuning with the configuration above.
print("Starting model fine-tuning...")
trainer.train()

# Evaluate the fine-tuned model on the test set

def sftTest(testData):
    """Measure classification accuracy of the fine-tuned global model.

    Each question is wrapped by generatePrompt (no demonstrations), the
    model generates up to 10 new tokens, and the first whitespace-delimited
    word of the continuation is compared with the gold label.

    Args:
        testData: DataFrame with 'input' (question) and 'output' (gold
            category) columns.

    Returns:
        Fraction of rows predicted correctly, or 0.0 when testData has no
        rows.
    """
    # Nothing to evaluate: avoid dividing by zero below.
    if len(testData) == 0:
        return 0.0

    # Make sure dropout is disabled for evaluation, whatever mode the
    # preceding training run left the model in.
    model.eval()

    correct = 0
    total = 0
    for question, goldLabel in zip(testData['input'], testData['output']):
        prompt = generatePrompt(question)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        # NOTE: sampling is enabled, so repeated runs can report slightly
        # different accuracies.
        outputs = model.generate(
            **inputs, max_new_tokens=10, temperature=0.7, top_p=0.9, do_sample=True
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        promptLength = inputs['input_ids'].shape[1]
        response = tokenizer.decode(outputs[0][promptLength:], skip_special_tokens=True)
        # The continuation can be empty or pure whitespace (e.g. immediate
        # EOS); count that as a wrong answer instead of raising IndexError.
        words = response.split()
        predictedCategory = words[0] if words else ''
        if predictedCategory == goldLabel:
            correct += 1
        total += 1
        if total % 100 == 0:
            print(f"Processed {total} samples...")
    return correct / total

# Score the fine-tuned model on the held-out test split.
print("Evaluating the fine-tuned model...")
sftAccuracy = sftTest(testDataConverted)
print(f"SFT Accuracy: {sftAccuracy * 100:.2f}%")

# Final Results

print("\nFinal Results:")
for settingName, settingAccuracy in (
    ("Zero-shot", zeroShotAccuracy),
    ("1-shot", oneShotAccuracy),
    ("3-shot", threeShotAccuracy),
    ("SFT", sftAccuracy),
):
    print(f"{settingName} Accuracy: {settingAccuracy * 100:.2f}%")


Using device: cuda
Dataframe columns: Index(['Question', 'Coarse', 'Fine'], dtype='object')
Total data size after filtering: 4994
Training set size: 3994
Validation set size: 300
Test set size: 700
Data has been saved to the specified directory.
Tokenizer: facebook/opt-350m
Vocabulary size: 50265
Model parameters: 331196416
Model max length: 2048
Starting zero-shot testing...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
Zero-shot Accuracy: 0.00%
Starting 1-shot testing...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
1-shot Accuracy: 19.57%
Starting 3-shot testing...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
3-shot Accuracy: 23.29%

Map:   0%|          | 0/3994 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Starting model fine-tuning...




Epoch,Training Loss,Validation Loss
1,0.1669,0.120324


Evaluating the fine-tuned model...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
SFT Accuracy: 86.86%

Final Results:
Zero-shot Accuracy: 0.00%
1-shot Accuracy: 19.57%
3-shot Accuracy: 23.29%
SFT Accuracy: 86.86%
