In [17]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [7]:
import pandas as pd
import numpy as np

In [28]:
# Load the raw CSV and keep only the four columns the prompt builder reads.
df = pd.read_csv('train_set.csv')[
    [
        'question',
        'interview_question',
        'interview_answer',
        'label',
    ]
]

In [29]:
# Seeded random split: each row of `df` lands in `train` when its uniform draw
# is below 0.8, otherwise in `test`.
np.random.seed(2024)
msk = np.random.rand(len(df)) < 0.8

# Select and renumber in one expression so both frames get a fresh 0..n-1 index.
train = df[msk].reset_index(drop=True)
test = df[~msk].reset_index(drop=True)

In [26]:
import os
import torch
import torch.nn as nn
# import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset
import json
import random
import transformers
import argparse
from datasets import load_dataset

class CustomTextDataset(Dataset):
    """Dataset over raw strings that tokenizes each item when it is requested.

    Every item is a dict with flattened (1-D) 'input_ids' and 'attention_mask'
    tensors taken from the tokenizer output.
    """

    def __init__(self, texts, tokenizer):
        # Only references are stored; no tokenization happens up front.
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the idx-th text (coerced to str) and return its tensors."""
        encoded = self.tokenizer(str(self.texts[idx]), return_tensors='pt')
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }


# Collapses the eleven fine-grained labels into three coarse classes
# ("Direct Reply", "Indirect", "Direct Non-Reply"). Used when prompts are built
# without the specific labels.
mapping = {
    "1.1 Explicit": "Direct Reply",
    "1.2 Implicit": "Indirect",
    "2.1 Dodging": "Indirect",
    "2.2 Deflection": "Indirect",
    "2.3 Partial/half-answer": "Indirect",
    "2.4 General": "Indirect",
    "2.5 Contradictory": "Indirect",
    "2.6 Declining to answer": "Direct Non-Reply",
    "2.7 Claims ignorance": "Direct Non-Reply",
    "2.8 Clarification": "Direct Non-Reply",
    "2.9 Diffusion": "Indirect",
}


def load_qevasion_dataset(tokenizer, train_size = 900, annotator_ids = None, add_specific_labels = False, dataset = None):
    """Build one classification prompt per row and wrap the first `train_size` for training.

    Args:
        tokenizer: Passed to `CustomTextDataset`, which calls it on each prompt.
        train_size: Number of leading prompts placed in the returned dataset.
        annotator_ids: Optional collection of annotator ids. When given, rows whose
            "annotator_id" value is not in it are skipped; the frame must then
            contain an "annotator_id" column.
        add_specific_labels: If True, the row's raw "label" value is written into
            the prompt; otherwise the label is collapsed through `mapping`.
        dataset: Optional DataFrame with "interview_question", "interview_answer",
            "question" and "label" columns. Defaults to the notebook-level
            `train` frame, which is what the function always used before.

    Returns:
        A tuple `(texts, data)`: every generated prompt, and a
        `CustomTextDataset` over `texts[:train_size]`.

    Raises:
        KeyError: If a label is absent from `mapping` while
            `add_specific_labels` is False, or if `annotator_ids` is given and
            the frame has no "annotator_id" column.
    """
    if dataset is None:
        # Backward-compatible default: fall back to the notebook-level split.
        dataset = train

    texts = []
    for _, row in dataset.iterrows():

        if annotator_ids is not None and row["annotator_id"] not in annotator_ids:
            continue

        text = "Based on a part of the interview where the interviewer asks a set of questions, classify the type of answer the interviewee provided for the following question.\n\n ### Part of the interview ###\n" + row["interview_question"] + "\n" + row["interview_answer"] + "\n\n" + "### Question ###\n\n"

        # Either the fine-grained label as stored, or its coarse class.
        label = row["label"] if add_specific_labels else mapping[row["label"]]
        text += row["question"] + "\nLabel: " + label + "\n\n"
        texts.append(text)

    # Print one prompt as a sanity check. The second prompt is shown when it
    # exists (as before); with fewer than two prompts this no longer raises.
    if texts:
        print(texts[1] if len(texts) > 1 else texts[0])
    return texts, CustomTextDataset(texts[:train_size], tokenizer)


class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is converted to float32."""

    def forward(self, x):
        result = super().forward(x)
        return result.to(torch.float32)


def print_trainable_parameters(model):
    """
    Print how many of the model's parameter elements require gradients,
    the total element count, and the trainable share as a percentage.
    """
    # (element count, requires_grad) for every named parameter.
    counts = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, needs_grad in counts if needs_grad)
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )


def main(model_name, train_size, annotator_ids, output_model_dir, add_specific_labels):
    """Fine-tune a 4-bit causal LM with LoRA adapters on the prompts and save the result.

    Args:
        model_name: Hub id or path given to `from_pretrained` for both model and tokenizer.
        train_size: Number of prompts used for training (forwarded to `load_qevasion_dataset`).
        annotator_ids: Optional annotator-id filter (forwarded to `load_qevasion_dataset`).
        output_model_dir: Directory where the adapted model and tokenizer are saved.
            Its last path component also names the Trainer output folder
            (`outputs_<name>`).
        add_specific_labels: Whether prompts carry the fine-grained labels
            (forwarded to `load_qevasion_dataset`).
    """
    # NOTE(review): an empty string is forwarded as cache_dir; confirm whether the
    # default Hugging Face cache (cache_dir=None) was intended.
    cache_dir = ""

    # Pass the 4-bit setting through a BitsAndBytesConfig: the bare
    # `load_in_4bit=True` keyword on from_pretrained is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
        device_map='auto',
        torch_dtype=torch.float16,
        cache_dir=cache_dir
    )

    # The tokenizer is loaded once; the earlier duplicate load was discarded anyway.
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True)
    # NOTE(review): if '[PAD]' is not already in the vocabulary this adds a new
    # token id while the model's embeddings are not resized here. Confirm that
    # is acceptable for downstream use of the saved tokenizer.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    model.gradient_checkpointing_enable()  # reduce number of stored activations
    model.enable_input_require_grads()

    # Keep the LM head output in float32.
    model.lm_head = CastOutputToFloat(model.lm_head)

    config = LoraConfig(
        r=16,  # rank of the LoRA update matrices
        lora_alpha=32,  # alpha scaling
        # target_modules is left unset (e.g. ["q_proj", "v_proj"] could be given explicitly)
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"  # set this for CLM or Seq2Seq
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    # load data
    texts, data = load_qevasion_dataset(tokenizer, train_size = train_size, annotator_ids = annotator_ids, add_specific_labels=add_specific_labels)

    print (f"Found {len(data)} instances for training and {len(texts) - len(data)} instances for validation.")

    # train model
    print ("Training . . . ")
    # Last path component of the output directory. normpath makes a trailing
    # slash ("models/run/") yield "run" instead of an empty name.
    out_dir = os.path.basename(os.path.normpath(output_model_dir))
    trainer = transformers.Trainer(
        model=model,
        train_dataset=data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=1,
            warmup_steps=100,
            # One example per step per device with no accumulation, so this is
            # five times the number of training examples in optimizer steps.
            max_steps=len(data) * 5,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir=f'outputs_{out_dir}'
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()
    # Save the model
    model.save_pretrained(output_model_dir)

    # Save the tokenizer next to the model so both can be reloaded together
    tokenizer.save_pretrained(output_model_dir)


In [None]:
import sys

# Command-line entry point for running this code as a script.
# Inside a Jupyter kernel `__name__` is also "__main__", but sys.argv then holds
# the kernel's own arguments (e.g. `-f <connection file>`), which makes
# parse_args() exit. The block is therefore skipped when ipykernel is loaded;
# in a notebook, call main(...) directly instead.
if __name__ == "__main__" and "ipykernel" not in sys.modules:
    parser = argparse.ArgumentParser(description="Training script with arguments.")
    # main() cannot run without a model name and an output directory, so both are required.
    parser.add_argument("--model_name", type=str, required=True, help="Name of the model e.g. bigscience/bloom-3b")
    parser.add_argument("--train_size", type=int, help="Number of instances in  the training dataset", default = 900)
    parser.add_argument("--annotator_ids", nargs="*", type=int, default=None, help="Ids of the annotators that will be used during training. Default value None, which means all the instances will be used, regardless the annotator!")
    parser.add_argument("--output_model_dir", type=str, required=True, help="Directory to save the trained model")
    parser.add_argument("--add_specific_labels", action="store_true", help="Include this flag to indicate whether specific labels (e.g. General, Partial etc) should be added or not")

    args = parser.parse_args()

    # Echo the effective configuration before training starts.
    print(args.model_name, args.train_size, args.annotator_ids, args.output_model_dir, args.add_specific_labels)

    main(args.model_name, args.train_size, args.annotator_ids, args.output_model_dir, args.add_specific_labels)

In [34]:
# Fine-tune on every row of the training split, with the fine-grained labels
# written into the prompts; the result is saved under ./data.
main(
    model_name='meta-llama/Llama-3.1-8B',
    train_size=len(train),
    annotator_ids=None,
    output_model_dir="./data",
    add_specific_labels=True,
)

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B.
403 Client Error. (Request ID: Root=1-674653f4-5a41f875654e775055aad9a3;df1cf54b-c9e2-44e3-ae38-9271a2d6269c)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B to ask for access.

In [32]:
# Log in to the Hugging Face Hub from the shell (prompts for an access token).
# The recorded main(...) run for meta-llama/Llama-3.1-8B failed with a
# gated-repo OSError, so a token for an account that has been granted access to
# that repo is needed before the model can be downloaded.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`
