In [1]:
# !pip install evaluate

# Fetching Datasets

In [1]:
#imports
import pandas as pd
import os, ipdb
import random, evaluate

import random
import string

# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset


# from ../evaluation_metrics import Metrics
seed = 42
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-09-05 23:07:18.081992: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

# Inference

In [2]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

In [3]:
ckpt = "/checkpoint-3350"

script_args.dataset_name = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
# script_args.output_dir = f"../model_ckpt/docteat_tdm_f2_all_template{ckpt}"
script_args.output_dir = f"../model_ckpt{ckpt}"
script_args.run_name = "sft_llama2_docteat_tdm_f2_all_Template"
script_args.seq_length = 1024
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2
# multi GPU


# script_args.dataset_name = "../data/LLLM_LONG_TDMS_ALL_TEMPLATE/fold1"
# script_args.output_dir = f"../model_ckpt/long_tdms_f1_all_template{ckpt}"
# script_args.run_name = "sft_llama2_long_tdms_f1_all_Template"
# script_args.seq_length = 2400
# script_args.per_device_train_batch_size = 1
# script_args.gradient_accumulation_steps = 1

script_args.save_steps = 50
script_args.logging_steps = 50
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 10

In [4]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""
    # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
    text = f"{example['prompt']}\n{example['answer']}"
    return text


def create_datasets(tokenizer, args):
    # dataset = load_dataset(
    #     args.dataset_name,
    #     data_dir=args.subset,
    #     split=args.split,
    #     use_auth_token=True,
    #     num_proc=args.num_workers if not args.streaming else None,
    #     streaming=args.streaming,
    # )
    
    dataset = DatasetDict.load_from_disk(f"{args.dataset_name}")
    dataset = dataset.shuffle(seed=seed)
    
    # if args.streaming:
    #     print("Loading the dataset in streaming mode")
    #     valid_data = dataset.take(args.size_valid_set)
    #     train_data = dataset.skip(args.size_valid_set)
    #     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=None)
    # else:
    
    # dataset = dataset.train_test_split(test_size=0.005, seed=None)
    train_data = dataset["train"]
    valid_data = dataset["validation"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=400)
    # chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=len(train_data)//2)
    # 3.70
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
script_args.output_dir

'../model_ckpt/checkpoint-3350'

In [6]:
model = AutoPeftModelForCausalLM.from_pretrained(
    script_args.output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    token="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",
)
# # model = model.cpu()
# model = model.to("cpu")

# # torch.cuda.empty_cache()
# # # model.generate()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
dataset = DatasetDict.load_from_disk(f"{script_args.dataset_name}")
    
    # if args.streaming:
    #     print("Loading the dataset in streaming mode")
    #     valid_data = dataset.take(args.size_valid_set)
    #     train_data = dataset.skip(args.size_valid_set)
    #     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=None)
    # else:
    
# dataset = dataset.train_test_split(test_size=0.005, seed=None)
train_data = dataset["train"].shuffle(seed=42)
valid_data = dataset["validation"].shuffle(seed=42)

train_data[0]

{'prompt': 'Making the Invisible Visible: Action Recognition Through Walls and Occlusions Understanding people\'s actions and interactions typically depends on seeing them. Automating the process of action recognition from visual data has been the topic of much research in the computer vision community. But what if it is too dark, or if the person is occluded or behind a wall? In this paper, we introduce a neural network model that can detect human actions through walls and occlusions, and in poor lighting conditions. Our model takes radio frequency (RF) signals as input, generates 3D human skeletons as an intermediate representation, and recognizes actions and interactions of multiple people overtime. By translating the input to an intermediate skeleton-based representation, our model can learn from both vision-based and RF-based datasets, and allow the two tasks to help each other. We show that our model achieves comparable accuracy to vision-based action recognition systems in visib

In [8]:
valid_data[10]

{'prompt': 'Read this: Temporal Segment Networks for Action Recognition in Videos Deep convolutional networks have achieved great success for image recognition. However, for action recognition in videos, their advantage over traditional methods is not so evident. We present a general and flexible video-level framework for learning action models in videos. This method, called temporal segment network (TSN), aims to model long-range temporal structures with anew segment-based sampling and aggregation module. This unique design enables our TSN to efficiently learn action models by using the whole action videos. The learned models could be easily adapted for action recognition in both trimmed and untrimmed videos with simple average pooling and multi-scale temporal window integration, respectively. We also study a series of good practices for the instantiation of temporal segment network framework given limited training samples. Our approach obtains the state-the-of-art performance on four

In [9]:
df_valid_data = valid_data.to_pandas()
df_valid_data.tail()

Unnamed: 0,prompt,answer,__index_level_0__
35275,Read this: Applying Physical-layer Network Cod...,unanswerable,34957
35276,Understanding Humans in Crowded Scenes: Deep N...,[{'LEADERBOARD': {'Task': 'Multi-Human Parsing...,4802
35277,UFPR-Periocular: A Periocular Dataset Collecte...,"[{'LEADERBOARD': {'Task': 'Face Recognition', ...",5668
35278,Pose-driven Deep Convolutional Model for Perso...,[{'LEADERBOARD': {'Task': 'Person Re-Identific...,22113
35279,Building Language Models for Text with Named E...,"[{'LEADERBOARD': {'Task': 'Code Generation', '...",9864


In [10]:
str(df_valid_data.at[35275, 'prompt'])

'Read this: Applying Physical-layer Network Coding in Wireless Networks A main distinguishing feature of a wireless network compared with a wired network is its broadcast nature, in which the signal transmitted by anode may reach several other nodes, and anode may receive signals from several other nodes, simultaneously. Rather than a blessing, this feature is treated more as an interference-inducing nuisance inmost wireless networks today (e.g., IEEE 802.11). This paper shows that the concept of network coding can be applied at the physical layer to turn the broadcast property into a capacity-boosting advantage in wireless ad hoc networks. Specifically, we propose a physical-layer network coding (PNC) scheme to coordinate transmissions among nodes. In contrast to "straightforward" network coding which performs coding arithmetic on digital bit streams after they have been received, PNC makes use of the additive nature of simultaneously arriving electromagnetic (EM) waves for equivalent

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(
    script_args.output_dir,
    token="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",
)

# tokenizer.cpu()

# tokenizer = AutoTokenizer.from_pretrained(model_name)

# inputs = tokenizer.encode(f"{train_data[100]['prompt'][-7000:]}", return_tensors="pt").to(device)
# inputs = tokenizer.encode(f"{train_data[0]['prompt']}", return_tensors="pt").to(device)
inputs = tokenizer.encode(f"Question: {valid_data[10]['prompt']}", return_tensors="pt").to(device)

# text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
# generate_kwargs = dict(
#     input_ids=inputs,
#     temperature=0.2, 
#     top_p=0.90, 
#     top_k=40,
#     max_new_tokens=script_args.seq_length,
#     repetition_penalty=1.1
# )

generate_kwargs = dict(
    input_ids=inputs,
    temperature=0.2, 
    top_p=0.95, 
    top_k=40,
    max_new_tokens=500,
    repetition_penalty=1.3
)

outputs = model.generate(**generate_kwargs)
predictions = tokenizer.decode(outputs[0])
print(predictions.split("Answer: ")[-1])

[{'LEADERBOARD': {'Task': 'Action Classification', 'Dataset': 'ActivityNet Challenge', 'Metric': 'mAP'}}, {'LEADERBOARD': {'Task': 'Video Classification', 'Dataset': 'YouTubeVIDEO', 'Metric': 'Top-1 Accuracy'}}]</s>


In [12]:
predictions

'<s> Question: Read this: Temporal Segment Networks for Action Recognition in Videos Deep convolutional networks have achieved great success for image recognition. However, for action recognition in videos, their advantage over traditional methods is not so evident. We present a general and flexible video-level framework for learning action models in videos. This method, called temporal segment network (TSN), aims to model long-range temporal structures with anew segment-based sampling and aggregation module. This unique design enables our TSN to efficiently learn action models by using the whole action videos. The learned models could be easily adapted for action recognition in both trimmed and untrimmed videos with simple average pooling and multi-scale temporal window integration, respectively. We also study a series of good practices for the instantiation of temporal segment network framework given limited training samples. Our approach obtains the state-the-of-art performance on f

# Compute metrics

In [13]:
len(valid_data)

35280

In [None]:
labels = []
preds = []
for i, valid_ex in tqdm(enumerate(valid_data), total=len(valid_data)):
    # if i <= 100 :
    #     continue 
        
    inputs = tokenizer.encode(f"Question: {valid_ex['prompt']}", return_tensors="pt").to(device)

    # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
    # generate_kwargs = dict(
    #     input_ids=inputs,
    #     temperature=0.2, 
    #     top_p=0.90, 
    #     top_k=40,
    #     max_new_tokens=script_args.seq_length,
    #     repetition_penalty=1.1
    # )
    
    generate_kwargs = dict(
        input_ids=inputs,
        temperature=0.2, 
        top_p=0.95, 
        top_k=40,
        max_new_tokens=500,
        repetition_penalty=1.3
    )
    
    outputs = model.generate(**generate_kwargs)
    predictions = tokenizer.decode(outputs[0])

    labels.append(valid_ex['answer'])
    preds.append(predictions.split("Answer: ")[-1] if len(predictions.split("Answer: ")) == 2 else "")

    if i >= 10000:
        break
    
    # ipdb.set_trace()

  2%|▏         | 877/35280 [49:53<27:31:17,  2.88s/it] 

In [None]:
len(labels)

In [None]:
len(preds)

In [None]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
                    references=[1 if "unanswerable" == x else 0 for x in labels]
                    # references=[1 for df['answer'].tolist()]
)


In [None]:
rouge = evaluate.load('rouge')

results = rouge.compute(
    predictions=[pred.replace("</s>", "") for pred in preds],
    # predictions=preds,
    references=labels
)
results

In [60]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
                    references=[1 if "unanswerable" == x else 0 for x in labels]
                    # references=[1 for df['answer'].tolist()]
)


{'accuracy': 0.9325,
 'f1': 0.9102990033222591,
 'precision': 0.9256756756756757,
 'recall': 0.8954248366013072}

In [61]:
rouge = evaluate.load('rouge')

results = rouge.compute(
    predictions=[pred.replace("</s>", "") for pred in preds],
    # predictions=preds,
    references=labels
)
results

{'rouge1': 0.573803131539657,
 'rouge2': 0.1328034207469432,
 'rougeL': 0.5677132015381601,
 'rougeLsum': 0.566594144394203}

In [52]:
results

{'rouge1': 0.5480447617783779,
 'rouge2': 0.1553206013465882,
 'rougeL': 0.5392013741283679,
 'rougeLsum': 0.5402437835863861}

In [44]:
i = 0
for x in tqdm(preds):
    if "unanswerable" in x:
        ipdb.set_trace()
    else:
        i+=1
        
        # print(x)

  0%|          | 0/101 [00:00<?, ?it/s]

> [0;32m/tmp/ipykernel_423316/1180858725.py[0m(2)[0;36m<module>[0;34m()[0m
[0;32m      1 [0;31m[0mi[0m [0;34m=[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 2 [0;31m[0;32mfor[0m [0mx[0m [0;32min[0m [0mtqdm[0m[0;34m([0m[0mpreds[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      3 [0;31m    [0;32mif[0m [0;34m"unanswerable"[0m [0;32min[0m [0mx[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  print(i)


6


ipdb>  print(x)


unanswerable</s>


ipdb>  q


  6%|▌         | 6/101 [00:36<09:41,  6.12s/it]
