In [25]:
# %%capture

# # This command installs several Python packages that are necessary for fine-tuning and optimizing language models:
!pip install accelerate       # A PyTorch extension for distributed training and performance optimization.
!pip install peft             # PEFT (Parameter-efficient Fine-tuning): Techniques for efficient fine-tuning of models.
!pip install bitsandbytes     # Library for efficient training of deep learning models, focusing on memory and speed optimization.
!pip install transformers     # Provides thousands of pre-trained models to perform tasks on texts such as classification, information extraction, and more.
!pip install trl              # Token-level Reinforcement Learning for training language models.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable




In [26]:
# Show the interpreter this kernel is actually running. `!which python` only reports
# the first `python` on the subshell's PATH, which can differ from the kernel's
# interpreter and therefore from where the packages get imported.
import sys
sys.executable

/sw/pkgs/arc/python3.9-anaconda/2021.11/bin/python


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
# Importing essential modules and functions from various libraries:
import os                        # os module: Provides a way of using operating system dependent functionality.
import torch                     # PyTorch: A deep learning framework for tensor computation and automatic differentiation.

# datasets: A library from Hugging Face for loading and processing datasets.
from datasets import load_dataset

# AutoModelForCausalLM: Loads a model for causal language modeling (like GPT) from a pre-trained model.
# AutoTokenizer: Loads the tokenizer that corresponds to a pre-trained model.
# BitsAndBytesConfig: Configuration for bitsandbytes to optimize model training.
# TrainingArguments: Defines training parameters.
# pipeline: Simplifies the process of making predictions with models.
# logging: Used for logging events in the transformers library.
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

# LoraConfig: Configuration for LoRA (Low-Rank Adaptation), a technique for parameter-efficient training.
from peft import LoraConfig

# SFTTrainer: Supervised fine-tuning trainer from the trl (Transformer Reinforcement Learning) library.
from trl import SFTTrainer


In [2]:
from datasets import DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Loading dataset

In [63]:
# Load the labelled corpus; later cells read its Text, Country, Source and Label columns.
df = pd.read_csv('data/all_data.csv')

In [64]:
# Drop every row that has a missing value in any column (pandas' default `how='any'`).
df = df.dropna()



> **TODO:** Review the prompt wording; it may need some tuning. In particular, decide whether a system prompt should come before or after the `<HUMAN>` tag.


In [65]:
def create_prompt_and_response(row):
    """Build one training example: the instruction followed by the expected answer.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Text', 'Country', 'Source' and 'Label'.

    Returns:
        A single string of the form
        "<HUMAN>: <instruction> <text>\\n<ASSISTANT>: <country>, <source>, <label>",
        where <label> is "inspirational" only when row['Label'] == 1 and
        "non-inspirational" for any other value.
    """
    if row['Label'] == 1:
        verdict = "inspirational"
    else:
        verdict = "non-inspirational"

    # The instruction is one long sentence; it is split across literals only for readability.
    question = (
        "<HUMAN>: Please classify the following text into its relevant country (india or uk), "
        "source of data (real or generated), and whether it is inspirational or not: "
        f"{row['Text']}"
    )
    answer = f"<ASSISTANT>: {row['Country']}, {row['Source']}, {verdict}"
    return question + "\n" + answer

In [66]:
# Build the prompt/response string for every row and attach it as the
# 'Prompt_Response' column.
prompt_responses = df.apply(create_prompt_and_response, axis=1)
df = df.assign(Prompt_Response=prompt_responses)

In [67]:
# Inspect the frame, now including the generated 'Prompt_Response' column.
df

Unnamed: 0,Text,Country,Source,Label,Prompt_Response
0,Palaeontologist.,uk,real,0,<HUMAN>: Please classify the following text in...
1,\n> We have the capability of being better. B...,uk,real,1,<HUMAN>: Please classify the following text in...
2,"Only just noticed it myself, ha! Or Ha, accord...",uk,real,0,<HUMAN>: Please classify the following text in...
3,Mate all he does is play football and misses p...,uk,real,1,<HUMAN>: Please classify the following text in...
4,"Me neither. But then, I'm not sure Cleese woul...",uk,real,0,<HUMAN>: Please classify the following text in...
...,...,...,...,...,...
5995,"Often, I find my inspiration in the diverse an...",india,generated,1,<HUMAN>: Please classify the following text in...
5996,I often find inspiration in people overcoming ...,india,generated,1,<HUMAN>: Please classify the following text in...
5997,What inspires me most is the sheer determinati...,india,generated,1,<HUMAN>: Please classify the following text in...
5998,"Just the other day, I watched a fascinating do...",india,generated,1,<HUMAN>: Please classify the following text in...


In [68]:
# Only the combined prompt/response string is needed for training, so the raw
# input columns are discarded.
df = df.drop(columns=['Text', 'Country', 'Source', 'Label'])

In [69]:
# Expose the prompt/response string under the name 'Text', the field the trainer reads.
df = df.rename({'Prompt_Response': 'Text'}, axis='columns')

In [70]:
# Split the DataFrame into train and test sets.
# Few-shot setting: test_size=0.9 holds out 90% of the rows for evaluation and keeps
# only the remaining 10% for fine-tuning. random_state fixes the shuffle so the
# split is reproducible.
# df_train, df_test = train_test_split(df, test_size=0.2, random_state=42) #default split
df_train, df_test = train_test_split(df, test_size=0.9, random_state=42) # few-shot

In [71]:
# Convert the training DataFrame to a Hugging Face Dataset. preserve_index=False
# keeps the pandas index from being carried over as an extra column.
dataset = Dataset.from_pandas(df_train, preserve_index=False)

# Create a DatasetDict object with a single 'train' split
data = DatasetDict({'train': dataset})

# Print the summary of the dataset dict (features and row count)
print(data)

DatasetDict({
    train: Dataset({
        features: ['Text'],
        num_rows: 599
    })
})


# Training

In [72]:
# Identifier of the starting checkpoint on the Hugging Face hub.
# The commented alternatives are previously fine-tuned checkpoints that can be
# loaded instead of the stock chat model.
base_model = "NousResearch/Llama-2-7b-chat-hf"
# base_model = "reecursion/Llama-2-7b-multicultural-inspiration" #default fine-tuned
# base_model = "reecursion/Llama-2-7b-multicultural-inspiration-fewshot" #few-shot fine-tuned

In [73]:
# Name of the fine-tuned model. It is passed to save_pretrained() further down,
# where it is used as a (nested) local output directory.
new_model = "reecursion/Llama-2-7b-multicultural-inspiration-fewshot"

In [74]:
# Dtype used for computation on the quantized layers: the weights are stored in
# 4 bits, while the arithmetic itself runs in 16-bit floating point.
compute_dtype = torch.float16

# Configure bitsandbytes quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load the model weights in 4-bit precision to shrink the memory footprint.
    bnb_4bit_quant_type="nf4",  # "nf4" = 4-bit NormalFloat, the 4-bit data type introduced with QLoRA.
    bnb_4bit_compute_dtype=compute_dtype,  # Dtype for computation (float16, defined above).
    bnb_4bit_use_double_quant=False,  # Do not apply nested (double) quantization to the quantization constants.
)

In [75]:
# Load the pre-trained causal LM identified by 'base_model' from the Hugging Face Model Hub.
# 'quantization_config' applies the 4-bit settings defined earlier.
# 'device_map={"": 0}' places the whole model (the "" root module) on device 0.
# Note: Adjust this according to your hardware setup, especially if you don't have a GPU or have a different configuration.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)

# Turn off the key/value attention cache in the model config.
# The cache only speeds up autoregressive generation; it is not needed while training.
model.config.use_cache = False

# 'pretraining_tp' is a Llama configuration option (tensor-parallel degree used during pre-training).
# Per the Transformers LlamaConfig documentation, values above 1 make the linear layers run as
# sliced computations to mimic pre-training numerics; 1 selects the ordinary code path.
# NOTE(review): confirm against the installed transformers version.
model.config.pretraining_tp = 1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [76]:
# Load the tokenizer that matches the base model from Hugging Face's Model Hub.
# 'trust_remote_code=True' permits executing custom tokenizer code shipped in the model repository.
# NOTE(review): this runs third-party code from the hub and is probably unnecessary for a stock
# Llama tokenizer — confirm and drop the flag if so.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Reuse the end-of-sequence (eos) token as the padding token so batches can be padded.
# NOTE(review): presumably needed because this tokenizer defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Pad on the right: shorter sequences in a batch get padding tokens appended at the end
# until they match the longest sequence's length.
tokenizer.padding_side = "right"

In [77]:
# Configure PEFT using LoRA for efficient fine-tuning of the model.
peft_params = LoraConfig(
    lora_alpha=16,    # LoRA scaling factor (not a learning rate): the low-rank update is scaled by lora_alpha / r, i.e. 16 / 2 = 8 here.
    lora_dropout=0.1, # Dropout probability applied inside the LoRA layers during training to reduce overfitting.
    r=2,              # Rank of the low-rank matrices in LoRA. A higher rank allows for more complex adaptations but increases the number of parameters to be trained.
    bias="none",       # Do not train any bias parameters as part of the LoRA adaptation.
    task_type="CAUSAL_LM", # The model is being fine-tuned for causal language modeling.
)

In [78]:
# Hyper-parameters for the fine-tuning run.
training_params = TrainingArguments(
    output_dir="./results",  # Directory where the training results and model checkpoints will be saved.
    num_train_epochs=1,  # The total number of training epochs. One epoch means the model has seen the entire dataset once.
    per_device_train_batch_size=2,  # Batch size per device during training. Adjust based on your GPU memory.
    gradient_accumulation_steps=1,  # Number of batches whose gradients are accumulated before each optimizer update (1 = update after every batch).
    optim="paged_adamw_32bit",  # The optimizer to use: the paged, 32-bit AdamW variant.
    save_steps=1000,  # Save a checkpoint every 1000 steps. NOTE(review): the recorded run ended at global_step=300, so no intermediate checkpoint is ever written.
    logging_steps=1000,  # Log training information every 1000 steps. NOTE(review): likewise never reached in a 300-step run, so no intermediate loss is logged.
    learning_rate=2e-4,  # The initial learning rate for AdamW.
    weight_decay=0.001,  # Weight decay rate to apply for regularization and prevent overfitting.
    fp16=False,  # Whether to use fp16 mixed-precision training. False disables it.
    bf16=False,  # Whether to use bfloat16 mixed-precision training. False disables it.
    max_grad_norm=0.3,  # Maximum norm of the gradients for gradient clipping. Helps prevent the exploding gradient problem.
    max_steps=-1,  # If positive, set total number of training steps to perform. Overrides `num_train_epochs`. -1 means use `num_train_epochs`.
    warmup_ratio=0.03,  # Proportion of training used for linear learning-rate warmup. NOTE(review): the plain "constant" schedule below presumably has no warmup phase, so this likely has no effect ("constant_with_warmup" would honor it) — confirm.
    group_by_length=True,  # Whether to group samples of similar lengths together. Reduces padding and improves training efficiency.
    lr_scheduler_type="constant",  # The learning rate scheduler type. "constant" means the learning rate does not change.
#     report_to="tensorboard"  # Logging the results to TensorBoard. Helps in monitoring the training process visually.
)

In [79]:
# Assemble the supervised fine-tuning trainer from the model, data, LoRA config and training arguments.
trainer = SFTTrainer(
    model=model,  # The 4-bit quantized base model loaded above.
    train_dataset=dataset,  # The training split (the Dataset built from df_train).
    peft_config=peft_params,  # The PEFT (Parameter-efficient Fine-tuning) configuration, specifically for LoRA in this context.
    dataset_text_field="Text",  # The dataset column holding the full prompt/response string.
    max_seq_length=None,  # No explicit cap given. NOTE(review): with None, trl applies its own default limit (believed to be min(tokenizer.model_max_length, 1024)) — confirm for the installed version.
    tokenizer=tokenizer,  # The tokenizer for processing text data to model-compatible inputs.
    args=training_params,  # The training arguments specifying how the model should be fine-tuned.
    packing=False,  # Do not pack several examples into one sequence; each example is tokenized on its own.
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [80]:
# Run fine-tuning. The returned TrainOutput reports global_step, training_loss and runtime metrics.
trainer.train()

{'train_runtime': 208.3539, 'train_samples_per_second': 2.875, 'train_steps_per_second': 1.44, 'train_loss': 1.3312919108072916, 'epoch': 1.0}


TrainOutput(global_step=300, training_loss=1.3312919108072916, metrics={'train_runtime': 208.3539, 'train_samples_per_second': 2.875, 'train_steps_per_second': 1.44, 'train_loss': 1.3312919108072916, 'epoch': 1.0})

In [81]:
# Save the fine-tuned model to the local directory named by 'new_model'. This will create the directory if it doesn't exist.
# NOTE(review): because training used a LoRA config, this presumably stores only the adapter weights,
# not a full copy of the base model — confirm before relying on it as a standalone checkpoint.
trainer.model.save_pretrained(new_model)

# Save the tokenizer to the same directory as the model. This ensures consistency between the model and tokenizer for future use.
trainer.tokenizer.save_pretrained(new_model)



('reecursion/Llama-2-7b-multicultural-inspiration-fewshot/tokenizer_config.json',
 'reecursion/Llama-2-7b-multicultural-inspiration-fewshot/special_tokens_map.json',
 'reecursion/Llama-2-7b-multicultural-inspiration-fewshot/tokenizer.json')

`save_pretrained` writes the model and tokenizer to a local directory. To publish them on the Hugging Face Hub instead, log in and push the model with the cells below.

In [27]:
# from huggingface_hub import notebook_login
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [82]:
# trainer.push_to_hub() #error login?

# Testing

In [82]:
# Silence everything below CRITICAL from the transformers logger during inference.
logging.set_verbosity(logging.CRITICAL)

# trainer.train() leaves the model in training mode, which keeps the LoRA dropout
# active; switch to eval mode so predictions are deterministic.
model.eval()

# The key/value cache was disabled for training; turn it back on so generation
# does not have to recompute the whole prefix for every new token.
model.config.use_cache = True

# The expected answer ("<country>, <source>, <label>") is only a handful of tokens,
# so bound the number of *generated* tokens. `max_length=1000` counted the prompt
# as well and allowed up to ~1000 tokens per example, which makes running the
# whole held-out set impractically slow.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=32,
)

In [83]:
# Split every held-out example back into the prompt that is fed to the model
# (`texts`) and the ground-truth answer it should produce (`gt`).
gt = []
texts = []
# NOTE: `data` is re-bound here; it previously held the training DatasetDict.
data = list(df_test['Text'])
for elem in data:
    # The answer is always the final "<ASSISTANT>:" section, so split once from the
    # right. A plain split() fails to unpack whenever the classified text itself
    # contains the marker.
    input_, gt_data = elem.rsplit("\n<ASSISTANT>:", 1)
    # NOTE(review): the prompt ends with a trailing space, whereas the training text has
    # the answer directly after "<ASSISTANT>: " — this may tokenize differently; confirm.
    texts.append(input_ + "\n<ASSISTANT>: ")
    gt.append(gt_data.strip())

In [None]:
# Generate a completion for every held-out prompt (one pipeline result per entry of `texts`).
predicted = pipe(texts)

In [None]:
# Persist the raw pipeline output so it can be analysed later without re-running generation.
# NOTE: only unpickle this file from a trusted location — pickle.load can execute arbitrary code.
import pickle
with open('predicted.pkl', 'wb') as f: 
    pickle.dump(predicted, f) 