Load the pre-processed dataset from `df.csv`.

In [None]:
# pandas is imported here because this is the first cell that uses `pd`;
# the only other `import pandas` in the notebook sits in a later cell, so a
# fresh "Restart & Run All" would otherwise fail with a NameError.
import pandas as pd

# Read the pre-processed dataset from the current working directory
df = pd.read_csv("df.csv")

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
def transform_conversation(text):
    """Convert a '###'-delimited Human/Assistant transcript to Llama-2 chat format.

    The text is split on '###'. The chunk before the first '###' is ignored;
    the remaining chunks are consumed as (human, assistant) pairs, with the
    'Human:' / 'Assistant:' labels and surrounding whitespace removed.

    Args:
        text: Raw conversation string, e.g.
            '### Human: hi ### Assistant: hello'.

    Returns:
        The concatenation of one '<s>[INST] ... [/INST] ... </s>' string per
        human turn. A trailing, non-empty human turn that has no assistant
        reply is emitted with an empty answer. Returns '' when the text
        contains no turns.
    """
    segments = text.split('###')
    reformatted_segments = []

    # Human turns sit at the odd indices. The range has to run up to
    # len(segments) (not len(segments) - 1), otherwise a final human turn
    # without a reply is never visited and the fallback branch is dead code.
    for i in range(1, len(segments), 2):
        human_text = segments[i].strip().replace('Human:', '').strip()

        # Check if there is a corresponding assistant segment
        if i + 1 < len(segments):
            assistant_text = segments[i+1].strip().replace('Assistant:', '').strip()
            reformatted_segments.append(f'<s>[INST] {human_text} [/INST] {assistant_text} </s>')
        elif human_text:
            # Unanswered final prompt; an empty chunk left by a trailing '###'
            # is skipped so it does not produce a blank turn.
            reformatted_segments.append(f'<s>[INST] {human_text} [/INST] </s>')

    return ''.join(reformatted_segments)


In [None]:
import pandas as pd

# Assuming df2 is your DataFrame and is already loaded
def transform_dataframe(df):
    """Wrap every entry of the 'Text' column in the '<s>[/INST] ... </s>' template.

    Each 'Text' value has its surrounding whitespace stripped before it is
    placed in the template. The result is a new single-column DataFrame
    ('transformed_text') holding one templated string per input row, in
    input order.
    """
    # Build all templated strings in one pass over the column
    wrapped = [
        f'<s>[/INST] {raw_text.strip()} </s>'
        for raw_text in df['Text']
    ]
    return pd.DataFrame({'transformed_text': wrapped})

# Apply the transformation
# NOTE(review): `df2` is not assigned in any cell visible in this notebook (the
# load cell creates `df`) — confirm which frame is meant before a fresh run.
transformed_df2 = transform_dataframe(df2)


In [None]:
# Split the transformed texts into train and test partitions
from sklearn.model_selection import train_test_split

# test_size=0.8 holds out 80% of the rows, so only 20% are used for training.
# NOTE(review): an 80% hold-out is unusually large, and `test_df` is not read by
# any later cell visible here — confirm the ratio is intended.
train_df, test_df = train_test_split(transformed_df2, test_size=0.8, random_state=42)  # 80% for testing, 20% for training

In [None]:
# The model that you want to train from the Hugging Face hub
# (used for both the tokenizer and the base model)
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
# NOTE(review): only referenced by a commented-out load_dataset call in this
# notebook; the active cells build their training set from `train_df`.
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
# NOTE(review): not referenced by any other cell visible here.
new_model = "Llama-2-7b-chat-finetune"

################################################################################
# QLoRA parameters
# (forwarded to LoraConfig as r / lora_alpha / lora_dropout)
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
# (forwarded to BitsAndBytesConfig when the base model is loaded)
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# Stored as an attribute name; it is resolved with getattr(torch, ...) at load time
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
# (most of these are forwarded to TrainingArguments in a later cell)
################################################################################

# df_short = pd.read_csv("gdrive/My Drive/limitations_dataset/df_volume_2_short_papers.csv")

# Output directory where the model predictions and checkpoints will be stored
output_dir = "gdrive/My Drive/limitations_dataset/results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): defined here but not passed to TrainingArguments in this notebook
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): defined here but not passed to TrainingArguments in this notebook
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
# NOTE(review): the manual training loops build their optimizer with lr=5e-5 instead
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# optim = AdamW(model.parameters(), lr=5e-5)

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
# NOTE(review): `max_seq_length` and `packing` are not read by any active cell;
# the commented-out SFTTrainer call uses the literals 512 and True instead.
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
# (passed as `device_map` when the base model is loaded)
device_map = {"": 0}

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for `model_name`; this is the first place it is created.
# The same three statements are repeated in the model-loading cell further down.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'  # Ensure padding is added to the right

# Function to encode the data
def encode_data(texts, max_length=512):
    """Tokenize a batch of strings into fixed-length PyTorch tensors.

    Uses the notebook-level `tokenizer`. Truncation is enabled and padding is
    set to "max_length", so every sequence is brought to `max_length` tokens.

    Args:
        texts: List of strings to tokenize.
        max_length: Target sequence length in tokens. Defaults to 512, the
            value that used to be hard-coded.

    Returns:
        Whatever `tokenizer(...)` returns when called with `return_tensors="pt"`.
    """
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

# Tokenize the training split; 'transformed_text' is the column produced by transform_dataframe
encoded_texts = encode_data(train_df['transformed_text'].tolist())

# Creating a PyTorch dataset from encoded texts
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset view over a mapping of field name -> indexable batch of values.

    Item `i` is a dict holding the `i`-th entry of every field in `encodings`.
    """

    def __init__(self, encodings):
        # Keep a reference to the encodings; nothing is copied
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of each field under the same key
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[idx]
        return item

# Wrap the tokenized training texts so a DataLoader can index them as per-example dicts
train_dataset = TextDataset(encoded_texts)


In [None]:
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
# dataset = df['Text'].tolist()
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Resolve the configured dtype name (e.g. "float16") to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings, taken from the bitsandbytes section of the config cell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# (informational only: prints a hint when the CUDA compute capability major is >= 8)
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
# with the quantization config above, placed according to `device_map`
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
# NOTE(review): this rebinds `tokenizer` with the same settings already applied
# in the tokenization cell above; the reload looks redundant.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training



In [None]:
import os
import torch
# from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
from peft import LoraConfig, get_peft_model

# Load LoRA configuration
# NOTE(review): `peft_config` is only referenced by the commented-out SFTTrainer
# call; no active cell in this notebook applies it to `model`.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
# Values come from the configuration cell. `per_device_eval_batch_size` and
# `gradient_checkpointing` are defined there but not forwarded here.
# NOTE(review): `training_arguments` is likewise only used by the commented-out
# SFTTrainer call; the manual training loops do not read it.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [None]:
from torch.utils.data import DataLoader

# Batch the TextDataset built above, reshuffling the examples on each pass.
# NOTE(review): batch_size is the literal 2 rather than `per_device_train_batch_size`.
loader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [None]:
import torch

# Prefer the CUDA device when PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# When running on CUDA, also report the name of GPU 0
if device.type == 'cuda':
    print("GPU Name:", torch.cuda.get_device_name(0))


Using device: cuda
GPU Name: NVIDIA L4


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
# Optimizer for the manual training loop in the next cell.
# NOTE(review): lr is the literal 5e-5, not the `learning_rate = 2e-4` from the
# configuration cell, and `model` here is the 4-bit base model — `peft_config`
# has not been applied to it in any visible cell. Confirm both are intended.
# NOTE(review): `AdamW` is imported from `transformers`; that class is reportedly
# deprecated in favour of `torch.optim.AdamW` — verify against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)




In [None]:
# One pass over `loader`, updating the model with the loss from each batch.

# Switch to training mode so dropout and similar layers behave as in training
model.train()
for batch in loader:
    input_ids = batch['input_ids'].to(device)  # Make sure it's a tensor
    attention_mask = batch['attention_mask'].to(device)  # Make sure it's a tensor

    # Every example is padded to a fixed length and the pad token is the EOS
    # token, so using input_ids directly as labels would train the model on
    # the padding. Positions with attention_mask == 0 are set to -100, the
    # index the loss ignores.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    # Now, ensure the model call uses tensors
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
# Ensure you've imported model, dataset, SFTTrainer, and other dependencies correctly

# Assuming 'dataset', 'model', and other necessary variables are already defined

# Initial Training Stage
# NOTE(review): `dataset` is not assigned in any cell visible in this notebook
# (it only appears in commented-out lines). The loop below unpacks each batch
# positionally, so it needs items that are 2-element sequences; TextDataset
# items are dicts and would not unpack this way. Confirm the intended dataset.
# This cell also rebinds `loader` and `optimizer` from the earlier cells.
loader = DataLoader(dataset, batch_size=4, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):  # Adjust the range for multiple epochs as needed
    for batch in loader:
        input_ids, attention_mask = batch
        # Ensure your model accepts these inputs and adjust accordingly
        # input_ids doubles as the labels, so padded positions (if any) count toward the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Loss: {loss.item()}")




Loss: 14.49356746673584
Loss: 11.248224258422852
Loss: 9.169376373291016
Loss: 9.23759937286377
Loss: 9.878296852111816
Loss: 9.570183753967285
Loss: 13.038642883300781
Loss: 12.36505126953125
Loss: 8.082674026489258
Loss: 5.099785327911377
Loss: 8.535409927368164
Loss: 5.842607021331787
Loss: 3.983592987060547
Loss: 5.319604396820068
Loss: 5.745856761932373
Loss: 5.043037414550781
Loss: 4.312027454376221
Loss: 2.270676374435425
Loss: 2.395481586456299
Loss: 2.3846752643585205
Loss: 2.865147113800049
Loss: 1.998740792274475
Loss: 2.822251319885254
Loss: 1.133853793144226
Loss: 0.9509677290916443
Loss: 1.4362621307373047
Loss: 2.3221731185913086
Loss: 1.3305747509002686
Loss: 1.248058795928955
Loss: 1.7190442085266113
Loss: 1.0149418115615845
Loss: 0.9176511764526367
Loss: 1.4457051753997803
Loss: 1.3818963766098022
Loss: 0.9718351364135742
Loss: 0.9448085427284241
Loss: 1.5937494039535522
Loss: 1.0303008556365967
Loss: 0.8055521845817566
Loss: 1.3624333143234253
Loss: 1.021134257316589

ValueError: Can only automatically infer lengths for datasets whose items are dictionaries with an 'input_ids' key.

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
# Ensure you've imported model, dataset, SFTTrainer, and other dependencies correctly

# Assuming 'dataset', 'model', and other necessary variables are already defined

# Initial Training Stage
# NOTE(review): this cell repeats the previous training cell statement for
# statement, so running both trains for a second pass. `dataset` is not
# assigned in any visible cell, and the positional unpacking of `batch` below
# requires 2-element items rather than the dicts TextDataset returns.
loader = DataLoader(dataset, batch_size=4, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):  # Adjust the range for multiple epochs as needed
    for batch in loader:
        input_ids, attention_mask = batch
        # Ensure your model accepts these inputs and adjust accordingly
        # input_ids doubles as the labels, so padded positions (if any) count toward the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Loss: {loss.item()}")

# Assume peft_config, training_arguments, and other necessary variables are defined for SFTTrainer
# Fine-Tuning Stage with SFTTrainer


# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=512,  # Example value, adjust as needed
#     #tokenizer=tokenizer,  # Uncomment if tokenizer is defined and needed
#     args=training_arguments,
#     packing=True,  # Example value, adjust based on your setup
# )

# # Train model with SFTTrainer
# trainer.train()




Loss: 14.49356746673584
Loss: 11.248224258422852
Loss: 9.169376373291016
Loss: 9.23759937286377
Loss: 9.878296852111816
Loss: 9.570183753967285
Loss: 13.038642883300781
Loss: 12.36505126953125
Loss: 8.082674026489258
Loss: 5.099785327911377
Loss: 8.535409927368164
Loss: 5.842607021331787
Loss: 3.983592987060547
Loss: 5.319604396820068
Loss: 5.745856761932373
Loss: 5.043037414550781
Loss: 4.312027454376221
Loss: 2.270676374435425
Loss: 2.395481586456299
Loss: 2.3846752643585205
Loss: 2.865147113800049
Loss: 1.998740792274475
Loss: 2.822251319885254
Loss: 1.133853793144226
Loss: 0.9509677290916443
Loss: 1.4362621307373047
Loss: 2.3221731185913086
Loss: 1.3305747509002686
Loss: 1.248058795928955
Loss: 1.7190442085266113
Loss: 1.0149418115615845
Loss: 0.9176511764526367
Loss: 1.4457051753997803
Loss: 1.3818963766098022
Loss: 0.9718351364135742
Loss: 0.9448085427284241
Loss: 1.5937494039535522
Loss: 1.0303008556365967
Loss: 0.8055521845817566
Loss: 1.3624333143234253
Loss: 1.021134257316589

ValueError: Can only automatically infer lengths for datasets whose items are dictionaries with an 'input_ids' key.

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"
# `max_length=200` counted the prompt tokens as well and cut the recorded answer
# off mid-sentence; `max_new_tokens` budgets only the generated continuation.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
# Wrap the prompt in the Llama-2 chat instruction markers
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is a large language model? [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody. These models are typically trained on vast amounts of text data, such as books, articles, and websites, and are designed to learn the patterns and structures of language.

Large language models are often used in natural language processing (NLP) tasks such as language translation, text summarization, and language generation. They are also used in chatbots, virtual assistants, and other applications where language understanding and generation is required.

Some of the key features of large language models include:

1. Deep learning architecture: Large language models are typically built using deep learning architectures such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks,


In [None]:
# Silence everything below CRITICAL from the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "Can you generate 30–35 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.?"
# `max_length=200` capped prompt + answer at 200 tokens, which is why the recorded
# output stops inside the third topic. `max_new_tokens` limits only the generated
# text; raise or lower the budget to trade completeness against run time.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2048)
# Wrap the prompt in the Llama-2 chat instruction markers
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Can you generate 30–35 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.? [/INST]  Sure! Here are 30-35 potential topics, along with a brief title and summary for each one:

1. "The Impact of Social Media on Mental Health" - Summary: Examine the correlation between social media use and mental health issues, including depression, anxiety, and loneliness. Discuss the potential causes and consequences of this relationship.
2. "The Ethics of Artificial Intelligence" - Summary: Explore the ethical considerations surrounding the development and use of artificial intelligence, including issues of privacy, bias, and accountability.
3. "The Future of Work in the Age of Automation" - Summary: Discuss the potential impact of autom


In [None]:
# Silence everything below CRITICAL from the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# NOTE(review): each call sends only this prompt; the earlier exchange is not
# included, so the model has no record of what it "generated" before.
prompt = "No you generated 3 topics. Can you generate 30–35 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.?"
# `max_length=200` capped prompt + answer at 200 tokens, which truncated the
# recorded output after three topics. `max_new_tokens` limits only the generated
# text; raise or lower the budget to trade completeness against run time.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2048)
# Wrap the prompt in the Llama-2 chat instruction markers
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] No you generated 3 topics. Can you generate 30–35 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.? [/INST]  Of course! Here are 30-35 potential topics for a research paper on the topic "Limitations" with a 150-180 word summary for each:

1. Limitations of Artificial Intelligence: Exploring the boundaries of AI's capabilities and the challenges of developing more advanced AI systems.
2. The Limits of Human Knowledge: Investigating the cognitive biases and limitations of human understanding, and the implications for scientific inquiry.
3. The Limits of Economic Growth: Analyzing the environmental, social, and economic costs of unchecked economic growth, and the need for sustainable development.


In [None]:
# Silence everything below CRITICAL from the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "Can you generate 10-12 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.?"
# `max_length=200` capped prompt + answer at 200 tokens, which is why the recorded
# output stops inside the second topic. `max_new_tokens` limits only the generated
# text; raise or lower the budget to trade completeness against run time.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2048)
# Wrap the prompt in the Llama-2 chat instruction markers
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Can you generate 10-12 topics? Generate each topic title and summary within 150-180 words, putting more emphasis on limitations.? [/INST]  Sure! Here are 10-12 potential topics for a research paper on the limitations of AI, along with a brief title and summary for each:

1. "Limits of Deep Learning: Exploring the Shortcomings of Neural Networks"
Summary: Despite their impressive performance in various tasks, deep learning models have limitations that hinder their effectiveness. This paper investigates the shortcomings of neural networks, including their inability to generalize to unseen data, the need for large amounts of labeled training data, and the potential for overfitting.
2. "The Ethical Implications of AI Bias: Understanding the Limits of AI in Decision-Making
