**Step 1: Install Necessary Libraries**

In [1]:
pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.18-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0

In [None]:
!pip install bitsandbytes
!pip install -U bitsandbytes

In [2]:
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from rouge import Rouge
import random


**Step 2: Load and Preprocess the Dataset for Fine-Tuning**

In [3]:
df=pd.read_csv("/kaggle/input/final-hg-dataset/FINAL_HG_DATASET.csv")

In [4]:
df

Unnamed: 0,index,question,generated_answer
0,!0001,3. What are the advantages of using a Ni foam ...,The advantages of using a Ni foam (NF) substra...
1,!0002,4. How does the hierarchical structure and com...,The hierarchical structure and composition of ...
2,!0003,4. What are the current challenges and limitat...,Current challenges and limitations of defect e...
3,!0004,5. What are the promising future research dire...,Promising future research directions in the fi...
4,!0005,4. How do the properties and performance of th...,The properties and performance of CTF films ca...
...,...,...,...
5711,!5712,What is the rate of hydrogen evolution (in µmo...,Hydrogen evolution rate for MoS?-based nanomat...
5712,!5713,How much hydrogen (in µmol) is produced per gr...,Hydrogen produced by Pt-Co alloy: Pt-Co alloys...
5713,!5714,What is the hydrogen production rate (in mL/mi...,Hydrogen production rate for MoS?/TiO? hybrid ...
5714,!5715,How much hydrogen (in µmol) is produced per gr...,Hydrogen produced by CoFe?O?-based photocataly...


In [5]:
import re

def clean_column(text):
    if isinstance(text, str):
        text = re.sub(r'\*\*|\*|!|\?', '', text).strip()
        text = re.sub(r'^\d+\.\s', '', text)
        text = re.sub(r'\s+', ' ', text)

    return text
df['question'] = df['question'].apply(clean_column)
df['generated_answer'] = df['generated_answer'].apply(clean_column)

In [6]:
output_path = "/kaggle/working/cleaned_dataset.csv"  
df.to_csv(output_path, index=False)

print(f"Cleaned dataset saved successfully at: {output_path}")


Cleaned dataset saved successfully at: /kaggle/working/cleaned_dataset.csv


In [7]:
df_dataset = Dataset.from_pandas(df[['question', 'generated_answer']])
from datasets import Dataset


def preprocess_function(example):
    return {
        "input_text": example["question"],  # Model input
        "target_text": example["generated_answer"]  # Expected output
    }


df_dataset = df_dataset.map(preprocess_function, remove_columns=["question", "generated_answer"])



Map:   0%|          | 0/5716 [00:00<?, ? examples/s]

In [8]:
print(df_dataset.to_pandas().head()) 


                                          input_text  \
0  What are the advantages of using a Ni foam (NF...   
1  How does the hierarchical structure and compos...   
2  What are the current challenges and limitation...   
3  What are the promising future research directi...   
4  How do the properties and performance of the C...   

                                         target_text  
0  The advantages of using a Ni foam (NF) substra...  
1  The hierarchical structure and composition of ...  
2  Current challenges and limitations of defect e...  
3  Promising future research directions in the fi...  
4  The properties and performance of CTF films ca...  


In [9]:
# Preprocess the DataFrame to separate the input (question) and target (generated_answer)
# df["qna"] = "Question:\n " + df["question"] + " \n\nAnswer:\n " + df["generated_answer"]


**Step 3: Load Pretrained Model and Tokenizer**


In [10]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [11]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.3.5: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor


**Step 4: Preprocess the Dataset for Fine-Tuning**

In [14]:
chat_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [15]:
EOS_TOKEN = tokenizer.eos_token  
def formatting_prompts_func(examples):
    instruction = ""
    inputs       = examples["question"]
    outputs      = examples["generated_answer"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = chat_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass



In [16]:
dataset_path = "/kaggle/working/cleaned_dataset.csv"
dataset = load_dataset("csv", data_files=dataset_path, split="train")

print("Dataset loaded successfully!")
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully!
Dataset({
    features: ['index', 'question', 'generated_answer'],
    num_rows: 5716
})


In [17]:
dataset = dataset.map(formatting_prompts_func, batched=True)

import pprint
pprint.pprint(dataset[250])
pprint.pprint(dataset[260])
pprint.pprint(dataset[270])


Map:   0%|          | 0/5716 [00:00<?, ? examples/s]

{'generated_answer': 'Current Limitations and Future Research Directions in '
                     'Electro-Oxidation of Biomass-DerivativesDespite the '
                     'promising potential of electro-oxidation for biomass '
                     'valorization, several limitations and research '
                     'directions exist:Limitations: Low selectivity: '
                     'Electro-oxidation reactions can produce a complex '
                     'mixture of products, making it challenging to '
                     'selectively obtain the desired target chemicals. '
                     'Competing reactions: Unwanted side reactions, such as '
                     'combustion or hydrogen evolution, can occur during '
                     'electro-oxidation, reducing the efficiency of the '
                     'process. Catalyst stability: Electro-oxidation catalysts '
                     'can be susceptible to deactivation or degradation over '
                     't

**Step 5: Configure the Training Parameters**

In [18]:
pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [None]:
import wandb
wandb.login(key="YOUR_TOKEN_HERE") 




True

In [23]:
wandb.init(
    project="Llama3_FT_RAG",
    name="exp_1",
    config={
        "batch_size": 4,
        "learning_rate": 2e-4
    }
)


In [27]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 256, 
    dataset_num_proc = 1,  
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size = 2,  
        gradient_accumulation_steps = 4,  
        warmup_steps = 6,  
        max_steps = 600,
        learning_rate = 1e-4,  
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 30,
        optim = "adamw_torch",  
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "Llama3_FT",
    ),
)


Tokenizing to ["text"]:   0%|          | 0/5716 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/5716 [00:00<?, ? examples/s]

In [28]:
wandb.watch(model, log="all")

In [29]:
# Proceed with your training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,202 | Num Epochs = 2 | Total steps = 600
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/4,582,543,360 (0.92% trained)


Step,Training Loss
30,1.0117
60,0.9932
90,0.9532
120,0.9343
150,0.9015
180,0.8954
210,0.8928
240,0.8937
270,0.8783
300,0.8693


In [30]:
wandb.init(
    project="Llama3_FT_RAG",
    name="exp_1",
    config={
        "batch_size": 4,
        "learning_rate": 2e-4
    }
)


In [31]:
wandb.log({"final_loss": trainer_stats.training_loss})
wandb.finish()

0,1
final_loss,▁
train/epoch,▁▁▁▂▂▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▁▂▂▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇████
train/grad_norm,▁▁▂▂▄█▅▅▃▄▄▇▄▄▄▇▆▆▅▇▆▆▇▆
train/learning_rate,▇▅▃▁██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
train/loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
final_loss,0.84914
total_flos,1.1112080469078835e+17
train/epoch,1.84243
train/global_step,600.0
train/grad_norm,0.83582
train/learning_rate,0.0
train/loss,0.7463
train_loss,0.84914
train_runtime,9084.1847
train_samples_per_second,1.057


**Step 6: Saving, loading finetuned models**

In [35]:
model.save_pretrained("Llama3_FT")
tokenizer.save_pretrained("Llama3_FT")

('Llama3_FT/tokenizer_config.json',
 'Llama3_FT/special_tokens_map.json',
 'Llama3_FT/tokenizer.json')

In [32]:
input_text = chat_prompt.format("", "What are the advantages of using a Ni foam (NF) substrate for the Ni/Mo-TEC@NF complex?", "")
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=350, use_cache=True)
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
answer = generated_text.split("### Response:")[-1].strip()  
print("Generated Answer:\n", answer)


Generated Answer:
 Using a Ni foam (NF) substrate for the Ni/Mo-TEC@NF complex offers several advantages: High electrical conductivity: NF is a highly conductive substrate, which facilitates efficient charge transfer and minimizes ohmic losses during electrochemical reactions. Large surface area: The porous structure of NF provides a large surface area for the deposition of Ni/Mo-TEC, maximizing the number of active sites available for electrocatalysis. Mechanical stability: NF is a durable and robust substrate that can withstand the harsh conditions of electrochemical reactions, ensuring the long-term stability of the Ni/Mo-TEC@NF complex. Cost-effectiveness: NF is a relatively inexpensive material, making it a cost-effective choice for large-scale applications. Versatility: NF can be easily integrated into various electrochemical devices, such as flow cells, electrolyzers, and batteries, enabling its use in a wide range of electrocatalytic applications.Overall, the combination of hig

**Storing it in Hugging Face**

In [50]:
pip uninstall huggingface_hub -y


Found existing installation: huggingface-hub 0.27.0
Uninstalling huggingface-hub-0.27.0:
  Successfully uninstalled huggingface-hub-0.27.0
Note: you may need to restart the kernel to use updated packages.


In [51]:
pip install huggingface_hub


Collecting huggingface_hub
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.0/468.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: huggingface_hub
Successfully installed huggingface_hub-0.29.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
from huggingface_hub import login

# Log in with your token
login("YOUR_HF_TOKEN")


In [53]:
trainer.push_to_hub("Llama3_FT"),

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1740309641.326f0119f022.31.0:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

(CommitInfo(commit_url='https://huggingface.co/1MK26/llama3_FT/commit/171fee7109e6401a96e9103253ed4d09c69e1ded', commit_message='Llama3_FT', commit_description='', oid='171fee7109e6401a96e9103253ed4d09c69e1ded', pr_url=None, repo_url=RepoUrl('https://huggingface.co/1MK26/llama3_FT', endpoint='https://huggingface.co', repo_type='model', repo_id='1MK26/llama3_FT'), pr_revision=None, pr_num=None),)

In [54]:
tokenizer.push_to_hub("Llama3_FT"),

README.md:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

(CommitInfo(commit_url='https://huggingface.co/1MK26/llama3_FT/commit/60e7e808456fa27373d776f18752953dc389a4bd', commit_message='Upload tokenizer', commit_description='', oid='60e7e808456fa27373d776f18752953dc389a4bd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/1MK26/llama3_FT', endpoint='https://huggingface.co', repo_type='model', repo_id='1MK26/llama3_FT'), pr_revision=None, pr_num=None),)

**Step 7: Perform Inference with the Fine-Tuned Model to Evaluate output**


In [3]:
from transformers import AutoTokenizer, LlamaForCausalLM
model_name = "1MK26/llama3_FT"  

model = LlamaForCausalLM.from_pretrained(model_name).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)  

question = "How can the performance of LSC-based photocatalytic devices be further improved through optimization of the LSC material, photocatalyst, and device architecture?"

inputs = tokenizer(
    [f"Question: {question}\nAnswer:\n"],  # Input in simpler format
    return_tensors="pt"
).to("cuda")


outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True)
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for output in decoded_outputs:
    print(output)


adapter_config.json:   0%|          | 0.00/798 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Question: How can the performance of LSC-based photocatalytic devices be further improved through optimization of the LSC material, photocatalyst, and device architecture?
Answer:
The performance of LSC-based photocatalytic devices can be further improved through optimization of the LSC material, photocatalyst, and device architecture. For instance, improving the antireflective properties of the LSC material can enhance light absorption, while optimizing the thickness and morphology of the photocatalyst layer can enhance light harvesting and charge separation. Additionally, integrating the LSC material and photocatalyst into a tandem or multi-junction device can significantly improve overall performance. By optimizing these factors, LSC-based photocatalytic devices can achieve higher efficiency and scalability for practical applications in hydrogen production and other related fields.


In [4]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
def generate_answer(question):
    # Prepare inputs using a cleaner format
    inputs = tokenizer([f"Question: {question}\nAnswer:\n"], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start_index = decoded_output.lower().find("answer:")  
    if answer_start_index != -1:
        decoded_output = decoded_output[answer_start_index + len("Answer:"):].strip()
    if decoded_output.lower().startswith(f"question: {question.lower()}"):
        decoded_output = decoded_output[len(f"question: {question.lower()}"):]

    return decoded_output.strip()


In [7]:
file_path = "/kaggle/input/hg-finetuning/HG_Dataset_finetuning.csv"  
df = pd.read_csv(file_path)

In [14]:
generated_answers = []
true_answers = []

sampled_indices = random.sample(range(len(df)), 10)

# Evaluation Loop
for idx in sampled_indices:  
    data = df.iloc[idx]
    question = data["question"]
    summary = data["Summary"]       
    generated_answer = data["generated_answer"]
    modified_answer = " ".join(generated_answer.split()[:int(len(generated_answer.split()) * 0.5)]) 
    generated_answers.append(modified_answer)  
    true_answers.append(str(summary)) 

In [15]:


rouge = Rouge()
scores = rouge.get_scores(generated_answers, true_answers, avg=True)
scores_df = pd.DataFrame(scores).T
print(scores_df)

                r         p         f
rouge-1  0.774229  0.454997  0.531794
rouge-2  0.694190  0.363827  0.425477
rouge-l  0.766775  0.451177  0.526743
