In [None]:
!pip install -q accelerate peft bitsandbytes transformers trl GPUtil langchain_community pypdf

In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
from langchain_community.document_loaders import PyPDFLoader
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from peft import PeftModel

In [3]:
from google.colab import userdata
from huggingface_hub import login
login(token=userdata.get('secretKey'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##  Get the  Llama-2-7b with Quantiztion

In [4]:
base_model_id = "meta-llama/Llama-2-7b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,

    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id ,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [5]:

def print_model_size(model, label="Model"):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{label} → Total Parameters: {total_params:,}, Trainable Parameters: {trainable_params:,}, Trainable %: {100 * trainable_params / total_params:.2f}%")


print_model_size(model, label="Original Model")


Original Model → Total Parameters: 3,500,412,928, Trainable Parameters: 262,410,240, Trainable %: 7.50%


In [6]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

print_model_size(model, label="LoRA Model")


LoRA Model → Total Parameters: 3,520,401,408, Trainable Parameters: 19,988,480, Trainable %: 0.57%


## Data Collection and Tokenization

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
train_dataset = load_dataset("text", data_files={"train":
                                                 ["/content/drive/MyDrive/wildFire/hawaii_wf_4.txt", "/content/drive/MyDrive/wildFire/hawaii_wf_2.txt"]}, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# file_paths = [f"/content/drive/MyDrive/wildFire/hawaii_wf_{i}.txt" for i in range(1, 11)]
# train_dataset = load_dataset("text", data_files={"train": file_paths}, split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [10]:
tokenized_train_dataset = []
for phrase in train_dataset:
  tokenized_train_dataset.append(tokenizer(phrase["text"]))

In [11]:
tokenized_train_dataset[1]

{'input_ids': [1, 750, 4586, 25447, 297, 278, 23474, 304, 10169, 278, 3974, 29892, 5662, 3864, 896, 7450, 278, 11176, 14703, 27709, 23511, 29889, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# **Training**

In [12]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir="./finetunedModel",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        learning_rate=1e-4,
        max_steps=100,
        bf16=False,
        optim="paged_adamw_8bit",
        logging_dir="./log",
        save_strategy="epoch",
        save_steps=50,
        logging_steps=10

),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache=False
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmanojgnanapalam[0m ([33mmanojgnanapalam-national-forensic-sciences-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss
10,3.4952
20,3.1314
30,3.0211
40,2.8012
50,2.6604
60,2.5077
70,1.9824
80,2.0138
90,2.023
100,2.0123


  return fn(*args, **kwargs)


TrainOutput(global_step=100, training_loss=2.5648461532592775, metrics={'train_runtime': 450.4701, 'train_samples_per_second': 0.888, 'train_steps_per_second': 0.222, 'total_flos': 430087343702016.0, 'train_loss': 2.5648461532592775, 'epoch': 1.6446280991735538})

In [13]:

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id ,
    quantization_config=bnb_config,
  )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
modelFinetuned = PeftModel.from_pretrained(base_model, "/content/finetunedModel/checkpoint-100")

#compare the fine-tuned LLaMA 2 model vs. the base LLaMA 2 model on wildfire-related questions

In [20]:
user_question = "how Radio Communications used during wild fire"

eval_prompt = f"Question: {user_question} Just answer this question accurately and concisely.\n"

promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to(device)

modelFinetuned.eval()

with torch.no_grad():
  print(tokenizer.decode(modelFinetuned.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

Question: how Radio Communications used during wild fire Just answer this question accurately and concisely.
 Lahaina Police Department, Maui Police Department, and Hawaii National Guard were the primary agencies that utilized the RCC for the duration of the wildfire. The RCC served as the centralized communication hub for these agencies, providing a platform for efficient and effective communication.

Throughout the wildfire, the RCC received numerous calls for service, with officers and dispatchers coordinating with each other to manage the influx of calls. The RCC also provided critical interagency communication, with officers and dispatchers working together to ensure that all agencies were aware of the situation and were able to coordinate their efforts effectively.

The RCC also served as a hub for the Maui Police Department's Command Post, where officers were able to monitor the situation and coordinate their efforts with other agencies. This allowed officers to effectively mana

In [25]:
user_question = "When did wildfires start?"

eval_prompt = f"Question: {user_question} Just answer this question accurately and concisely.\n"

promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to(device)

modelFinetuned.eval()

with torch.no_grad():
  print(tokenizer.decode(modelFinetuned.generate(**promptTokenized, max_new_tokens=500)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

Question: When did wildfires start? Just answer this question accurately and concisely.
 Lahaina Fire (August 8, 2023)
Location: Lahaina, Maui
Date: August 8, 2023
Time: 2:55 p.m. HST
Total area burned: 3,275 acres
Fire cause: Unknown
Fire summary: The Lahaina fire was reported on August 8, 2023, at 2:55 p.m. and quickly spread through the Lahaina area. The fire burned in various locations, including the Lahaina Cannery Mall, Lahaina Intermediate School, and Lahaina High School. The fire was reported to have started near the intersection of Honoapiilani Highway and Keawe Street. The fire was fully contained by 11:59 p.m. on August 9, 2023.
The fire resulted in one fatality, and a total of 18 individuals were injured. The fire destroyed 168 structures, including homes, businesses, and community facilities. The fire also caused significant damage to infrastructure, including power lines, poles, and communication lines. The fire resulted in a total of 1,283 acres burned. The cause of the 

In [21]:
base_model.eval()

with torch.no_grad():
  print(tokenizer.decode(base_model.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

Question: how Radio Communications used during wild fire Just answer this question accurately and concisely.
 Lahaina Fire and Police Radio Communications.
Police Radio Communications.
The following is a summary of the radio communications from the Lahaina Fire Department and Lahaina Police Department during the wildfire on August 8, 2023, as well as the aftermath:
Lahaina Fire Radio Communications:
08:35 hours - Lahaina Fire Department (LFD) received a report of a wildfire in the area of Keawe Street and Front Street.
08:40 hours - LFD dispatched a supervisor to the scene to assess the situation.
08:45 hours - LFD dispatched units to the reported fire location.
09:00 hours - LFD began evacuating residents in the area.
09:15 hours - LFD established a command post at the Lahaina Police Department (LPD) and began coordinating evacuations with LPD.
09:30 hours - LFD dispatched additional units to assist with evacuations and fire suppression efforts.
09:45 hours - LFD reported that the fir

In [None]:
base_model.eval()

with torch.no_grad():
  print(tokenizer.decode(base_model.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

Question: When did Hawaii wildfires start? Just answer this question accurately and concisely.
 Lahaina fire: 12:40 p.m. on August 8, 2023.
Kula fire: 1:58 p.m. on August 8, 2023.
Waiehu fire: 4:20 p.m. on August 8, 2023.
Lahaina fire: 6:20 p.m. on August 8, 2023.
Kula fire: 7:20 p.m. on August 8, 2023.
Waiehu fire: 8:40 p.m. on August 8, 2023.
Lahaina fire: 9:45 p.m. on August 8, 2023.
Kula fire: 10:45 p.m. on August 8, 2023.
Waiehu fire: 11:45 p.m. on August 8, 2023.
Lahaina fire: 12:45 a.m. on August 9, 2023.
Kula fire: 1:45 a.m. on August 9, 2023.
Waiehu fire: 2:45 a.m. on August 9, 2023.
Lahaina fire: 3:45 a.m. on August 9, 2023.
Kula fire: 4:45 a.m. on August 9, 2023.
Waiehu fire: 5:45 a.m. on August 9, 2023.
Lahaina fire: 6:45 a.m. on August 9, 2023.
Kula fire: 7:45 a.m. on August 9, 2023.
Waiehu fire: 8:45 a.m. on August 9, 2023.
Lahaina fire: 9:45 a.m. on August 9, 2023.
Kula fire: 10:45 a.m. on August 9, 2023.
Waiehu fire: 11:45 a.m. on August 9, 2023.
Lahaina fire: 12:45 p.m