In [20]:
import pandas as pd
from datasets import Dataset

# Load your CSV
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv("fire_classification_data_1000.csv")

# Check label distribution before splitting (optional)
print("Original label distribution:")
print(df["label"].value_counts())

# Split into train (80%), val (10%), test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

# Save the splits
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)


Original label distribution:
label
Dangerous fire     400
Controlled fire    400
No fire            200
Name: count, dtype: int64


In [2]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [22]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load CSVs
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")

# Convert label names to integers
label_map = {"Controlled fire": 0, "Dangerous fire": 1, "No fire": 2}
train_df["label"] = train_df["label"].map(label_map)
val_df["label"] = val_df["label"].map(label_map)
test_df["label"] = test_df["label"].map(label_map)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer(batch["context"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Define accuracy metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [33]:
# Ensure the label mappings are correct
model.config.id2label = {
    0: "Controlled fire",
    1: "Dangerous fire",
    2: "No fire"
}
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

# Retrain the model to ensure everything is correct
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate(test_dataset)
print("Test accuracy:", results["eval_accuracy"])

# Save the fine-tuned model and tokenizer
import shutil
save_path = "fire-risk-classifier"

# Clean up the directory if it exists to avoid file locking issues
if os.path.exists(save_path):
    shutil.rmtree(save_path)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model and tokenizer saved successfully.")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0009,0.000583,1.0
2,0.0005,0.000372,1.0
3,0.0004,0.000306,1.0
4,0.0004,0.000288,1.0


Test accuracy: 1.0
Model and tokenizer saved successfully.
Model and tokenizer saved successfully.


In [34]:
# Test the saved model with the pipeline
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="fire-risk-classifier",
    tokenizer="fire-risk-classifier",
    return_all_scores=True
)

# Run a quick test
examples = [
    "A candle is lit in the dark",
    "Flames spreading on a wooden floor",
    "A quiet park with children playing",
]

for text in examples:
    out = classifier(text)
    print(f"\nInput: {text}")
    for label_info in out[0]:
        print(f"  {label_info['label']}: {label_info['score']:.2f}")

Device set to use cuda:0



Input: A candle is lit in the dark
  Controlled fire: 1.00
  Dangerous fire: 0.00
  No fire: 0.00

Input: Flames spreading on a wooden floor
  Controlled fire: 0.00
  Dangerous fire: 1.00
  No fire: 0.00

Input: A quiet park with children playing
  Controlled fire: 0.00
  Dangerous fire: 0.00
  No fire: 0.99




In [35]:
import traceback

try:
    # Add a test to verify the model and tokenizer
    print("Testing model and tokenizer...")
    test_input = "Fire spreading in a forest"
    test_output = classifier(test_input)
    print("Test output:", test_output)

except Exception as e:
    print("An error occurred:")
    traceback.print_exc()

# Check environment details
import transformers
print("Transformers version:", transformers.__version__)
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

Testing model and tokenizer...
Test output: [[{'label': 'Controlled fire', 'score': 0.0011441211681813002}, {'label': 'Dangerous fire', 'score': 0.9978399276733398}, {'label': 'No fire', 'score': 0.001015945803374052}]]
Transformers version: 4.51.3
Torch version: 2.5.1
CUDA available: True
