# LLM example Usecase: Climate Indicator Classification

### Load data

In [1]:
%pip install pandas accelerate transformers torch scikit-learn marvin openpyxl

Collecting marvin
  Obtaining dependency information for marvin from https://files.pythonhosted.org/packages/84/4f/7a5875e601b406778f2cfea77f311a7d70a36241333642e759eb2c45bf6e/marvin-3.0.6-py3-none-any.whl.metadata
  Downloading marvin-3.0.6-py3-none-any.whl.metadata (30 kB)
Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting aiosqlite>=0.20.0 (from marvin)
  Obtaining dependency information for aiosqlite>=0.20.0 from https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl.metadata
  Using cached aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting alembic>=1.12.0 (from marvin)
  Obtaining dependency information for alembic>=1.12.0 from https://

In [None]:
import pandas as pd

excel_file_path = "/content/ej_dataset.xlsx" # you can get this file from: https://docs.google.com/spreadsheets/d/1Fk_WKMAYUz3Yl49QspZDGXrEMJq9pmAF/edit?usp=sharing&ouid=107577045329506013851&rtpof=true&sd=true
df = pd.read_excel(excel_file_path, engine="openpyxl")

df = df.dropna()
df.sample(frac=1).reset_index(drop=True)

## Load the Encoder Model using Huggingface Lib

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Load your domain-specific encoder model (replace 'model_name' with your model's name)
model_name = 'nasa-impact/nasa-smd-ibm-distil-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Encode the data and Labels into ML-ready format

In [None]:
## encode labels
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['Indicators'])
# print the label mapping
for index, item in enumerate(label_encoder.classes_):
    print(item, '->', index)

data = df
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['labels'])
train_encodings = tokenizer(list(train_data['Description']), return_tensors='pt', padding=True, truncation=True, max_length=512, return_attention_mask=True)
test_encodings = tokenizer(list(test_data['Description']), return_tensors='pt', padding=True, truncation=True, max_length=512, return_attention_mask=True)
train_dataset = CustomDataset(train_encodings, torch.tensor(train_data['labels'].tolist()))
test_dataset = CustomDataset(test_encodings, torch.tensor(test_data['labels'].tolist()))


Climate Change -> 0
Disasters -> 1
Extreme Heat -> 2
Food Availability -> 3
Health & Air Quality -> 4
Human Dimensions -> 5
Urban Flooding -> 6
Water Availability -> 7


## Create training and evaluation arguments for the Huggingface Trainer

In [None]:
training_args = TrainingArguments(
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    evaluation_strategy="epoch",
    output_dir="./output",
    num_train_epochs=10,
    save_steps=50,
    save_total_limit=2,
    remove_unused_columns=True,
    logging_dir="./logs",
    optim="adamw_torch",
    learning_rate=5e-5,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=lambda p: classification_report(p.label_ids, p.predictions.argmax(-1), output_dict=True),
)

trainer.train()

results = trainer.evaluate()

# Print classification report
print("Classification Report:")
print(results)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,0,1,2,3,4,5,6,7,Accuracy,Macro avg,Weighted avg
1,No log,1.573936,"{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.6470588235294118, 'recall': 0.8461538461538461, 'f1-score': 0.7333333333333334, 'support': 13}","{'precision': 0.47058823529411764, 'recall': 0.9411764705882353, 'f1-score': 0.627450980392157, 'support': 17}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.529412,"{'precision': 0.13970588235294118, 'recall': 0.22341628959276016, 'f1-score': 0.17009803921568628, 'support': 51}","{'precision': 0.32179930795847755, 'recall': 0.5294117647058824, 'f1-score': 0.39607843137254906, 'support': 51}"
2,No log,1.226911,"{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.4444444444444444, 'recall': 0.9230769230769231, 'f1-score': 0.6, 'support': 13}","{'precision': 0.6956521739130435, 'recall': 0.9411764705882353, 'f1-score': 0.7999999999999999, 'support': 17}","{'precision': 1.0, 'recall': 0.25, 'f1-score': 0.4, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.568627,"{'precision': 0.26751207729468596, 'recall': 0.2642816742081448, 'f1-score': 0.22499999999999998, 'support': 51}","{'precision': 0.423605190868618, 'recall': 0.5686274509803921, 'f1-score': 0.45098039215686275, 'support': 51}"
3,No log,0.951818,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}","{'precision': 0.8, 'recall': 0.6666666666666666, 'f1-score': 0.7272727272727272, 'support': 6}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 0.75, 'recall': 0.9230769230769231, 'f1-score': 0.8275862068965517, 'support': 13}","{'precision': 0.8, 'recall': 0.9411764705882353, 'f1-score': 0.8648648648648648, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.803922,"{'precision': 0.64375, 'recall': 0.6080316742081449, 'f1-score': 0.601076585990379, 'support': 51}","{'precision': 0.7519607843137255, 'recall': 0.803921568627451, 'f1-score': 0.7623630453245059, 'support': 51}"
4,No log,0.698755,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 0.8333333333333334, 'f1-score': 0.9090909090909091, 'support': 6}","{'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3}","{'precision': 0.8571428571428571, 'recall': 0.9230769230769231, 'f1-score': 0.888888888888889, 'support': 13}","{'precision': 0.8, 'recall': 0.9411764705882353, 'f1-score': 0.8648648648648648, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.862745,"{'precision': 0.8071428571428572, 'recall': 0.7121983408748114, 'f1-score': 0.731466693966694, 'support': 51}","{'precision': 0.861624649859944, 'recall': 0.8627450980392157, 'f1-score': 0.8464384170266523, 'support': 51}"
5,No log,0.571192,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3}","{'precision': 0.9230769230769231, 'recall': 0.9230769230769231, 'f1-score': 0.9230769230769231, 'support': 13}","{'precision': 0.8, 'recall': 0.9411764705882353, 'f1-score': 0.8648648648648648, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.882353,"{'precision': 0.8153846153846154, 'recall': 0.7330316742081449, 'f1-score': 0.7471038346038346, 'support': 51}","{'precision': 0.8784313725490196, 'recall': 0.8823529411764706, 'f1-score': 0.8658482011423188, 'support': 51}"
6,No log,0.420103,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13}","{'precision': 0.8947368421052632, 'recall': 1.0, 'f1-score': 0.9444444444444444, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.941176,"{'precision': 0.8368421052631579, 'recall': 0.7916666666666666, 'f1-score': 0.7916666666666667, 'support': 51}","{'precision': 0.9296181630546955, 'recall': 0.9411764705882353, 'f1-score': 0.9237472766884531, 'support': 51}"
7,No log,0.403685,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}","{'precision': 1.0, 'recall': 0.9230769230769231, 'f1-score': 0.9600000000000001, 'support': 13}","{'precision': 0.85, 'recall': 1.0, 'f1-score': 0.9189189189189189, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.921569,"{'precision': 0.83125, 'recall': 0.782051282051282, 'f1-score': 0.783475975975976, 'support': 51}","{'precision': 0.9147058823529411, 'recall': 0.9215686274509803, 'f1-score': 0.905042689748572, 'support': 51}"
8,No log,0.363228,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}","{'precision': 1.0, 'recall': 0.9230769230769231, 'f1-score': 0.9600000000000001, 'support': 13}","{'precision': 0.85, 'recall': 1.0, 'f1-score': 0.9189189189189189, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.921569,"{'precision': 0.83125, 'recall': 0.782051282051282, 'f1-score': 0.783475975975976, 'support': 51}","{'precision': 0.9147058823529411, 'recall': 0.9215686274509803, 'f1-score': 0.905042689748572, 'support': 51}"
9,No log,0.319494,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13}","{'precision': 0.8947368421052632, 'recall': 1.0, 'f1-score': 0.9444444444444444, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.941176,"{'precision': 0.8368421052631579, 'recall': 0.7916666666666666, 'f1-score': 0.7916666666666667, 'support': 51}","{'precision': 0.9296181630546955, 'recall': 0.9411764705882353, 'f1-score': 0.9237472766884531, 'support': 51}"
10,No log,0.31812,"{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}","{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}","{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13}","{'precision': 0.8947368421052632, 'recall': 1.0, 'f1-score': 0.9444444444444444, 'support': 17}","{'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}",0.941176,"{'precision': 0.8368421052631579, 'recall': 0.7916666666666666, 'f1-score': 0.7916666666666667, 'support': 51}","{'precision': 0.9296181630546955, 'recall': 0.9411764705882353, 'f1-score': 0.9237472766884531, 'support': 51}"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision

Classification Report:
{'eval_loss': 0.3181198835372925, 'eval_0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}, 'eval_1': {'precision': 1.0, 'recall': 0.3333333333333333, 'f1-score': 0.5, 'support': 3}, 'eval_2': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6}, 'eval_3': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3}, 'eval_4': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13}, 'eval_5': {'precision': 0.8947368421052632, 'recall': 1.0, 'f1-score': 0.9444444444444444, 'support': 17}, 'eval_6': {'precision': 0.8, 'recall': 1.0, 'f1-score': 0.888888888888889, 'support': 4}, 'eval_7': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}, 'eval_accuracy': 0.9411764705882353, 'eval_macro avg': {'precision': 0.8368421052631579, 'recall': 0.7916666666666666, 'f1-score': 0.7916666666666667, 'support': 51}, 'eval_weighted avg': {'precision': 0.9296181630546955, 'recall': 0.9411764705882353, 'f1-score': 0.92374727

## Evaluation on the test set

In [None]:
results

{'eval_loss': 0.3181198835372925,
 'eval_0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4},
 'eval_1': {'precision': 1.0,
  'recall': 0.3333333333333333,
  'f1-score': 0.5,
  'support': 3},
 'eval_2': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6},
 'eval_3': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3},
 'eval_4': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 13},
 'eval_5': {'precision': 0.8947368421052632,
  'recall': 1.0,
  'f1-score': 0.9444444444444444,
  'support': 17},
 'eval_6': {'precision': 0.8,
  'recall': 1.0,
  'f1-score': 0.888888888888889,
  'support': 4},
 'eval_7': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
 'eval_accuracy': 0.9411764705882353,
 'eval_macro avg': {'precision': 0.8368421052631579,
  'recall': 0.7916666666666666,
  'f1-score': 0.7916666666666667,
  'support': 51},
 'eval_weighted avg': {'precision': 0.9296181630546955,
  'recall': 0.9411764705882353,
  'f1-sc

## save model for later use

In [None]:

# Save the trained model
model.save_pretrained('ej_classifier')  # Replace 'your_model_directory' with your desired directory
tokenizer.save_pretrained('ej_tokenizer')  # Save the tokenizer as well

('ej_tokenizer/tokenizer_config.json',
 'ej_tokenizer/special_tokens_map.json',
 'ej_tokenizer/vocab.txt',
 'ej_tokenizer/added_tokens.json',
 'ej_tokenizer/tokenizer.json')

## Predict using the saved model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained('ej_classifier')  # Load from the directory where you saved it
tokenizer = AutoTokenizer.from_pretrained('ej_tokenizer')

# Get predictions
with torch.no_grad():
    outputs = model(**test_encodings)

# Extract predicted class labels
predicted_labels = torch.argmax(outputs.logits, dim=1)

predicted_class_labels = label_encoder.inverse_transform(predicted_labels.numpy())

# print the text, true and predicted labels
for i in range(len(test_data)):
    print(test_data['Description'].iloc[i])
    print('True:', test_data['Indicators'].iloc[i])
    print('Predicted:', label_encoder.classes_[predicted_labels[i]])
    print('')