# INDUS Downstream Fine-Tuning: Climate Indicator Classification


## Overview
This notebook focuses on fine-tuning the INDUS encoder model for downstream tasks, using climate indicator classification as the worked example.

- Learn how to use Hugging Face to fine-tune the INDUS family of models, or any other encoder model.
- Understand the training parameters for encoder models.
- Use finetuned model for inference.
- Understand the unique aspects of encoder models for Downstream tasks.

## Setup
Go to the "Kernel" menu and select the "indus_eve" kernel.

In [9]:
# If you need to install the packages, uncomment and run the line below:
# %pip install pandas accelerate transformers torch scikit-learn marvin openpyxl

In [10]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

## Load the dataset, preprocess it and encode the labels


In [None]:
# Read the labelled dataset from the Excel workbook.
excel_file_path = "ej_dataset.xlsx"
df = pd.read_excel(excel_file_path, engine="openpyxl")

# Discard every row that has a missing value in any column.
df = df.dropna()

# Shuffle the rows. The result must be assigned back: `sample` returns a new
# frame and leaves `df` untouched. A fixed seed keeps the order reproducible
# across "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview a few rows instead of dumping the whole frame into the output.
df.head()

## Load the encoder model using the Hugging Face Transformers library

In [None]:

# Load the domain-specific encoder checkpoint (change `model_name` to fine-tune
# a different Hugging Face encoder).
model_name = 'nasa-impact/nasa-smd-ibm-distil-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the data rather than hard-coding it, so the
# number of output classes always matches the distinct values in 'Indicators'
# (the column that is label-encoded further down).
num_labels = df['Indicators'].nunique()
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Collator handed to the Trainer to assemble batches with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


## Encode the data and Labels into ML-ready format

In [None]:
# Create a custom dataset class
class CustomDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping that provides indexable 'input_ids' and
            'attention_mask' entries.
        labels: Indexable, sized collection of labels; its length defines the
            length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label stored at position ``idx``."""
        example = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        example['labels'] = self.labels[idx]
        return example

# Turn each indicator name into an integer class id.
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['Indicators'])

# Show which integer id was assigned to which indicator name.
for class_id, class_name in enumerate(label_encoder.classes_):
    print(class_name, '->', class_id)

# Hold out 20% of the rows for testing, stratified on the encoded labels.
data = df
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['labels'],
)

# Tokenizer settings shared by both splits.
tokenize_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
    return_attention_mask=True,
)
train_encodings = tokenizer(train_data['Description'].tolist(), **tokenize_kwargs)
test_encodings = tokenizer(test_data['Description'].tolist(), **tokenize_kwargs)

# Wrap encodings and label tensors so the Trainer can index individual examples.
train_labels = torch.tensor(train_data['labels'].tolist())
test_labels = torch.tensor(test_data['labels'].tolist())
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)


## Create training and evaluation arguments for the Huggingface Trainer

In [None]:
def compute_classification_metrics(eval_prediction):
    """Build a scikit-learn classification report (as a dict) for an evaluation pass.

    The predicted class of each example is the argmax over the last axis of
    ``eval_prediction.predictions``; it is compared against
    ``eval_prediction.label_ids``.
    """
    predicted_ids = eval_prediction.predictions.argmax(-1)
    return classification_report(eval_prediction.label_ids, predicted_ids, output_dict=True)


training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./output",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # What to run.
    do_train=True,
    do_eval=True,
    # evaluation_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=5e-5,
    optim="adamw_torch",
    # Checkpointing.
    save_steps=50,
    save_total_limit=2,
    remove_unused_columns=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_classification_metrics,
)
trainer.train()


## Evaluation on the test_dataset

In [None]:
# Run the Trainer's evaluation on the eval dataset it was constructed with.
results = trainer.evaluate()

# Print the header followed by the returned metrics, one per line.
print("Classification Report:", results, sep="\n")

## Save the model for later use

In [None]:
# Save the trained model and tokenizer so the prediction cell can reload them
model.save_pretrained('ej_classifier')  # Replace 'ej_classifier' with your desired directory
tokenizer.save_pretrained('ej_tokenizer')  # The tokenizer goes to its own 'ej_tokenizer' directory

## Predict using the saved model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
import pandas as pd

# Load the saved model and tokenizer from the directories written by the save cell.
model = AutoModelForSequenceClassification.from_pretrained('ej_classifier')
tokenizer = AutoTokenizer.from_pretrained('ej_tokenizer')
model.eval()  # make inference mode explicit (no dropout)

# Run inference in mini-batches: pushing the whole test set through the model
# in a single forward pass can exhaust memory on larger datasets.
inference_batch_size = 24
num_examples = test_encodings['input_ids'].shape[0]
logit_batches = []
with torch.no_grad():
    for start in range(0, num_examples, inference_batch_size):
        stop = start + inference_batch_size
        batch = {name: tensor[start:stop] for name, tensor in test_encodings.items()}
        logit_batches.append(model(**batch).logits)

# Extract predicted class ids (argmax over the class dimension).
predicted_labels = torch.argmax(torch.cat(logit_batches), dim=1)

# Map the integer ids back to the original indicator names.
predicted_class_labels = label_encoder.inverse_transform(predicted_labels.numpy())

# Print the text, true and predicted labels for every test example.
for description, true_label, predicted_label in zip(
    test_data['Description'], test_data['Indicators'], predicted_class_labels
):
    print(description)
    print('True:', true_label)
    print('Predicted:', predicted_label)
    print('')