# Initial setup (Main Libraries Download and Torch setting)

In [None]:
# Ask ipinfo.io to describe the machine this runtime is running on.
# NOTE(review): the response presumably includes the host's public IP - clear this cell's output before sharing.
!curl ipinfo.io

In [None]:
# Install huggingface library
!pip install torch ray==2.6.3 transformers hyperopt accelerate evaluate

In [None]:
!pip install codecarbon

In [None]:
import torch

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

if not gpu_found:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
#Google Drive Source setup

%cd /content
!mkdir gdrive
%cd gdrive
!mkdir "My Drive"
!google-drive-ocamlfuse "/content/gdrive/My Drive"

In [None]:
from google.colab import drive

# Google Drive Disk Mount
# Mounts Drive at /content/gdrive; the dataset and output paths used by later cells
# all live under "/content/gdrive/My Drive/...".

drive.mount('/content/gdrive')

In [None]:
import random
import numpy as np

# Reproducibility: feed one seed to every random number generator the notebook touches
# (Python's `random`, NumPy, PyTorch CPU and all CUDA devices), in that order.
seed_val = 42

for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    apply_seed(seed_val)

# Dataset Loading and Preprocessing

In [None]:
# read here your dataset
import pandas as pd

# Task selection lives in one place: the columns to read and the mapping from the
# label column's string values to integer class ids.
columns_to_read = ["text", "isgreen"] # Task #1
#columns_to_read = ["text", "sentiment"] # Task #2

label_column = columns_to_read[1]
label_mapping = {'Eco-related': 1, 'Not eco-related': 0} # Task #1
#label_mapping = {'Positive': 1, 'Negative': 0, 'Neutral': 2} # Task #2


def load_split(path):
  """Load one TSV split and normalise it to `sentence` / `label` columns.

  Args:
    path: location of a tab-separated file containing the columns listed in
      `columns_to_read`.

  Returns:
    A DataFrame with rows containing missing values dropped, the label column
    passed through `label_mapping`, and the columns renamed to `sentence`
    (text) and `label` (class id).
  """
  frame = pd.read_csv(path, delimiter='\t', usecols=columns_to_read).dropna()
  # `replace` leaves values that are not in the mapping untouched, exactly as before.
  frame[label_column] = frame[label_column].replace(label_mapping)
  return frame.rename(columns={'text': 'sentence', label_column: 'label'})


train = load_split("/content/gdrive/My Drive/dataset_exp/original/trainset.tsv")
# NOTE: `eval` shadows the Python builtin; the name is kept because later cells refer to it.
eval = load_split("/content/gdrive/My Drive/dataset_exp/original/testset.tsv")

In [None]:
# Only the last expression of a cell is rendered, so a bare `train.head()` followed by
# `eval.head()` silently drops the first preview. `display` shows both frames.
display(train.head(), eval.head())

In [None]:
# Pull the text and the class ids out of the training frame as arrays.
train_sentences = train["sentence"].values
train_labels = train["label"].values

In [None]:
# Pull the text and the class ids out of the evaluation frame as arrays.
eval_sentences = eval["sentence"].values
eval_labels = eval["label"].values

# Model setup and Training

In [None]:
# BERT tokenizer: To feed our text to BERT, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary
from transformers import AutoTokenizer

# Checkpoint to fine-tune; uncomment exactly one line to switch model.
huggingface_model_name = 'bert-base-cased'
#huggingface_model_name = 'roberta-base'
#huggingface_model_name = 'distilroberta-base'
#huggingface_model_name = 'climatebert/distilroberta-base-climate-f'
#huggingface_model_name = 'climatebert/distilroberta-base-climate-s'

# Load the tokenizer that belongs to the checkpoint (downloaded once, then read from the local cache).
# No casing override is passed: 'bert-base-cased' has a case-sensitive vocabulary, and forcing
# `do_lower_case=True` would lowercase the text before lookup, so the tokenizer's own
# configuration is used instead.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)

## Preparing Data for Classification with BERT: Tokenization, Padding, and Attention Mask Generation

In [None]:
max_length = 128
num_labels = 2
#num_labels = 3 # Task #2 (three sentiment classes)

# Tokenize the whole training split in a single call. For every sentence the tokenizer will:
#   (1) Tokenize the sentence.
#   (2) Add the model's special tokens (e.g. `[CLS]` / `[SEP]` for BERT).
#   (3) Map tokens to their IDs.
#   (4) Pad or truncate the sentence to `max_length`.
#   (5) Create the attention mask that separates real tokens from padding.
train_encoding = tokenizer(
    list(train_sentences),          # Sentences to encode.
    add_special_tokens=True,
    max_length=max_length,          # Pad & truncate all sentences.
    padding='max_length',
    truncation=True,
    return_tensors='pt',            # Return pytorch tensors, one row per sentence.
)
train_input_ids = train_encoding['input_ids']
train_attention_masks = train_encoding['attention_mask']

# One-hot encode the labels. Valid class ids are 0 .. num_labels - 1; the previous
# `<= num_labels` guard let a label equal to `num_labels` through, which indexes past
# the last column. Rows with an out-of-range label keep an all-zero vector.
train_lab_tensor = torch.zeros((len(train_sentences), num_labels))
for row, raw_label in enumerate(train_labels):
    label = int(raw_label)
    if 0 <= label < num_labels:
        train_lab_tensor[row, label] = 1

In [None]:
# let's encode the evaluation dataset

max_length = 128 # instead of 47, just in case there are some longer test sentences
num_labels = 2

# Tokenize the whole evaluation split in a single call. For every sentence the tokenizer will:
#   (1) Tokenize the sentence.
#   (2) Add the model's special tokens (e.g. `[CLS]` / `[SEP]` for BERT).
#   (3) Map tokens to their IDs.
#   (4) Pad or truncate the sentence to `max_length`.
#   (5) Create the attention mask that separates real tokens from padding.
eval_encoding = tokenizer(
    list(eval_sentences),           # Sentences to encode.
    add_special_tokens=True,
    max_length=max_length,          # Pad & truncate all sentences.
    padding='max_length',
    truncation=True,
    return_tensors='pt',            # Return pytorch tensors, one row per sentence.
)
eval_input_ids = eval_encoding['input_ids']
eval_attention_masks = eval_encoding['attention_mask']

# One-hot encode the labels. Valid class ids are 0 .. num_labels - 1; the previous
# `<= num_labels` guard let a label equal to `num_labels` through, which indexes past
# the last column. Rows with an out-of-range label keep an all-zero vector.
eval_lab_tensor = torch.zeros((len(eval_sentences), num_labels))
for row, raw_label in enumerate(eval_labels):
    label = int(raw_label)
    if 0 <= label < num_labels:
        eval_lab_tensor[row, label] = 1

## Custom Iterable DataLoader Definition

In [None]:
import torch
from torch.utils.data import IterableDataset
from torch.utils.data import TensorDataset, random_split


class MyDataLoader(torch.utils.data.Dataset):
  """Map-style dataset over pre-tokenized inputs.

  The class implements `__len__` and `__getitem__` (random access by index), so it
  derives from `Dataset` rather than `IterableDataset`: an iterable-style dataset
  has no index access and cannot be combined with a sampler in a `DataLoader`.

  Args:
    ids: tensor of token ids, one row per example (indexed along dim 0).
    mask: tensor of attention masks, indexed like `ids`.
    labels: 2-D tensor of label vectors, one row per example.
  """

  def __init__(self, ids, mask, labels):
    super().__init__()
    self._ids = ids
    self._mask = mask
    self._labels = labels

  def __len__(self):
    """Number of examples, i.e. the size of `ids` along dim 0."""
    return self._ids.size(dim=0)

  def __getitem__(self, idx):
    """Return example `idx` as a dict with `input_ids`, `attention_mask` and `labels`."""
    # The stored tensors are indexed directly; wrapping the slices in `torch.Tensor(...)`
    # added nothing and obscured the integer dtype of the ids.
    return {
        "input_ids": self._ids[idx],
        "attention_mask": self._mask[idx],
        "labels": self._labels[idx, :],
    }

In [None]:
# Build the train / evaluation datasets. No hold-out split is carved out here: each
# `random_split` call asks for lengths [len(dataset), 0], so the first part holds every sample.
from torch.utils.data import TensorDataset, random_split

# Wrap the encoded tensors in the custom dataset class.
train_full = MyDataLoader(train_input_ids, train_attention_masks, train_lab_tensor)
eval_full = MyDataLoader(eval_input_ids, eval_attention_masks, eval_lab_tensor)

# Keep only the first (complete) part returned by `random_split`.
trainset = random_split(train_full, [len(train_full), 0])[0]
evalset = random_split(eval_full, [len(eval_full), 0])[0]

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size for the plain PyTorch loaders below (the BERT authors recommend 16 or 32
# for fine-tuning).
# NOTE(review): in the visible cells the Trainer receives `trainset` / `evalset` directly,
# so these loaders look unused - confirm before relying on them.
batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(trainset)
train_dataloader = DataLoader(trainset, sampler=train_sampler, batch_size=batch_size)

# Evaluation order is irrelevant, so the samples are read sequentially.
eval_sampler = SequentialSampler(evalset)
validation_dataloader = DataLoader(evalset, sampler=eval_sampler, batch_size=batch_size)

## AutoModel loading and Metrics definition

In [None]:
# define the model - we will use BERTForSequenceClassification because it has the same BERT architecture but with a single classification layer on top
from transformers import AutoModelForSequenceClassification

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.

def my_model_init():
  """Build a fresh sequence-classification model for the Trainer.

  Loads `huggingface_model_name` with a classification head of `num_labels`
  outputs (the same `num_labels` used to build the one-hot label tensors) and
  moves it to `device`. The signature takes no arguments so it can be passed
  as `model_init` to the Trainer.

  Returns:
    The initialised model, placed on `device`.
  """
  model = AutoModelForSequenceClassification.from_pretrained(
      huggingface_model_name,
      num_labels=num_labels,        # Must match the width of the label tensors.
      output_attentions=False,      # Do not return attention weights.
      output_hidden_states=False,   # Do not return all hidden states.
      return_dict=True,
  )

  # The previous loop froze parameters whose name contained 'Bert', but parameter
  # names are lowercase attribute paths (e.g. 'bert.encoder...', 'roberta.encoder...'),
  # so it never matched and the whole network was fine-tuned. That behaviour is kept
  # as the default; flip the flag to train only the classification head.
  freeze_encoder = False
  if freeze_encoder:
    for param in model.base_model.parameters():
      param.requires_grad = False

  model.to(device)

  return model


In [None]:
from transformers import EvalPrediction
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score


def compute_metrics(p: EvalPrediction):
  """Compute macro precision / recall / F1 and accuracy, and dump the predictions.

  Args:
    p: evaluation output; `p.predictions` holds the logits (or a tuple whose
      first element is the logits) and `p.label_ids` the one-hot label vectors.

  Returns:
    Dict with keys 'p', 'r', 'f1' (all macro-averaged) and 'accuracy'.

  Side effects:
    Overwrites `predictions.csv` with the predicted and true class ids on
    every call.
  """
  import os  # local import: only needed to prepare the output directory

  logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  # Collapse logits and one-hot labels to class ids.
  y_pred = logits.argmax(-1)
  y_true = p.label_ids.argmax(-1)

  # NOTE(review): this writes under '.../noclimatescam/' although the run's `out_dir`
  # ends in '.../original' - confirm the folder is intended.
  predictions_path = f'/content/gdrive/My Drive/eco_project_model/{huggingface_model_name}/noclimatescam/predictions.csv'
  # `to_csv` fails when the directory is missing (model names containing '/'
  # add a nested level), so create it first.
  os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
  pd.DataFrame({'pred_label': y_pred, 'true_label': y_true}).to_csv(predictions_path, header=True)

  # Macro averaging weights every class equally; switch `average` to 'weighted'
  # to account for class imbalance.
  precision = precision_score(y_true=y_true, y_pred=y_pred, average='macro')
  recall = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
  f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
  accuracy = accuracy_score(y_true, y_pred)

  return {'p': precision,
          'r': recall,
          'f1': f1_macro,
          'accuracy': accuracy}


## Hyperparameters setting

In [None]:
# Optimisation settings. `batch_size` here replaces the value set for the plain DataLoaders.
batch_size, num_epochs = 16, 10
lr, eps = 3e-5, 2e-10
adam_beta_1, adam_beta_2 = 0.9, 0.999

# NOTE(review): this is (number of samples) x (epochs), not a count of optimizer steps;
# with batches of `batch_size` a run performs roughly len(trainset) / batch_size * num_epochs
# updates, which is fewer than this warm-up length - confirm the intended value.
warmup_steps = num_epochs * len(trainset)

# Checkpointing: how many checkpoints to keep and where they go.
num_saved_models = 1
out_dir = f'/content/gdrive/My Drive/eco_project_model/{huggingface_model_name}/original'

In [None]:
from transformers import Trainer, TrainingArguments

# Number of optimizer updates in the run, assuming one device and no gradient accumulation.
steps_per_epoch = (len(trainset) + batch_size - 1) // batch_size
total_train_steps = steps_per_epoch * num_epochs

# `warmup_steps` is derived from the number of samples, so it exceeds `total_train_steps`;
# with a linear schedule the learning rate would still be ramping up when training ends and
# would never reach `lr`. Cap the warm-up at 10% of the run (a smaller configured value wins).
effective_warmup_steps = min(warmup_steps, total_train_steps // 10)

training_args = TrainingArguments(output_dir=out_dir,
                                  overwrite_output_dir=True,
                                  do_train=True,
                                  do_eval=True,
                                  do_predict=True,
                                  # Mixed precision needs a GPU; fall back to fp32 on the CPU path
                                  # that the device-selection cell allows.
                                  fp16=torch.cuda.is_available(),
                                  evaluation_strategy='epoch',
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=lr,
                                  #adam_beta1=adam_beta_1,
                                  #adam_beta2=adam_beta_2,
                                  adam_epsilon=eps,
                                  lr_scheduler_type='linear',
                                  warmup_steps=effective_warmup_steps,
                                  num_train_epochs=num_epochs,
                                  # Save once per epoch so it lines up with the per-epoch evaluation
                                  # used to pick the best model.
                                  save_strategy='epoch',
                                  save_total_limit=num_saved_models,
                                  load_best_model_at_end=True,
                                  # 'p' is the macro precision returned by `compute_metrics`.
                                  metric_for_best_model='p',
                                  logging_strategy='epoch')

In [None]:
# Assemble the Trainer. `model_init` (rather than a ready model) lets the Trainer
# build a fresh model whenever it needs one.
trainer_kwargs = dict(
    model_init=my_model_init,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=evalset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

## Actual Training (with Emissions Tracker)

In [None]:
# Fine-tune the model with the Trainer configured above.
# The codecarbon emissions tracker is currently disabled; uncomment the five tracker
# lines below to wrap `trainer.train()` and print the measured emissions.
#from codecarbon import EmissionsTracker


#tracker = EmissionsTracker()
#tracker.start()
trainer.train()
#emissions: float = tracker.stop()
#print(emissions)


## Model Evaluation

In [None]:
# Evaluate on `evalset`; the metrics from `compute_metrics` are shown as the cell output.
trainer.evaluate()

# Ray Optimal Hyperparameter search

In [None]:
from contextlib import suppress
from ray import tune
from ray.air.config import CheckpointConfig
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import (
    download_data,
    build_compute_metrics_fn,
)
from ray.tune.schedulers import PopulationBasedTraining

In [None]:
# Search space handed to Ray Tune: fixed batch sizes, a sampled number of epochs,
# and `max_steps=-1` so the epoch count decides the length of each trial.
epoch_choices = [2, 3, 5, 10, 15, 20, 25, 30]

tune_config = dict(
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=tune.choice(epoch_choices),
    max_steps=-1,
)

In [None]:
# Hyperparameters that Population Based Training is allowed to perturb, with their ranges.
# NOTE(review): the warm-up range is expressed in sample counts (len(trainset) up to
# `warmup_steps`), not optimizer steps - confirm that is intended.
pbt_mutations = {
    "weight_decay": tune.uniform(0.0, 0.5),
    "learning_rate": tune.uniform(1e-5, 5e-5),
    "per_device_train_batch_size": [8, 16, 32, 64],
    "adam_epsilon": tune.uniform(1e-10, 1e-8),
    "warmup_steps": tune.randint(len(trainset), warmup_steps),
}

# Maximise `eval_f1`, perturbing after every training iteration.
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_f1",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations=pbt_mutations,
)

In [None]:
# Console progress table: hyperparameters to show (with short column titles) and the
# metrics reported next to them.
reported_params = {
    "weight_decay": "w_decay",
    "learning_rate": "lr",
    "per_device_train_batch_size": "train_bs/gpu",
    "num_train_epochs": "num_epochs",
}
reported_metrics = ["eval_f1", "eval_loss", "epoch", "training_iteration"]

reporter = CLIReporter(
    parameter_columns=reported_params,
    metric_columns=reported_metrics,
)

In [None]:
def _shared_hp_space(_trial):
  """Return the notebook-level `tune_config` regardless of the trial passed in."""
  return tune_config


# Keep a single checkpoint per trial, ranked by training iteration.
pbt_checkpointing = CheckpointConfig(
    num_to_keep=1,
    checkpoint_score_attribute="training_iteration",
)

# Run the Ray-backed search with the PBT scheduler; one CPU and one GPU per trial.
best_res = trainer.hyperparameter_search(
    hp_space=_shared_hp_space,
    backend="ray",
    resources_per_trial={"cpu": 1, "gpu": 1},
    scheduler=scheduler,
    checkpoint_config=pbt_checkpointing,
    stop=None,
    progress_reporter=reporter,
    local_dir="~/ray_results/",
    name="tune_transformer_pbt",
    log_to_file=True,
)

In [None]:
# Show the result object returned by the hyperparameter search.
print(best_res)