# Imports

In [None]:
# %pip install -U transformers datasets accelerate evaluate

In [None]:
# Pull in the shared course helpers (presumably the source of evaluate_model and
# log_experiment_results used further down -- confirm against the helper file).
# A missing module is re-raised with setup instructions, chained to the original error.
try:
    from mai_nlp_helper_functions import *
except ImportError as import_err:
    raise ImportError(
        "You don't have the mai_nlp_helper_functions.py file in the same "
        "directory as your note book. Either add it, or copy paste the "
        "contents in this cell"
    ) from import_err


In [None]:

import pandas as pd
import datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, ClassLabel, Value
from transformers import DataCollatorWithPadding


## Get data

In [None]:
# Load the ESG reports CSV and derive the target column "labels":
# it copies "subject", with missing values replaced by the catch-all "Other".
df = pd.read_csv("DATA/esg_reports.csv").assign(
    labels=lambda frame: frame["subject"].fillna("Other")
)
df

## Transformers model 1

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint;
# the classifier in a later cell is loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [None]:
# Show the tokenizer's repr as the cell output.
tokenizer

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` key. The callers in this notebook
            pass a list of strings, either directly or through
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``truncation=True`` and ``padding=True``.
    """
    texts = examples["text"]
    return tokenizer(texts, padding=True, truncation=True)

# Sanity check on one sample row (position 203): print the raw text, the full
# encoding, and then every token ID next to the text it decodes to.
sentence = df["text"].iloc[203]
encoded = preprocess_function({"text": [sentence]})
print(sentence)
print(encoded)
for token_id in encoded["input_ids"][0]:
    print(token_id, tokenizer.decode([token_id]))

In [None]:
# Print each token ID of the encoded sample alongside its decoded text.
for token_id in encoded["input_ids"][0]:
    print(token_id, tokenizer.decode([token_id]))

In [None]:
# Target classes. The position in this list is what later maps a predicted
# index back to a class name.
class_names = ["Environmental", "Social", "Other"]

# Column schema passed to Dataset.from_pandas for both splits.
# "__index_level_0__" is presumably the column holding the preserved pandas
# index -- confirm against the datasets documentation.
esg_classes = Features(
    {
        "__index_level_0__": Value("string"),
        "text": Value("string"),
        "labels": ClassLabel(names=class_names),
    }
)


In [None]:
# Padding collator built from the tokenizer; it is handed to the Trainer
# as `data_collator` in a later cell.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Split with a fixed seed, stratifying on the label column.
df_train, df_test = train_test_split(df, random_state=22141, stratify=df["labels"])

# `esg_classes` declares a "labels" column, so BOTH frames must carry it to
# match the schema. Selecting only "text" for the test split leaves that
# column missing from the frame handed to Dataset.from_pandas.
train = Dataset.from_pandas(df_train[["text", "labels"]], features=esg_classes)
test = Dataset.from_pandas(df_test[["text", "labels"]], features=esg_classes)
train

In [None]:
# Run the same batched tokenization over both splits (train first, then test).
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train, test)
)
tokenized_train

In [None]:

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a sequence-classification head.
# The number of output labels is derived from `class_names` rather than a
# hard-coded 3, so the head cannot drift out of sync with the label list.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(class_names)
)

In [None]:
# Fine-tuning hyperparameters: output directory, learning rate, per-device
# batch sizes for training and evaluation, number of epochs and weight decay.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

# Wire the model, arguments, tokenized training split, tokenizer and padding
# collator together. Only a training set is supplied here: no eval_dataset
# and no metric function are passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start the fine-tuning run (20 epochs as configured above).
trainer.train()

This takes a long time. I trained this in the cloud instead; the results are in the lecture PowerPoint.

In [None]:
# If you are running for real, save your work!
# model.save_pretrained("mymodel")
# model = AutoModelForSequenceClassification.from_pretrained("mymodel")

## Evaluate - alternative

In [None]:

# Name under which this run's scores are logged.
experiment_name = "Transformers dummy"

# Predict on the tokenized test split, then translate the arg-max index along
# the last axis of each prediction row into its class name.
predictions = trainer.predict(tokenized_test)
prediction_labels = [
    class_names[class_idx]
    for class_idx in predictions.predictions.argmax(axis=-1)
]

# Score against the true labels of the test frame and log the "macro avg" entry.
stats = evaluate_model(df_test["labels"], prediction_labels, class_names)
log_experiment_results(experiment_name, stats["macro avg"])

In [None]:
# Run inference on the tokenized test split and rebind `predictions`.
# NOTE(review): this repeats the predict call made in the previous cell.
predictions = trainer.predict(tokenized_test)