# Imports

In [None]:
!pip install datasets accelerate -U

In [None]:
import datetime
import string

import pandas as pd
import spacy
from matplotlib import pyplot as plt
from sklearn.metrics import (ConfusionMatrixDisplay, classification_report,
                             confusion_matrix)
from tabulate import tabulate
from tqdm import tqdm

# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

# NOTE: this is a plain str, so `x in punctuations` is a substring test.
punctuations = string.punctuation

# Load the medium English pipeline, downloading it first if it is missing.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spaCy reports a model package that is not installed as an OSError;
    # anything else (e.g. KeyboardInterrupt) should propagate untouched.
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

stop_words = spacy.lang.en.stop_words.STOP_WORDS


def spacy_tokenizer(sentence):
    """
    Tokenises a sentence using spaCy.

    The text is lower-cased, out-of-vocabulary tokens are dropped, the rest are
    lemmatised, and stop words and punctuation are removed.

    Parameters:
    - sentence: str or mapping, either the sentence itself or a record (dict,
      pandas row, ...) whose "text" entry holds the sentence
    Returns:
    - tokens: list of str, the cleaned lemmas
    """
    # Accept the plain string the docstring promises as well as the
    # {"text": ...} records this function originally required.
    text = sentence if isinstance(sentence, str) else sentence["text"]

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(text.lower())

    # Remove OOV words, then lemmatise. "-PRON-" is the placeholder lemma that
    # older spaCy releases (v2) assigned to pronouns; keep the surface form then.
    lemmas = [
        token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
        if not token.is_oov
    ]

    # Remove stop words and punctuation. `punctuations` is expected to be
    # string.punctuation (a str), so this is a substring test that also
    # discards lemmas left empty by strip().
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations
    ]


def log_experiment_results(experiment_name, stats, filename="experiment_log.md"):
    """
    Appends experiment results and statistics to a markdown log file.

    Any existing pipe table in `filename` is read back, the new row is added
    and the whole table is rewritten.

    Parameters:
    - experiment_name: str, the name of the experiment
    - stats: dict, a dictionary containing the statistics to log; it must
      provide "precision", "recall", "f1-score" and "support". The dict is
      not modified.
    - filename: str, the path to the log file

    Raises:
    - KeyError: if `stats` or the existing log lacks one of the logged columns
    """
    numeric_columns = ["precision", "recall", "f1-score", "support"]
    columns = numeric_columns + ["timestamp", "Experiment Name"]

    # Work on a copy so the caller's dict does not gain extra keys.
    record = dict(stats)
    record["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    record["Experiment Name"] = experiment_name
    new_row = pd.DataFrame([record])

    try:
        # The first data row is the markdown "|---|" separator line. iloc[1:]
        # skips it and, unlike drop(0), tolerates a file holding only a header.
        history = pd.read_table(filename, sep="|", skipinitialspace=True).iloc[1:]
        history.columns = history.columns.str.strip()
        # The leading/trailing pipes produce unnamed edge columns; discard them.
        history = history.loc[:, ~history.columns.str.contains("^Unnamed")].copy()
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
        history = None

    if history is None or history.empty:
        df = new_row
    else:
        # Cells read back from the file are text (the separator row forces
        # string columns). Restore numbers so numeric formatting is applied to
        # numeric values; commas are the thousands separators written below.
        for column in numeric_columns:
            if column in history.columns:
                cleaned = (
                    history[column]
                    .astype(str)
                    .str.replace(",", "", regex=False)
                    .str.strip()
                )
                history[column] = pd.to_numeric(cleaned, errors="coerce")
        df = pd.concat([history, new_row], ignore_index=True)

    df = df[columns]
    markdown_table = tabulate(
        df,
        headers="keys",
        tablefmt="pipe",
        showindex=False,
        floatfmt=".3g",
        intfmt=",",
    )
    with open(filename, "w", encoding="utf-8") as f:
        f.write(markdown_table)


def evaluate_model(y_test, predictions, classes):
    """
    Prints the classification report and plots the confusion matrix.

    Parameters:
    - y_test: list, the true labels
    - predictions: list, the predicted labels
    - classes: list, the list of classes; it fixes the row/column order of the
      confusion matrix, and labels not listed here are left out of the matrix

    Returns:
    - stats: dict, the classification report
    """
    print(classification_report(y_test, predictions))
    stats = classification_report(y_test, predictions, output_dict=True)

    # Pin the matrix order to `classes`. Without `labels`, scikit-learn orders
    # rows/columns by sorted label value, so the tick labels taken from
    # `classes` could end up attached to the wrong rows.
    matrix = confusion_matrix(y_test, predictions, labels=classes)

    # Plot confusion matrix
    _, ax = plt.subplots(figsize=(8, 5))
    ConfusionMatrixDisplay(matrix, display_labels=classes).plot(ax=ax)
    plt.show()
    return stats


In [None]:

import pandas as pd
import datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, ClassLabel, Value
from transformers import DataCollatorWithPadding


## Get data

In [None]:
# Download the ESG report sentences into a DataFrame.
df = pd.read_csv("https://raw.githubusercontent.com/JosPolfliet/vlerick-mai-nlp-2023/main/DATA/esg_reports.csv")
# The training label is the "subject" column; rows without a subject become "Other".
df["labels"] = df["subject"].fillna("Other")
# Show how many rows each label has.
df["labels"].value_counts()

## Transformers model 1

In [None]:
# Load the tokenizer of the same checkpoint that the model cell below loads.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [None]:
# Display the tokenizer's repr as the cell output.
tokenizer

In [None]:
def preprocess_function(examples):
    """Tokenise the "text" entries of `examples` with truncation and padding enabled."""
    texts = examples["text"]
    encoding = tokenizer(texts, padding=True, truncation=True)
    return encoding

# Encode one sample sentence and inspect the result.
sentence = df.iloc[203]["text"]
encoded = preprocess_function({"text": [sentence]})
print(sentence)
print(encoded)

# Print every token id next to the text it decodes to.
for token_id in encoded["input_ids"][0]:
    print(token_id, tokenizer.decode([token_id]))

In [None]:
# Print every token id of the sample sentence next to the text it decodes to.
for token_id in encoded["input_ids"][0]:
    print(token_id, tokenizer.decode([token_id]))

In [None]:
# The label names must match the values of df["labels"]. Rows without a
# subject are filled with "Other" when the data is loaded, so the third class
# is "Other" (it was "None", which never occurs in that column).
class_names = ["Environmental", "Social", "Other"]

# Schema for the Hugging Face datasets: the pandas index, the sentence text,
# and the label stored as a ClassLabel over `class_names`.
esg_classes = Features(
    {
        "__index_level_0__": Value("string"),
        "text": Value("string"),
        "labels": ClassLabel(names=class_names),
    }
)


In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Stratified split with a fixed seed, so both parts keep the label distribution.
df_train, df_test = train_test_split(
    df,
    stratify=df["labels"],
    random_state=22141,
)

# Convert the text/label columns of each part into a Dataset with the ESG schema.
train, test = (
    Dataset.from_pandas(part[["text", "labels"]], features=esg_classes)
    for part in (df_train, df_test)
)
train

In [None]:
# Tokenise both splits in batches.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train, test)
)
tokenized_train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label list instead of a hard-coded 3,
# and store the label names in the model config so that saved checkpoints
# report class names rather than LABEL_0/1/2.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: index for index, name in enumerate(class_names)},
)

In [None]:
# Fine-tuning configuration. Evaluation runs once per epoch; the former
# `eval_steps=5` was dropped because it only applies to step-based evaluation
# and contradicted the epoch strategy.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer's `tokenizer` to `processing_class`;
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
)

# Train on the tokenised training split and evaluate on the tokenised test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

This takes a long time. I trained this in the cloud instead; the results are in the lecture PowerPoint.

In [None]:
# If you are running for real, save your work!
# model.save_pretrained("mymodel")
# model = AutoModelForSequenceClassification.from_pretrained("mymodel")

## Evaluate


In [None]:

experiment_name = "Transformers dummy"

# Predict on the test split and map each highest-scoring class index to its name.
predictions = trainer.predict(tokenized_test)
predicted_ids = predictions.predictions.argmax(-1)
prediction_labels = [class_names[class_id] for class_id in predicted_ids]

# Report the metrics, then append the macro-averaged scores to the experiment log.
stats = evaluate_model(df_test["labels"], prediction_labels, class_names)
log_experiment_results(experiment_name, stats["macro avg"])

In [None]:
# Run inference on the tokenised test split again (same call as in the evaluation cell).
predictions = trainer.predict(tokenized_test)