# Imports

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import spacy
import pandas as pd
import numpy as np
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy.vectors import Vectors
from tqdm import tqdm
tqdm.pandas()


In [None]:
# Load the ESG report sentences; rows without a subject label are assigned to "Other".
df = pd.read_csv("https://media.githubusercontent.com/media/JosPolfliet/vlerick-mai-nlp-2023/main/DATA/esg_reports.csv")
df["subject"] = df["subject"].where(df["subject"].notna(), "Other")
df

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

## Get data

In [None]:
# Print rows 10 through 14 in full to get a feel for the individual records.
for row_number in range(10, 15):
    row = df.iloc[row_number]
    print(row)

We'll use the following example sentence

In [None]:
# Row 67 serves as the running example; show its raw text.
sentence = df.iloc[67]
sentence["text"]

## Clean data

In [None]:
# Shared resources for the tokenizer: the spaCy pipeline, the ASCII
# punctuation characters and spaCy's English stop-word set.
nlp = spacy.load('en_core_web_md')

punctuations = string.punctuation
stop_words = STOP_WORDS

def spacy_tokenizer(sentence):
    """
    Turn one dataframe row into a list of cleaned, lemmatised tokens.

    Parameters:
    - sentence: a row (or any mapping) whose "text" entry holds the raw string

    Returns:
    - list of str: lower-cased lemmas with out-of-vocabulary words, stop words,
      punctuation-only tokens and empty strings removed
    """
    # Lower-case first, then let the module-level spaCy pipeline annotate the text.
    doc = nlp(sentence["text"].lower())

    # Drop words the model has no vector for, then lemmatise what is left.
    # "-PRON-" is the placeholder lemma that spaCy v2 assigns to pronouns; keep
    # the lower-cased surface form for those instead of the placeholder.
    lemmas = [
        token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
        if not token.is_oov
    ]

    # Remove stop words and any token made up only of punctuation characters.
    # The character-wise test (rather than `lemma not in punctuations`, which is
    # a substring test against the punctuation string) also removes runs such as
    # "..." or "--". `any()` over an empty string is False, so empty lemmas left
    # behind by `.strip()` are dropped as well.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words
        and any(char not in punctuations for char in lemma)
    ]

# Show the cleaned tokens produced for the example row.
spacy_tokenizer(sentence)

## Calculate word frequencies

In [None]:
from collections import Counter

# Corpus-wide token counts, filled in row by row below.
tokenFreq = Counter()


def countWordFrequencies(example):
    """Tokenise one dataframe row and add its tokens to the global `tokenFreq` counter."""
    tokenFreq.update(spacy_tokenizer(example))


# Row-wise pass over the whole dataframe with a tqdm progress bar.
df.progress_apply(countWordFrequencies, axis=1)


In [None]:
# Total number of token occurrences counted over all rows (Counter.total() needs Python 3.10+).
tokenFreq.total()

In [None]:
# Number of distinct tokens seen in the corpus.
len(tokenFreq)

In [None]:
# Length of the bag-of-words vector: index 0 is reserved for "<oov>", the
# remaining VOCAB_SIZE - 1 slots go to the most frequent tokens.
VOCAB_SIZE = 1000

In [None]:
# Rank once, then show both ends of the VOCAB_SIZE most common tokens.
top_tokens = tokenFreq.most_common(VOCAB_SIZE)

print("Most frequent")
print(top_tokens[:10])
print("Least frequent")
print(top_tokens[-10:])


In [None]:
# token -> index for the VOCAB_SIZE - 1 most frequent tokens, numbered from 1;
# index 0 is kept for everything else.
tokenMap = {
    token: index
    for index, (token, _) in enumerate(tokenFreq.most_common(VOCAB_SIZE - 1), start=1)
}
tokenMap["<oov>"] = 0

# index -> token, for reading embeddings back.
tokenMapInverse = {index: token for token, index in tokenMap.items()}

print(list(tokenMap.items())[:10])
print([tokenMapInverse[index] for index in range(10)])

## Simple model - BOW

In [None]:
def embed_sentence_BOW(example):
    """
    Bag-of-words embedding of one dataframe row.

    Returns a NumPy vector of length VOCAB_SIZE in which position i holds the
    number of times the token with index i occurs in the row's text.
    """
    counts = np.zeros(VOCAB_SIZE)
    for token in spacy_tokenizer(example):
        # Tokens missing from the vocabulary all land in slot 0 (out of vocabulary).
        counts[tokenMap.get(token, 0)] += 1
    return counts

# Embed a single example row and spell out what the non-zero entries mean.
sentence = df.iloc[302]
sentence_embedding = embed_sentence_BOW(sentence)

print(sentence_embedding)
print(sentence_embedding.shape)
print(sentence)
print("What this means: ")

for i, count in enumerate(sentence_embedding):
    if count:
        print(f"Token {i} '{tokenMapInverse[i]}' occurs {count} time in sentence.")

In [None]:
# Embed every row (with a progress bar) and keep the count vectors in a new "embedding" column.
df["embedding"] = df.progress_apply(embed_sentence_BOW, axis=1)
df

## Classify

In [None]:
# Stack the per-row embedding vectors into one 2-D feature matrix (one row per example).
X = np.stack(df["embedding"])
X.shape

In [None]:
# Row 302 of the feature matrix, kept 2-D by slicing instead of indexing.
X[302:303,]

In [None]:
# Target labels: the "subject" column as a NumPy array.
y = np.array(df["subject"])
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set (scikit-learn's default split size) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22141)

for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")


In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Shallow random forest; class_weight="balanced" reweights classes inversely to their frequency.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=2,
    class_weight="balanced",
    random_state=22141,
)
clf.fit(X_train, y_train)

print("Done")

## Evaluate

In [None]:
from sklearn.metrics import (ConfusionMatrixDisplay, classification_report,
                             confusion_matrix)
from matplotlib import pyplot as plt
import datetime
from tabulate import tabulate

def log_experiment_results(experiment_name, stats, filename="experiment_log.md"):
    """
    Appends experiment results and statistics to a markdown log file.

    The existing log (a pipe-format markdown table) is read back, the new row
    is appended, and the whole table is rewritten.

    Parameters:
    - experiment_name: str, the name of the experiment
    - stats: dict, a dictionary containing the statistics to log; it must
      provide "precision", "recall", "f1-score" and "support". The dict is
      not modified.
    - filename: str, the path to the log file

    Raises:
    - KeyError: if one of the required statistics is missing from the table
    """
    # Work on a copy so the caller's dict (e.g. an entry of the classification
    # report) does not silently gain the bookkeeping keys below.
    row = dict(stats)
    row["timestamp"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    row["Experiment Name"] = experiment_name
    try:
        # Parse the markdown table as "|"-separated text. The first parsed row
        # is the "|---|" separator line under the header, hence drop(0).
        df = pd.read_table(filename, sep="|", skipinitialspace=True, encoding="utf-8").drop(0)
        df.columns = df.columns.str.strip()
        # The leading and trailing "|" of every line produce empty, unnamed columns.
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # No usable log yet: start from an empty table.
        df = pd.DataFrame(columns=list(row.keys()))

    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    df = df[["precision", "recall", "f1-score", "support", "timestamp", "Experiment Name"]]
    markdown_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False, floatfmt=".3g", intfmt=",")
    # Write with the same encoding the file is read with above, so non-ASCII
    # experiment names survive the round trip on any platform.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(markdown_table)

def evaluate_model(y_test, predictions, clf):
    """
    Print a classification report and plot the confusion matrix.

    Parameters:
    - y_test: array-like, the true labels
    - predictions: array-like, the labels predicted for the same samples
    - clf: fitted classifier; its `classes_` attribute supplies the label names

    Returns:
    - dict, the classification report as produced with `output_dict=True`
    """
    stats = classification_report(y_test, predictions, output_dict=True)
    print(classification_report(y_test, predictions))

    # Use one explicit label list for both the matrix and its tick labels.
    # Left to itself, confusion_matrix only uses labels that occur in
    # y_test/predictions, which no longer lines up with clf.classes_ when a
    # class is missing from the test split (or only appears there).
    labels = np.union1d(clf.classes_, y_test)

    # Plot confusion matrix
    _, ax = plt.subplots(figsize=(8, 5))
    cmp = ConfusionMatrixDisplay(
        confusion_matrix(y_test, predictions, labels=labels),
        display_labels=labels,
    )

    cmp.plot(ax=ax)
    plt.show()
    return stats

In [None]:
# Ask for a label for this run, score the held-out set, and append the
# macro-averaged metrics to the markdown experiment log.
experiment_name = input("Enter experiment name: ")
predictions = clf.predict(X_test)
stats = evaluate_model(y_test, predictions, clf)
log_experiment_results(experiment_name, stats["macro avg"])

## What's next?
### Fundamental limitations of the BOW approach
1. We're just counting words, so the order of words in a sentence is ignored.
2. Many tokens are out of vocabulary (OOV); they are all lumped into a single `<oov>` count, so their meaning is lost.
3. Every word gets equal weight, even though some words are more important than others.
