# Imports and installs

In [60]:
# Star-import the course helper module. Names that later cells use without
# defining or importing them (e.g. `spacy`, `nlp`, `evaluate_model`,
# `log_experiment_results`) presumably come from here — verify against the helper file.
try:
    from mai_nlp_helper_functions import *
except ImportError as e:
    # Re-raise with setup instructions; the original error stays chained as the cause.
    raise ImportError("You don't have the mai_nlp_helper_functions.py file in the same directory as your notebook. Either add it, or copy paste the contents in this cell") from e


In [61]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Registers tqdm's progress_* methods on pandas objects; the feature-extraction
# cell further down calls df.progress_apply and needs this to have run first.
tqdm.pandas()

## Get data

In [62]:
# Read the ESG reports CSV and label every row that has no subject as "Other".
df = pd.read_csv(
    "https://media.githubusercontent.com/media/JosPolfliet/vlerick-mai-nlp-2023/main/DATA/esg_reports.csv"
).assign(subject=lambda frame: frame["subject"].fillna("Other"))
df

Unnamed: 0,text,subject
0,"Since its inception in 2010, we have distribut...",Social
1,"I will focus first on traffic, which is a key ...",Other
2,"Also, many farms are aged with degraded soil t...",Environmental
3,"Alaska 2009 STEWARDSHIP REPORT54 cOnSERvATIOn,...",Social
4,The company’s U.S. GOM operations can be impac...,Other
...,...,...
2180,American shad are a species of concern for res...,Environmental
2181,Weyerhaeuser's policies address best practices...,Other
2182,compares recently observed trends in weather p...,Environmental
2183,"If our partners, including our licensors, suff...",Other


## Clean data

In [63]:
def spacy_tokenizer(sentence):
    """
    Tokenise and lemmatise the text of one record using spaCy.

    The spaCy pipeline lives in the module-level ``nlp`` global and is loaded
    lazily: when ``nlp`` is missing or falsy, ``en_core_web_trf`` is loaded
    (and downloaded first if it is not installed) and the ``stop_words``
    global is set to spaCy's English stop-word list.

    Parameters:
    - sentence: mapping-like object with a "text" entry (a dict or a DataFrame
      row); the text is lower-cased before it is passed to the pipeline
    Returns:
    - tokens: list of str, the stripped lemma of every token that spaCy does
      not flag as out-of-vocabulary (punctuation and stop words are kept)
    """
    global nlp
    global stop_words
    # Look the name up through globals(): on a fresh kernel where nothing has
    # defined `nlp` yet, a bare `if not nlp` would raise NameError.
    if not globals().get("nlp"):
        # NOTE(review): embed_words reads static vectors from nlp.vocab.vectors;
        # confirm the model loaded here actually ships them (transformer
        # pipelines typically do not).
        model_name = "en_core_web_trf"
        try:
            nlp = spacy.load(model_name)
        except OSError:
            # spacy.load raises OSError when the model package is not installed.
            # Catching only that keeps Ctrl-C and unrelated failures from
            # silently triggering a download.
            spacy.cli.download(model_name)
            nlp = spacy.load(model_name)
        stop_words = spacy.lang.en.stop_words.STOP_WORDS

    doc = nlp(sentence["text"].lower())

    # Drop out-of-vocabulary tokens and map each remaining token to its lemma.
    # "-PRON-" is the placeholder lemma older spaCy releases used for pronouns;
    # fall back to the lower-cased surface form in that case.
    tokens = [
        token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
        if not token.is_oov
    ]

    # Stop-word removal is currently disabled:
    # tokens = [
    #     word for word in tokens if word not in stop_words and word not in punctuations
    # ]

    return tokens


In [64]:
# Select the example row here so this cell also runs on a fresh kernel
# (Restart & Run All); `sentence` was otherwise only assigned in a later cell.
sentence = df.iloc[301]
print(sentence["text"])

The geographic analysis in the Business Review, including the average balance sheet and interest rates, changes in net interest income and average interest rates, yields, spreads and margins in this report have generally been compiled on the basis of location of office - UK and overseas – unless indicated otherwise.


In [65]:
# Run the tokenizer on one hand-written example and show one token per line.
example_text = "The geographic analysis in the Business Review, including the average balance sheet and interest rates, changes in net interest income and average interest rates, yields, spreads and margins in this report have generally been compiled on the basis of location of office - UK and overseas – unless indicated otherwise."
example_tokens = spacy_tokenizer({"text": example_text})
for lemma in example_tokens:
    print(lemma)

the
geographic
analysis
in
the
business
review
,
include
the
average
balance
sheet
and
interest
rate
,
change
in
net
interest
income
and
average
interest
rate
,
yield
,
spread
and
margin
in
this
report
have
generally
be
compile
on
the
basis
of
location
of
office
-
uk
and
overseas
–
unless
indicate
otherwise
.


In [66]:

def embed_words(sentence):
    """
    Look up one word vector per token of a record.

    Parameters:
    - sentence: record with a "text" entry, passed straight to spacy_tokenizer
    Returns:
    - the result of nlp.vocab.vectors.get_batch for the record's tokens
      (a 2-D array, one row per token, in the example shown in this notebook)
    """
    # Tokenise first: spacy_tokenizer may set the global `nlp` that is read next.
    lemmas = spacy_tokenizer(sentence)
    vector_table = nlp.vocab.vectors
    return vector_table.get_batch(lemmas)
# Embed one example row and show the per-token vectors followed by their shape.
sentence = df.iloc[301]
word_embeddings = embed_words(sentence)
print(word_embeddings, word_embeddings.shape, sep="\n")

[[-5.1043    2.3496    3.2472   ... -7.6875   -2.5128    0.69342 ]
 [-3.0683    0.8136   -0.29935  ... -2.0391   -0.9999    1.9089  ]
 [ 0.65773  -1.1594    1.1196   ... -3.4405    2.6652    0.34709 ]
 ...
 [-2.0586    0.34812   0.84462  ... -3.4597   -2.9688    1.4875  ]
 [-0.6902    0.59066  -1.8021   ... -1.4649   -2.0703    4.1519  ]
 [-0.076454 -4.6896   -4.0431   ...  1.304    -0.52699  -1.3622  ]]
(55, 300)


## Simple model: just average the words

In [None]:
def embed_sentence_simple(sentence):
    """
    Collapse the per-token vectors of a record into one vector by averaging them.

    This is the place to experiment: add features, change the aggregation, use PCA, ...
    """
    token_vectors = embed_words(sentence)

    # Alternative aggregations to try instead of the mean:
    #   np.concatenate([token_vectors.min(axis=0), token_vectors.max(axis=0)])
    #   np.concatenate([token_vectors.min(axis=0),
    #                   token_vectors.max(axis=0),
    #                   [token_vectors.shape[0],
    #                    len([token for token in nlp(sentence["text"]) if token.is_oov])]])
    return token_vectors.mean(axis=0)

# Average the example row's token vectors and show the result with its shape.
sentence_embedding = embed_sentence_simple(sentence)
print(sentence_embedding, sentence_embedding.shape, sep="\n")

In [None]:
%%time

df["features_simple"] = df.progress_apply(embed_sentence_simple,axis=1)

# For bigger datasets, you can write the results to disk to cache them
# df.to_pickle("DATA/df_with_features.pkl")

In [None]:
# Load cached results
# df = pd.read_pickle("DATA/df_with_features.pkl")

## Classify

In [None]:
# Stack the per-row embedding vectors into one 2-D matrix; labels become a plain array.
feature_column = df["features_simple"]
X = np.stack(feature_column.to_numpy())
y = np.array(df["subject"])


In [None]:
# Fixed random_state keeps the train/test partition reproducible between runs.
splits = train_test_split(X, y, random_state=22141)
X_train, X_test, y_train, y_test = splits
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)



In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(max_depth=3, random_state=22141, class_weight="balanced", n_estimators=500)
clf.fit(X_train, y_train)



## Evaluate

In [None]:

# Ask for a label first so the logged result can be told apart from other runs.
experiment_name = input("Enter experiment name: ")

# Score the held-out split; evaluate_model and log_experiment_results are not
# defined in this notebook (presumably supplied by the helper module).
predictions = clf.predict(X_test)
label_order = clf.classes_
stats = evaluate_model(y_test, predictions, label_order)

# Only the "macro avg" entry of the evaluation result is logged.
macro_average = stats["macro avg"]
log_experiment_results(experiment_name, macro_average)

## What's next?

The example above illustrates the theory of embeddings. In practice, you can build pipelines very easily with spaCy (if you prioritize speed) or Hugging Face Transformers (if you prioritize accuracy).

https://medium.com/@ycouble/training-and-integrating-a-custom-text-classifier-to-a-spacy-pipeline-b19e6a132487