# Sentence embeddings
We will mainly use `sentence-transformers`, which is a dedicated package from Hugging Face 🤗. 

Relevant documentation
- Semantic textual similarity https://www.sbert.net/docs/usage/semantic_textual_similarity.html
- Semantic search https://www.sbert.net/examples/applications/semantic-search/README.html

### From word embeddings to sentence embeddings

In [None]:
# Star-import the course helper module. Later cells call evaluate_model and
# log_experiment_results, which are not defined anywhere in this notebook, so
# they presumably come from this module — TODO confirm.
try:
    from mai_nlp_helper_functions import *
except ImportError as e:
    # Re-raise with setup instructions, keeping the original error as the cause.
    raise ImportError("You don't have the mai_nlp_helper_functions.py file in the same directory as your note book. Either add it, or copy paste the contents in this cell") from e

In [None]:
# NOTE(review): nest_asyncio patches asyncio so that nested event loops are
# allowed. No visible cell uses asyncio directly, so this is presumably needed
# by the helper module or the notebook runtime — confirm it is still required.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Register tqdm's pandas integration, which provides the progress_apply method.
tqdm.pandas()

In [None]:
# Download the ESG reports CSV directly from GitHub.
ESG_REPORTS_URL = "https://media.githubusercontent.com/media/JosPolfliet/vlerick-mai-nlp-2023/main/DATA/esg_reports.csv"
df = pd.read_csv(ESG_REPORTS_URL)

# Rows with a missing subject get the catch-all label "Other".
df["subject"] = df["subject"].fillna("Other")
df

In [None]:
from sentence_transformers import SentenceTransformer, util

# Pretrained model catalogue: https://www.sbert.net/docs/pretrained_models.html
MODEL_NAME = 'BAAI/bge-large-en-v1.5'
model = SentenceTransformer(MODEL_NAME)

# Pick one row of the dataset as a worked example. Note that `sentence` is a
# whole DataFrame row; the text to encode lives under its "text" entry.
EXAMPLE_ROW = 302
sentence = df.iloc[EXAMPLE_ROW]

# model.encode() turns the text into its embedding.
embedding = model.encode(sentence["text"])
print(f"Shape: {embedding.shape}")
embedding

A sentence embedding is just a vector representing the whole sentence at once, so we can use it directly as features too.

In [None]:
def embed_sentence_lm(sentence):
    """Embed the ``"text"`` entry of *sentence* with the notebook-level ``model``.

    Args:
        sentence: Any object that can be indexed with ``"text"``; in this
            notebook it is a single row of ``df``.

    Returns:
        Whatever ``model.encode`` returns for that text.
    """
    return model.encode(sentence["text"])

# Try the helper on the example row: show the vector first, then its shape.
sentence_embedding = embed_sentence_lm(sentence)
print(sentence_embedding, sentence_embedding.shape, sep="\n")

In [None]:
%%time

df["features_simple"] = df.progress_apply(embed_sentence_lm,axis=1)


In [None]:
# Feature matrix: stack the per-row embedding arrays along a new first axis.
X = np.stack(df["features_simple"].values)
# Labels: the subject column as a NumPy array.
y = np.array(df["subject"])


In [None]:
# Split features and labels into train and test parts; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22141)

# Report the shape of each part.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{part_name} shape: {part.shape}")


In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(max_depth=3, random_state=22141, class_weight="balanced", n_estimators=600)
clf.fit(X_train, y_train)


In [None]:
# Inspect a single row of the dataset (positional index 3).
df.iloc[3]

In [None]:

# Class probabilities for one training sample (position 1). The sample is
# reshaped into a one-row batch because predict_proba expects 2-D input.
single_sample = X_train[1].reshape(1, -1)
print(clf.predict_proba(single_sample))

In [None]:

# Ask for the name under which this run is logged.
experiment_name = input("Enter experiment name: ")
# Predict labels for the held-out test split.
predictions = clf.predict(X_test)
# evaluate_model and log_experiment_results are not defined in this notebook;
# presumably they come from mai_nlp_helper_functions — TODO confirm their contracts.
stats = evaluate_model(y_test, predictions, clf.classes_)
# Only the "macro avg" entry of the returned stats is passed on for logging.
log_experiment_results(experiment_name, stats["macro avg"])