In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

import gensim
from gensim.models import KeyedVectors


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the topic-annotated dataset, relative to the notebook's working directory.
dataset_path = "../data/topic/topics_aggressive.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,label,text,cleaned_text,topic,topic_name
0,neutral,"According to Gran , the company has no plans t...",according gran company plan move production ru...,5,russia
1,neutral,Technopolis plans to develop in stages an area...,technopolis plan develop stage area less NUM s...,4,sq
2,negative,The international electronic industry company ...,international electronic industry company elco...,51,elcoteq
3,positive,With the new production plant the company woul...,new production plant company would increase ca...,7,paper
4,positive,According to the company 's updated strategy f...,according company updated strategy year NUM NU...,34,basware


In [3]:
def get_word2vec_embeddings(texts, model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Each text is split on whitespace; tokens for which ``w in model`` is false
    are ignored. A text with no usable token (empty string, only unknown
    words, or a non-string entry such as the NaN pandas produces for an empty
    CSV field) is represented by a zero vector.

    Args:
        texts: Iterable of texts (e.g. a list or a pandas Series).
        model: Word-vector lookup supporting ``w in model``, ``model[w]`` and
            ``model.vector_size`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        ``np.ndarray`` of shape ``(len(texts), model.vector_size)``; the shape
        is ``(0, model.vector_size)`` when ``texts`` is empty.
    """
    dim = model.vector_size
    # Use the dtype of the model's stored vectors for the zero fallback so the
    # result dtype does not depend on whether any text needed the fallback.
    # Falls back to float32 when the model exposes no `vectors` array.
    dtype = getattr(getattr(model, "vectors", None), "dtype", np.float32)

    embeddings = []
    for text in tqdm(texts, desc="Calcolo embedding Word2Vec"):
        # Non-string entries (NaN/None) have no tokens instead of raising on .split().
        words = text.split() if isinstance(text, str) else []
        vectors = [model[w] for w in words if w in model]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(dim, dtype=dtype))

    if not embeddings:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, dim), dtype=dtype)
    return np.array(embeddings)


In [4]:
def get_sbert_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """Encode texts with a Sentence-Transformers model.

    Args:
        texts: A single string, or an iterable of strings (list, tuple,
            pandas Series, ...).
        model_name: Name or path of the Sentence-Transformers model to load.
            Defaults to the model this notebook originally hard-coded.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the given input
        (called with ``show_progress_bar=True``).
    """
    # Hand encode() a plain list rather than an arbitrary container: a pandas
    # Series with a non-default index is looked up by label, not by position.
    # A single string is passed through untouched so it is not split into
    # characters.
    if not isinstance(texts, str):
        texts = list(texts)
    model = SentenceTransformer(model_name)
    return model.encode(texts, show_progress_bar=True)


In [5]:
def get_bert_cls_embeddings(texts, model_name="bert-base-uncased", batch_size=32, max_length=128):
    """Embed texts with the first-position hidden state of a Transformers model.

    For every text the embedding is ``last_hidden_state[:, 0, :]``, i.e. the
    final-layer hidden state at token position 0 (the ``[CLS]`` token for
    BERT-style tokenizers).

    Texts are tokenized and run through the model in batches rather than one
    at a time. Padding within a batch may change values at floating-point
    rounding level compared with one text per forward pass; pass
    ``batch_size=1`` to reproduce that behaviour.

    Args:
        texts: Iterable of strings (list, pandas Series, ...).
        model_name: Hugging Face model name or path passed to
            ``AutoTokenizer`` / ``AutoModel``.
        batch_size: Number of texts per forward pass; must be >= 1.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported lazily so the heavy dependencies are only needed when this
    # function is actually used.
    from transformers import AutoTokenizer, AutoModel
    import torch

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    # Materialise the input so batches are sliced by position regardless of
    # the container type (e.g. a pandas Series with a non-default index).
    texts = list(texts)

    all_embeddings = []
    with torch.no_grad():
        # Progress is still reported per text, although work happens per batch.
        with tqdm(total=len(texts), desc="Embedding BERT CLS", unit="text") as progress:
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch,
                    return_tensors="pt",
                    truncation=True,
                    padding=True,
                    max_length=max_length,
                )
                outputs = model(**inputs)
                # Position 0 of every sequence in the batch -> (batch, hidden_size).
                all_embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
                progress.update(len(batch))

    if not all_embeddings:
        # Keep the result 2-D for empty input; width is 0 if the config does
        # not expose `hidden_size`.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.concatenate(all_embeddings, axis=0)


In [6]:
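# Download the GoogleNews-vectors-negative300.bin.gz file and place it in the ../data/embeddings folder.
# The path below points at the .bin file, so decompress the archive first (or adjust model_path).
# Link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g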

# Load the pretrained vectors stored in word2vec binary format.
model_path = "../data/embeddings/GoogleNews-vectors-negative300.bin"
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# Embed every cleaned text and persist the resulting matrix next to the model file.
embeddings_word2vec = get_word2vec_embeddings(texts=df["cleaned_text"], model=word2vec_model)
np.save(file="../data/embeddings/embeddings_word2vec.npy", arr=embeddings_word2vec)

# Report how many vectors were written and their dimensionality.
print(f"{len(embeddings_word2vec)} word2vec embeddings saved.")
print(f"length of each embedding: {len(embeddings_word2vec[0])}")

Calcolo embedding Word2Vec: 100%|██████████| 4821/4821 [00:00<00:00, 7556.41it/s]


4821 word2vec embeddings saved.
length of each embedding: 300


In [7]:
# Encode every cleaned text with the Sentence-Transformers model and persist the result.
embeddings_sbert = get_sbert_embeddings(texts=df["cleaned_text"])
np.save(file="../data/embeddings/embeddings_sbert.npy", arr=embeddings_sbert)

# Report how many vectors were written and their dimensionality.
print(f"{len(embeddings_sbert)} sbert embeddings saved.")
print(f"length of each embedding: {len(embeddings_sbert[0])}")

Batches: 100%|██████████| 151/151 [00:28<00:00,  5.26it/s]

4821 sbert embeddings saved.
length of each embedding: 384





In [8]:
# Embed every cleaned text with the default BERT checkpoint and persist the result.
embeddings_bert = get_bert_cls_embeddings(texts=df["cleaned_text"])
np.save(file="../data/embeddings/embeddings_bert.npy", arr=embeddings_bert)

# Report how many vectors were written and their dimensionality.
print(f"{len(embeddings_bert)} bert embeddings saved.")
print(f"length of each embedding: {len(embeddings_bert[0])}")

Embedding BERT CLS: 100%|██████████| 4821/4821 [04:39<00:00, 17.24text/s]

4821 bert embeddings saved.
length of each embedding: 768





In [9]:
# Same embedding routine as for BERT, but loading the FinBERT-tone checkpoint.
embeddings_finbert = get_bert_cls_embeddings(
    texts=df["cleaned_text"],
    model_name="yiyanghkust/finbert-tone",
)

# Persist the matrix, then report how many vectors were written and their dimensionality.
np.save(file="../data/embeddings/embeddings_finbert.npy", arr=embeddings_finbert)
print(f"{len(embeddings_finbert)} finbert embeddings saved.")
print(f"length of each embedding: {len(embeddings_finbert[0])}")

Embedding BERT CLS: 100%|██████████| 4821/4821 [04:13<00:00, 19.00text/s]


4821 finbert embeddings saved.
length of each embedding: 768
