# Overview

This notebook contains the process of converting news content and news headlines from text into embedding vectors. The process leverages the *transformers* and *sentence_transformers* libraries from Hugging Face. The converted news is related to the BBRI, BBCA, and BMRI stocks.

The following text from each news article is converted:
1. News content
2. News headline

The models used to convert the text are:
1. FinBERT
2. IndoBERT (indobert-base-p2)
3. paraphrase-multilingual-mpnet-base-v2
4. LazarusNLP (all-indo-e5-small-v4)

In [None]:
# Upgrade sentence_transformers in the runtime; the SentenceTransformer models loaded further down are imported from it.
!pip install -U sentence_transformers



In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Read the raw news table of each ticker (one CSV per stock) in a single pass.
bbri_news, bbca_news, bmri_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        "full_raw_bbri_news.csv",
        "full_raw_bbca_news.csv",
        "full_raw_bmri_news.csv",
    )
)

# FinBERT

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load FinBERT as a plain encoder: AutoModel yields the base BERT model, whose
# last_hidden_state is pooled into embeddings by generate_finbert_embedding.
tokenizer_finbert = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model_finbert = AutoModel.from_pretrained("ProsusAI/finbert")

# Check if GPU is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Fix: this used to be `model.to(device)`, but no `model` is defined in this
# notebook. The FinBERT encoder itself must be moved, because the tokenized
# inputs are sent to `device` before the forward pass.
model_finbert.to(device)

Using device: cuda


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def generate_finbert_embedding(text_content, headline, model, tokenizer):
    """
    Embed a news article body and its headline with a BERT-style encoder.

    Both texts are tokenized as one padded batch, passed through ``model`` and
    mean-pooled over their real (non-padding) tokens.

    Args:
        text_content (str): The main text content.
        headline (str): The headline text.
        model: Encoder whose output exposes ``last_hidden_state``
            (the FinBERT ``AutoModel`` in this notebook).
        tokenizer: Tokenizer matching ``model``; it must return an
            ``attention_mask`` next to the input ids.

    Returns:
        tuple: ``(text_embedding, headline_embedding)``, one NumPy vector per text.
    """
    # Body and headline share one batch, so the shorter one gets padded.
    texts = [text_content, headline]
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)

    # Follow the model's own device instead of a notebook-level global, so the
    # inputs can never end up on a different device than the weights.
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state

        # Fix: a plain mean over the sequence axis also averages the padding
        # positions, which distorts the shorter text (normally the headline).
        # Weight every position by the attention mask so only real tokens count.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against an all-padding row
        embeddings = (token_sum / token_count).cpu().numpy()

    text_embedding = embeddings[0]
    headline_embedding = embeddings[1]

    return (text_embedding, headline_embedding)

## BBRI NEWS

In [None]:
# Embed every BBRI article (body + headline) with FinBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bbri_news.iterrows(), total=len(bbri_news), desc="Processing FinBERT: BBRI News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_finbert_embedding(text_content, headline, model_finbert, tokenizer_finbert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# One embedding vector per row, stored next to the source text.
bbri_news['text_embedding_finbert'] = text_embeddings
bbri_news['headline_embedding_finbert'] = headline_embeddings

Processing FinBERT: BBRI News: 0it [00:00, ?it/s]

TypeError: generate_finbert_embedding() missing 2 required positional arguments: 'model' and 'tokenizer'

## BBCA NEWS

In [None]:
# Embed every BBCA article (body + headline) with FinBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bbca_news.iterrows(), total=len(bbca_news), desc="Processing FinBERT: BBCA News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_finbert_embedding(text_content, headline, model_finbert, tokenizer_finbert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# One embedding vector per row, stored next to the source text.
bbca_news['text_embedding_finbert'] = text_embeddings
bbca_news['headline_embedding_finbert'] = headline_embeddings

## BMRI NEWS

In [None]:
# Embed every BMRI article (body + headline) with FinBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bmri_news.iterrows(), total=len(bmri_news), desc="Processing FinBERT: BMRI News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_finbert_embedding(text_content, headline, model_finbert, tokenizer_finbert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# One embedding vector per row, stored next to the source text.
bmri_news['text_embedding_finbert'] = text_embeddings
bmri_news['headline_embedding_finbert'] = headline_embeddings

# IndoBERT

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the standard IndoBERT model (Phase 2) as a plain encoder.
# NOTE(review): an earlier comment here suggested "indobenchmark/indobert-base-p1"
# as a sentiment-focused alternative; that cannot be confirmed from this
# notebook, so check the model cards before switching.
model_name = "indobenchmark/indobert-base-p2"
tokenizer_indobert = AutoTokenizer.from_pretrained(model_name)
model_indobert = AutoModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Fix: this used to be `model.to(device)`, but no `model` is defined in this
# notebook. The IndoBERT encoder itself must be moved, because the tokenized
# inputs are sent to `device` before the forward pass.
model_indobert.to(device)

In [None]:
def generate_indobert_embedding(text_content, headline, model, tokenizer):
    """
    Embed a news article body and its headline with a BERT-style encoder.

    Both texts are tokenized as one padded batch, passed through ``model`` and
    mean-pooled over their real (non-padding) tokens.

    Args:
        text_content (str): The main text content.
        headline (str): The headline text.
        model: Encoder whose output exposes ``last_hidden_state``
            (the IndoBERT ``AutoModel`` in this notebook).
        tokenizer: Tokenizer matching ``model``; it must return an
            ``attention_mask`` next to the input ids.

    Returns:
        tuple: ``(text_embedding, headline_embedding)``, one NumPy vector per text.
    """
    # Body and headline share one batch, so the shorter one gets padded.
    texts = [text_content, headline]
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)

    # Follow the model's own device instead of a notebook-level global, so the
    # inputs can never end up on a different device than the weights.
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state

        # Fix: a plain mean over the sequence axis also averages the padding
        # positions, which distorts the shorter text (normally the headline).
        # Weight every position by the attention mask so only real tokens count.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against an all-padding row
        embeddings = (token_sum / token_count).cpu().numpy()

    text_embedding = embeddings[0]
    headline_embedding = embeddings[1]

    return (text_embedding, headline_embedding)

## BBRI NEWS

In [None]:
# Embed every BBRI article (body + headline) with IndoBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bbri_news.iterrows(), total=len(bbri_news), desc="Processing IndoBERT: BBRI News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_indobert_embedding(text_content, headline, model_indobert, tokenizer_indobert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# Fix: these were written to the *_finbert columns, which overwrote the FinBERT
# embeddings and left the *_indobert columns read by the daily aggregation missing.
bbri_news['text_embedding_indobert'] = text_embeddings
bbri_news['headline_embedding_indobert'] = headline_embeddings

## BBCA NEWS

In [None]:
# Embed every BBCA article (body + headline) with IndoBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bbca_news.iterrows(), total=len(bbca_news), desc="Processing IndoBERT: BBCA News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_indobert_embedding(text_content, headline, model_indobert, tokenizer_indobert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# Fix: these were written to the *_finbert columns, which overwrote the FinBERT
# embeddings and left the *_indobert columns read by the daily aggregation missing.
bbca_news['text_embedding_indobert'] = text_embeddings
bbca_news['headline_embedding_indobert'] = headline_embeddings

## BMRI NEWS

In [None]:
# Embed every BMRI article (body + headline) with IndoBERT, one row at a time.
text_embeddings = []
headline_embeddings = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise the progress bar can only show a running counter.
for index, row in tqdm(bmri_news.iterrows(), total=len(bmri_news), desc="Processing IndoBERT: BMRI News"):
    text_content = row['text_content']
    headline = row['headline']

    (text_embedding, headline_embedding) = generate_indobert_embedding(text_content, headline, model_indobert, tokenizer_indobert)

    text_embeddings.append(text_embedding)
    headline_embeddings.append(headline_embedding)

# Fix: these were written to the *_finbert columns, which overwrote the FinBERT
# embeddings and left the *_indobert columns read by the daily aggregation missing.
bmri_news['text_embedding_indobert'] = text_embeddings
bmri_news['headline_embedding_indobert'] = headline_embeddings

# paraphrase-multilingual-mpnet-base-v2

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.notebook import tqdm

# Sentence-embedding model behind the *_multilingual_mpnet columns; the cells below call its encode() on each text.
model_multilingual = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

## BBRI NEWS

In [None]:
# Encode the BBRI article bodies, then the headlines, with the multilingual
# mpnet model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BBRI news content"),
    ('headline', 'headline_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BBRI news headline"),
):
    tqdm.pandas(desc=bar_label)
    bbri_news[target_column] = bbri_news[source_column].progress_apply(
        lambda text: model_multilingual.encode(text)
    )

## BBCA NEWS

In [None]:
# Encode the BBCA article bodies, then the headlines, with the multilingual
# mpnet model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BBCA news content"),
    ('headline', 'headline_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BBCA news headline"),
):
    tqdm.pandas(desc=bar_label)
    bbca_news[target_column] = bbca_news[source_column].progress_apply(
        lambda text: model_multilingual.encode(text)
    )

## BMRI NEWS

In [None]:
# Encode the BMRI article bodies, then the headlines, with the multilingual
# mpnet model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BMRI news content"),
    ('headline', 'headline_embedding_multilingual_mpnet', "Processing multilingual-mpnet: BMRI news headline"),
):
    tqdm.pandas(desc=bar_label)
    bmri_news[target_column] = bmri_news[source_column].progress_apply(
        lambda text: model_multilingual.encode(text)
    )

# LazarusNLP / all-indo-e5-small-v4

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.notebook import tqdm

# Sentence-embedding model behind the *_lazarus columns; the cells below call its encode() on each text.
model_lazarus = SentenceTransformer('LazarusNLP/all-indo-e5-small-v4')

## BBRI NEWS

In [None]:
# Encode the BBRI article bodies, then the headlines, with the LazarusNLP
# model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_lazarus', "Processing LazarusNLP: BBRI news content"),
    ('headline', 'headline_embedding_lazarus', "Processing LazarusNLP: BBRI news headline"),
):
    tqdm.pandas(desc=bar_label)
    bbri_news[target_column] = bbri_news[source_column].progress_apply(
        lambda text: model_lazarus.encode(text)
    )

## BBCA NEWS

In [None]:
# Encode the BBCA article bodies, then the headlines, with the LazarusNLP
# model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_lazarus', "Processing LazarusNLP: BBCA news content"),
    ('headline', 'headline_embedding_lazarus', "Processing LazarusNLP: BBCA news headline"),
):
    tqdm.pandas(desc=bar_label)
    bbca_news[target_column] = bbca_news[source_column].progress_apply(
        lambda text: model_lazarus.encode(text)
    )

## BMRI NEWS

In [None]:
# Encode the BMRI article bodies, then the headlines, with the LazarusNLP
# model; each pass gets its own labelled progress bar.
for source_column, target_column, bar_label in (
    ('text_content', 'text_embedding_lazarus', "Processing LazarusNLP: BMRI news content"),
    ('headline', 'headline_embedding_lazarus', "Processing LazarusNLP: BMRI news headline"),
):
    tqdm.pandas(desc=bar_label)
    bmri_news[target_column] = bmri_news[source_column].progress_apply(
        lambda text: model_lazarus.encode(text)
    )

# FINAL RESULT

In [None]:
# Write each ticker's per-article table (source columns plus embedding columns)
# to its own CSV, without the index.
# NOTE(review): the embedding cells hold array objects, which CSV presumably
# stores as their printed text; confirm the downstream reader can parse that.
for news_frame, csv_path in (
    (bbri_news, "bbri_news_with_embeddings.csv"),
    (bbca_news, "bbca_news_with_embeddings.csv"),
    (bmri_news, "bmri_news_with_embeddings.csv"),
):
    news_frame.to_csv(csv_path, index=False)

In [None]:
from google.colab import files

# Download the per-article embedding CSVs (one per ticker, covering every
# model defined above) from the Colab runtime.
for csv_name in (
    "bbri_news_with_embeddings.csv",
    "bbca_news_with_embeddings.csv",
    "bmri_news_with_embeddings.csv",
):
    files.download(csv_name)

# Aggregate the news embedding into daily data

## BBRI News

In [None]:
import numpy as np

# Daily BBRI embeddings: for every date, average all article vectors of each
# embedding column element-wise into a single vector.
bbri_embedding_columns = (
    'text_embedding_finbert',
    'headline_embedding_finbert',
    'text_embedding_indobert',
    'headline_embedding_indobert',
    'text_embedding_multilingual_mpnet',
    'headline_embedding_multilingual_mpnet',
    'text_embedding_lazarus',
    'headline_embedding_lazarus',
)

bbri_by_date = bbri_news.groupby('date')
aggregated_bbri_embeddings = bbri_by_date.agg(
    **{
        # Named aggregation: output column keeps the name of its source column.
        column: (column, lambda vectors: np.mean(list(vectors), axis=0))
        for column in bbri_embedding_columns
    }
).reset_index()

print(aggregated_bbri_embeddings.shape)

## BBCA News

In [None]:
import numpy as np

# Daily BBCA embeddings: for every date, average all article vectors of each
# embedding column element-wise into a single vector.
bbca_embedding_columns = (
    'text_embedding_finbert',
    'headline_embedding_finbert',
    'text_embedding_indobert',
    'headline_embedding_indobert',
    'text_embedding_multilingual_mpnet',
    'headline_embedding_multilingual_mpnet',
    'text_embedding_lazarus',
    'headline_embedding_lazarus',
)

bbca_by_date = bbca_news.groupby('date')
aggregated_bbca_embeddings = bbca_by_date.agg(
    **{
        # Named aggregation: output column keeps the name of its source column.
        column: (column, lambda vectors: np.mean(list(vectors), axis=0))
        for column in bbca_embedding_columns
    }
).reset_index()

print(aggregated_bbca_embeddings.shape)

## BMRI News

In [None]:
import numpy as np

# Daily BMRI embeddings: for every date, average all article vectors of each
# embedding column element-wise into a single vector.
bmri_embedding_columns = (
    'text_embedding_finbert',
    'headline_embedding_finbert',
    'text_embedding_indobert',
    'headline_embedding_indobert',
    'text_embedding_multilingual_mpnet',
    'headline_embedding_multilingual_mpnet',
    'text_embedding_lazarus',
    'headline_embedding_lazarus',
)

bmri_by_date = bmri_news.groupby('date')
aggregated_bmri_embeddings = bmri_by_date.agg(
    **{
        # Named aggregation: output column keeps the name of its source column.
        column: (column, lambda vectors: np.mean(list(vectors), axis=0))
        for column in bmri_embedding_columns
    }
).reset_index()

print(aggregated_bmri_embeddings.shape)

## Save to csv file

In [None]:
# Persist the date-level (daily averaged) embedding tables, one CSV per ticker,
# without the index.
for daily_frame, csv_path in (
    (aggregated_bbri_embeddings, "bbri_embedding_daily_full.csv"),
    (aggregated_bbca_embeddings, "bbca_embedding_daily_full.csv"),
    (aggregated_bmri_embeddings, "bmri_embedding_daily_full.csv"),
):
    daily_frame.to_csv(csv_path, index=False)