# 3.2 - Data Reconciliation

In [None]:
#import
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# load the cleaned dataset from parquet and remember which columns it came
# with, so columns added later can be dropped again before exporting
df_cleaned = pd.read_parquet(
    '../data/processed/cleaned_data.parquet',
    engine='pyarrow',
)
original_columns = df_cleaned.columns

In [None]:
# set up the sentence-embedding model
model_name = "nickprock/sentence-bert-base-italian-xxl-uncased"

# pick the compute device: "mps" (Apple Silicon GPU) is tried first, then
# "cuda" (Nvidia GPU); "cpu" is the fallback when no GPU is available
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# load the tokenizer and the model, then move the model to the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

In [None]:
# extract the list of unique, non-empty topics (first-seen order is kept)
# dict.fromkeys de-duplicates in insertion order with constant-time lookups,
# instead of re-scanning the result list for every topic of every article;
# this requires the topic values to be hashable (e.g. strings)
topics = list(dict.fromkeys(
    topic
    for article_topics in df_cleaned['topics']
    for topic in article_topics
    if topic
))

In [None]:
# function to calculate a single embedding for an article from its text chunks
def sentence_embedding(tokenizer, model, device, sentences_list):
    """Return one embedding vector for an article given its text chunks.

    Every element of ``sentences_list`` is tokenized and passed through
    ``model`` on its own. The token vectors of each sequence are mean-pooled
    into one vector, and all resulting vectors are averaged into a single
    article embedding.

    Pooling is weighted by the tokenizer's attention mask, so padding tokens
    (present when an element is a batch of strings of different lengths) do
    not dilute the average. For an unpadded input this equals a plain mean.

    Args:
        tokenizer: Callable that tokenizes a string (or list of strings) and
            returns a mapping of PyTorch tensors; it is called with
            ``padding=True, truncation=True, return_tensors="pt"``.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``
            (assumed to be shaped ``(batch, tokens, hidden)``).
        device: Device the tokenized tensors are moved to before inference.
        sentences_list: Iterable of text chunks. It must not be empty:
            ``torch.cat`` cannot concatenate an empty list.

    Returns:
        numpy.ndarray: The article embedding, averaged over all sequences.
    """
    # one pooled vector per tokenized sequence, collected across all chunks
    pooled_vectors = []

    for sentences in sentences_list:
        # tokenize the chunk and move its tensors to the inference device
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        # model inference (no gradients are needed)
        with torch.no_grad():
            model_output = model(**encoded_input)

        token_embeddings = model_output.last_hidden_state
        attention_mask = encoded_input.get("attention_mask")
        if attention_mask is None:
            # no mask available: fall back to a plain mean over the tokens
            pooled = token_embeddings.mean(dim=1)
        else:
            # masked mean: padded positions contribute nothing to the sum and
            # are not counted in the denominator
            mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against /0
            pooled = (token_embeddings * mask).sum(dim=1) / token_counts
        pooled_vectors.append(pooled)

    # average the results to obtain a single embedding for the article
    article_embedding = torch.cat(pooled_vectors).mean(dim=0).cpu().numpy()

    return article_embedding

# compute one embedding per article from its chunked text
article_chunks = df_cleaned['text chunked']
df_cleaned['sentence embeddings'] = article_chunks.apply(
    lambda chunks: sentence_embedding(tokenizer, model, device, chunks)
)

In [None]:
def topic_embedding(tokenizer, model, topic, device=None):
    """Return the embedding vector of a topic label.

    The topic is tokenized, passed through ``model``, and its token vectors
    are mean-pooled (weighted by the attention mask, so any padding tokens
    are ignored) into a single vector.

    Args:
        tokenizer: Callable that tokenizes the topic and returns a mapping of
            PyTorch tensors; it is called with
            ``return_tensors="pt", padding=True, truncation=True``.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``
            (assumed to be shaped ``(batch, tokens, hidden)``).
        topic: Topic text to embed.
        device: Device the tokenized tensors are moved to. When omitted, the
            device reported by ``model.device`` is used, so the function does
            not depend on a module-level ``device`` variable.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions removed.
    """
    if device is None:
        # keep the inputs on the same device as the model
        device = model.device

    encoded_input = tokenizer(topic, return_tensors="pt", padding=True, truncation=True)
    encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

    # model inference (no gradients are needed)
    with torch.no_grad():
        model_output = model(**encoded_input)

    token_embeddings = model_output.last_hidden_state
    attention_mask = encoded_input.get("attention_mask")
    if attention_mask is None:
        # no mask available: fall back to a plain mean over the tokens
        pooled = token_embeddings.mean(dim=1)
    else:
        # masked mean: padded positions are excluded from sum and count
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against /0
        pooled = (token_embeddings * mask).sum(dim=1) / token_counts

    # embedding for the topic
    embedding = pooled.squeeze().cpu().numpy()
    return embedding

# one row per unique topic, together with the embedding of its label
df_topics = pd.DataFrame(topics, columns=['topic'])
topic_vectors = df_topics['topic'].apply(
    lambda label: topic_embedding(tokenizer, model, label)
)
df_topics['sentence embeddings'] = topic_vectors

In [None]:
# gather the per-article and per-topic embedding vectors into arrays
article_embeddings = np.array(list(df_cleaned['sentence embeddings']))
topic_embeddings = np.array(list(df_topics['sentence embeddings']))
# cosine similarity between every article (rows) and every topic (columns)
similarity_matrix = cosine_similarity(article_embeddings, topic_embeddings)
# for each article, the position of the topic with the highest similarity
most_similar_topics_indices = np.argmax(similarity_matrix, axis=1)

In [None]:
# assign to each article the topic picked by most_similar_topics_indices
df_cleaned['topic'] = df_topics['topic'].iloc[most_similar_topics_indices].values
# 1 when the assigned topic is among the article's own topics, otherwise 0
df_cleaned['flag topic in topics'] = df_cleaned.apply(
    lambda article: int(article['topic'] in article['topics']),
    axis=1,
)

In [None]:
# keep only the articles whose assigned topic is one of their own topics,
# restricted to the columns stored in original_columns
matches_own_topics = df_cleaned['flag topic in topics'].eq(1)
df_for_fine_tuning = df_cleaned.loc[matches_own_topics, original_columns]

In [None]:
# save the fine-tuning subset as a parquet file, without the row index
df_for_fine_tuning.to_parquet(
    '../data/processed/df_for_fine_tuning.parquet',
    engine='pyarrow',
    index=False,
)