In [None]:
!pip install pandas sentence-transformers spacy scikit-learn nltk torch
!python -m download it_core_news_sm

In [None]:
import pandas as pd  # Importing pandas for data manipulation
from sentence_transformers import SentenceTransformer, CrossEncoder, util  # Importing models from sentence-transformers for sentence embeddings and reranking
from sklearn.feature_extraction.text import TfidfVectorizer  # Importing TF-IDF vectorizer from scikit-learn for text feature extraction
from sklearn.metrics.pairwise import cosine_similarity  # Importing cosine similarity function to compute similarity between vectors
from nltk.tokenize import sent_tokenize  # Importing NLTK's sentence tokenizer to split text into sentences
from tabulate import tabulate  # Importing tabulate to display tables in a readable format
import nltk  # Importing NLTK for various NLP tasks
nltk.download('punkt')  # Downloading the 'punkt' tokenizer for sentence tokenization
nltk.download('punkt_tab')  # Downloading additional NLTK data related to tokenization
from nltk.tokenize import sent_tokenize  # Re-importing the sentence tokenizer from NLTK
import torch  # Importing PyTorch for tensor operations (used in transformer models)

# Load a pre-trained SentenceTransformer model for generating sentence embeddings.
# Created once at module level; ricerca_ibrida() calls model.encode() on it.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Load a pre-trained CrossEncoder model for sentence-level ranking tasks (reranking).
# riordina_risultati() calls reranker.predict() on [question, passage] pairs.
# NOTE(review): the embedding model above is named "multilingual", while this
# checkpoint is an ms-marco model — presumably English-centric; confirm it suits
# the language of the dataset.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [None]:
def split_text(text, window=4, overlap=2, language="english"):
    """Split ``text`` into overlapping chunks of consecutive sentences.

    Sentences are detected with NLTK's ``sent_tokenize`` and grouped with a
    sliding window: each chunk holds up to ``window`` sentences and shares
    ``overlap`` sentences with the previous chunk.

    Args:
        text: The text to split. Non-string values (``None``, NaN coming from
            a pandas cell) and blank strings produce no chunks.
        window: Number of sentences per chunk (must be >= 1).
        overlap: Number of sentences shared by consecutive chunks
            (must satisfy ``0 <= overlap < window``).
        language: Language name forwarded to ``sent_tokenize``. Defaults to
            ``"english"``, which is what ``sent_tokenize`` uses when no
            language is given, so existing callers are unaffected.
            NOTE(review): the surrounding notebook looks Italian-oriented;
            consider passing ``language="italian"`` — confirm against the data.

    Returns:
        A list of chunk strings (sentences joined by a single space). A text
        with at most ``window`` sentences is returned as one chunk.

    Raises:
        ValueError: If ``window``/``overlap`` are out of range (only checked
            for non-blank text, as before).
    """
    # Missing or blank input: nothing to chunk.
    if not isinstance(text, str) or not text.strip():
        return []

    if window < 1 or overlap < 0 or overlap >= window:
        raise ValueError("Invalid parameters: window > 0, 0 ≤ overlap < window")

    sentences = sent_tokenize(text, language=language)
    if not sentences:
        return []

    # Short texts fit in a single chunk.
    if len(sentences) <= window:
        return [" ".join(sentences)]

    # Slide the window forward by (window - overlap) sentences at a time and
    # stop as soon as a chunk reaches the last sentence, so no trailing chunk
    # made only of already-covered sentences is produced.
    step = window - overlap
    total = len(sentences)
    chunks = []
    for start in range(0, total, step):
        end = start + window
        chunks.append(" ".join(sentences[start:end]))
        if end >= total:
            break

    return chunks


# Hybrid search (embedding + tfidf)
def ricerca_ibrida(domanda, testo_completo, answer, top_k=5, weights=(0.5, 0.3, 0.2)):
    """Rank the chunks of ``testo_completo`` against a question and its answer.

    The text is chunked with ``split_text``; chunks are optionally pre-filtered
    by the answer's tokens and then scored with a weighted sum of three
    similarities: question embedding vs. chunk, answer embedding vs. chunk,
    and TF-IDF (uni- and bi-grams) question vs. chunk.

    Args:
        domanda: The question string.
        testo_completo: The full context to search in.
        answer: The reference answer. Non-string values (``None``, NaN) are
            treated as an empty answer.
        top_k: Maximum number of chunks to return.
        weights: ``(question_semantic, answer_semantic, tfidf)`` weights for
            the combined score. Defaults to the original 0.5 / 0.3 / 0.2.

    Returns:
        Up to ``top_k`` chunk strings, best combined score first. An empty
        list when the text yields no chunks.
    """
    # 1. Split the complete text into chunks; with nothing to rank, return
    #    early instead of handing empty inputs to the encoders / scikit-learn.
    chunks = split_text(testo_completo)
    if not chunks:
        return []

    # A missing answer (None / NaN from pandas) behaves like an empty one.
    if not isinstance(answer, str):
        answer = ""

    # 2. Filter chunks based on keywords from the answer. Tokens with no
    #    letter or digit (".", ",", "?") are dropped: they occur in almost
    #    every chunk and would make the filter meaningless.
    #    NOTE(review): matching is still by substring, so very short tokens
    #    match broadly — confirm whether whole-word matching is wanted.
    if answer.strip():
        answer_keywords = {
            token for token in nltk.word_tokenize(answer.lower())
            if any(char.isalnum() for char in token)
        }
    else:
        answer_keywords = set()

    filtered_chunks = chunks
    if answer_keywords:
        matching = []
        for chunk in chunks:
            lowered = chunk.lower()  # lower-case once per chunk, not once per keyword
            if any(keyword in lowered for keyword in answer_keywords):
                matching.append(chunk)
        # If the filter removes everything, fall back to all chunks.
        if matching:
            filtered_chunks = matching

    # 3. Calculate embeddings for the question, answer, and filtered chunks
    question_answer_embeddings = model.encode([domanda, answer], convert_to_tensor=True)
    chunk_embeddings = model.encode(filtered_chunks, convert_to_tensor=True)

    # 4-5. Semantic similarity of the question / the answer with each chunk
    sim_domanda = util.pytorch_cos_sim(question_answer_embeddings[0], chunk_embeddings)[0].cpu().numpy()
    sim_risposta = util.pytorch_cos_sim(question_answer_embeddings[1], chunk_embeddings)[0].cpu().numpy()

    # 6. TF-IDF similarity; the vectorizer is fitted on the question plus the
    #    chunks so both share one vocabulary.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2)).fit([domanda] + filtered_chunks)
    tfidf_domanda = vectorizer.transform([domanda])
    tfidf_chunks = vectorizer.transform(filtered_chunks)
    punteggi_tfidf = cosine_similarity(tfidf_domanda, tfidf_chunks)[0]

    # 7. Combine the scores with the configured weights
    peso_domanda, peso_risposta, peso_tfidf = weights
    punteggi_combinati = (
        peso_domanda * sim_domanda +
        peso_risposta * sim_risposta +
        peso_tfidf * punteggi_tfidf
    )

    # 8. Sort the chunks by combined score (descending) and keep the top_k
    risultati_ordinati = sorted(zip(filtered_chunks, punteggi_combinati),
                                key=lambda coppia: coppia[1], reverse=True)

    return [chunk for chunk, _ in risultati_ordinati[:top_k]]


# Final reordering function using a cross-encoder reranker
def riordina_risultati(domanda, risultati):
    """Reorder ``risultati`` by cross-encoder relevance to ``domanda``.

    Args:
        domanda: The question string.
        risultati: Candidate passages, e.g. the output of ``ricerca_ibrida``.

    Returns:
        A new list with the same passages, highest reranker score first.
        Passages with equal scores keep their incoming relative order (the
        sort compares scores only, never the passage text). An empty input
        yields an empty list without calling the reranker.
    """
    risultati = list(risultati)
    if not risultati:
        return []

    # One [question, passage] pair per candidate, scored in a single call.
    coppie = [[domanda, risultato] for risultato in risultati]
    punteggi = [float(punteggio) for punteggio in reranker.predict(coppie)]

    # Sort positions by score; Python's sort is stable even with reverse=True,
    # so ties preserve the order produced by the previous retrieval stage.
    ordine = sorted(range(len(risultati)), key=punteggi.__getitem__, reverse=True)

    return [risultati[indice] for indice in ordine]


In [None]:
# Creation of the new column 'Retrieval_Text'
def crea_retrieval_text(row, top_k=10, n_final=3):
    """Build the condensed "Question / Context" text for one dataset row.

    ``row['input_text']`` is expected to look like
    ``"Question: <question>\\nContext: <context>"``. The context is searched
    with ``ricerca_ibrida``, the hits are reranked with ``riordina_risultati``
    and the best ones are joined into a shorter context.

    Args:
        row: A mapping-like row (pandas Series or dict) with an ``input_text``
            entry and, optionally, an ``answer`` entry.
        top_k: Number of candidates requested from the hybrid search.
        n_final: Number of reranked candidates kept in the final context.

    Returns:
        ``"Question: <question> \\nContext: <joined passages>"``.

    Raises:
        ValueError: If ``input_text`` does not contain the context separator.
    """
    # 1. Split on the FIRST separator only, so a context that itself contains
    #    "\nContext: " stays intact instead of breaking the unpacking.
    question_part, separator, context = row['input_text'].partition("\nContext: ")
    if not separator:
        raise ValueError("input_text is missing the '\\nContext: ' separator")

    # Keep everything after the first "Question: " marker; if the marker is
    # absent the whole part is used as the question.
    question = question_part.split("Question: ", 1)[-1]

    # 2. Retrieve the answer if it exists; None / NaN become an empty string.
    answer = row.get('answer', '')
    if not isinstance(answer, str):
        answer = ''

    # 3. Hybrid search for candidate passages
    frasi_rilevanti = ricerca_ibrida(question, context, answer, top_k=top_k)

    # 4. Rerank the candidates and keep the best n_final
    frasi_riordinate = riordina_risultati(question, frasi_rilevanti)[:n_final]

    # 5. Format the final context string by combining the reordered passages
    final_context = f"Context: {' '.join(frasi_riordinate)}"

    # 6. Return the final formatted text (question and context)
    return f"Question: {question} \n{final_context}"

# Read the Q&A dataset from its Parquet file
PATH = "DB_QC_A_da_utilizzare.parquet"
Q_A_DataSet = pd.read_parquet(PATH)

# Build 'Retrieval_Text' one row at a time; crea_retrieval_text receives each row directly
Q_A_DataSet['Retrieval_Text'] = Q_A_DataSet.apply(crea_retrieval_text, axis=1)

# Preview the first 5 rows, new column included
print(tabulate(Q_A_DataSet.head(5), headers='keys', tablefmt='psql'))

# 'input_text' is superseded by 'Retrieval_Text', so drop it and preview again (index hidden)
Q_A_DataSet = Q_A_DataSet.drop(columns=['input_text'])
print(tabulate(Q_A_DataSet.head(5), headers='keys', tablefmt='psql', showindex=False))

# Persist the remaining columns to a new Parquet file, without the row index
Q_A_DataSet.to_parquet("DB_QC_A_retrieval.parquet", index=False)
