# Assignment II: 20597 - Natural Language Processing
## Giacomo Negri, 3155287

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

In [3]:
def load_data(dev_path='dev_responses.csv', train_path='train_responses.csv'):
    """Load the dev and train response tables and stack them into one frame.

    Parameters
    ----------
    dev_path : str or path-like, default 'dev_responses.csv'
        CSV file read first; its rows come first in the result.
    train_path : str or path-like, default 'train_responses.csv'
        CSV file read second; its rows are appended after the dev rows.

    Returns
    -------
    pandas.DataFrame
        Rows of ``dev_path`` followed by rows of ``train_path``.
    """
    dev_responses = pd.read_csv(dev_path)
    train_responses = pd.read_csv(train_path)
    # ignore_index=True gives the result a fresh 0..n-1 index, so row labels
    # and row positions coincide in the combined frame.
    return pd.concat([dev_responses, train_responses], ignore_index=True)

def preprocess_text(text):
    """Lowercase a string, trim its ends, and delete ASCII punctuation.

    The surrounding whitespace is stripped *before* punctuation is removed,
    so whitespace that sat next to a leading/trailing punctuation mark is
    kept (e.g. ``"Hi !"`` becomes ``"hi "``).

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # Translation table that maps every character of string.punctuation to None.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    trimmed = lowered.strip()
    return trimmed.translate(drop_punctuation)

## Track I
This code uses TF-IDF vectorization and cosine similarity. After loading, the data are lightly preprocessed: leading and trailing whitespace is stripped, punctuation is removed, and capitalized words are lowercased. A TfidfVectorizer from sklearn is then employed. Different n-gram ranges are used for words and characters, and the two vectorizers are combined using FeatureUnion. The parameters (ngram_range, max_df, min_df, sublinear_tf) were tuned by trying multiple combinations and selecting the one that was most robust across different seeds with respect to the BLEU score. The model transforms the processed test prompts into TF-IDF vectors and computes cosine similarity to find, for each test prompt, the most similar training prompt, whose response is then retrieved.

In [4]:
# --- Track I: nearest training prompt under word + character TF-IDF ---

# Labelled prompts (dev + train) and the unlabelled test prompts.
combined_data = load_data()
test_prompts = pd.read_csv('test_prompts.csv')

# Coerce the text columns to str: a missing value is read by pandas as a float
# NaN, and preprocess_text calls str methods on its input. The test prompts
# now get the same treatment as the training prompts (NaN becomes 'nan').
combined_data['user_prompt'] = combined_data['user_prompt'].astype(str)
combined_data['model_response'] = combined_data['model_response'].astype(str)
test_prompts['user_prompt'] = test_prompts['user_prompt'].astype(str)

combined_data['processed_prompt'] = combined_data['user_prompt'].apply(preprocess_text)
test_prompts['processed_prompt'] = test_prompts['user_prompt'].apply(preprocess_text)

# TF-IDF settings shared by the word-level and character-level vectorizers.
ngram_range_w = (1,2)  # word unigrams and bigrams
ngram_range_c = (2,4)  # character 2-, 3- and 4-grams
max_df = 0.80          # ignore terms present in more than 80% of the prompts
min_df = 1             # keep terms present in at least one prompt

word_vectorizer = TfidfVectorizer(ngram_range=ngram_range_w, max_df=max_df, min_df=min_df, analyzer='word', sublinear_tf=True)
char_vectorizer = TfidfVectorizer(ngram_range=ngram_range_c, max_df=max_df, min_df=min_df, analyzer='char', sublinear_tf=True)
# FeatureUnion concatenates the two feature spaces side by side.
vectorizer = FeatureUnion([("word_tfidf", word_vectorizer), ("char_tfidf", char_vectorizer)])

# Vocabulary and IDF weights are fitted on the training prompts only.
tfidf_train = vectorizer.fit_transform(combined_data['processed_prompt'])
tfidf_test = vectorizer.transform(test_prompts['processed_prompt'])

# similarities[i, j] = cosine similarity of test prompt i and training prompt j.
similarities = cosine_similarity(tfidf_test, tfidf_train)
top_indices = np.argmax(similarities, axis=1)

# top_indices are row positions in combined_data, hence iloc.
retrieved_responses = combined_data.iloc[top_indices]['conversation_id'].values

answers=pd.DataFrame({})
answers['conversation_id']=test_prompts['conversation_id']
answers['response_id']=retrieved_responses

answers.reset_index(drop=True, inplace=True)
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if the expected submission has only the two id columns.
answers.to_csv('track_1_test.csv')

## Track II

This code uses word embeddings and cosine similarity. After loading, the data are lightly preprocessed, as in the previous track. The pre-trained word2vec-google-news-300 model is then loaded, and a function is defined to compute sentence embeddings by averaging word vectors. This model was selected for its performance with respect to the BLEU score. The processed test prompts are converted into embeddings, and cosine similarity against the training prompt embeddings is computed to find the closest match. The most similar training prompt is then selected.

In [7]:
# --- Track II setup: data, preprocessing and the pre-trained word vectors ---

# Labelled prompts (dev + train) and the unlabelled test prompts.
combined_data = load_data()
test_prompts = pd.read_csv('test_prompts.csv')

# Coerce the text columns to str: a missing value is read by pandas as a float
# NaN, and preprocess_text calls str methods on its input. The test prompts
# now get the same treatment as the training prompts (NaN becomes 'nan').
combined_data['user_prompt'] = combined_data['user_prompt'].astype(str)
combined_data['model_response'] = combined_data['model_response'].astype(str)
test_prompts['user_prompt'] = test_prompts['user_prompt'].astype(str)

combined_data['processed_prompt'] = combined_data['user_prompt'].apply(preprocess_text)
test_prompts['processed_prompt'] = test_prompts['user_prompt'].apply(preprocess_text)

# Pre-trained word vectors fetched through gensim's downloader.
text_model = api.load("word2vec-google-news-300")

def get_embedding(prompt, model):
    """Embed a prompt as the mean of the vectors of its known tokens.

    Parameters
    ----------
    prompt : str
        Text to embed; it is split on whitespace.
    model : object
        Word-vector model exposing ``key_to_index`` (membership test),
        ``model[token]`` (vector lookup) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in the model,
        or a zero vector of length ``model.vector_size`` when no token of
        the prompt is in the model's vocabulary.
    """
    known_vectors = []
    for token in prompt.split():
        # Out-of-vocabulary tokens are skipped.
        if token in model.key_to_index:
            known_vectors.append(model[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def _embed_column(prompts):
    """Return one get_embedding row per prompt, stacked into a 2-D array."""
    return np.vstack([get_embedding(text, text_model) for text in prompts])

# Averaged word-vector embeddings for the training and the test prompts.
train_embeddings = _embed_column(combined_data['processed_prompt'])
test_embeddings = _embed_column(test_prompts['processed_prompt'])

# similarities[i, j] = cosine similarity of test prompt i and training prompt j.
similarities = cosine_similarity(test_embeddings, train_embeddings)

# Row position (in combined_data) of the best match for each test prompt.
top_indices = similarities.argmax(axis=1)
retrieved_responses = combined_data['conversation_id'].iloc[top_indices].values

# One row per test prompt: its own id and the id of the retrieved conversation.
answers = pd.DataFrame({
    'conversation_id': test_prompts['conversation_id'],
    'response_id': retrieved_responses,
})
answers = answers.reset_index(drop=True)
answers.to_csv('track_2_test.csv')

## Track III

This code combines semantic and lexical similarities. After loading and preprocessing the data, user prompts are encoded using the pre-trained BERT-style (MPNet) model 'all-mpnet-base-v2' (SentenceTransformer) to generate semantic embeddings. Although slower, this model provides higher-quality results than, for instance, 'all-MiniLM-L12-v2'. Lexical similarities are computed using TF-IDF vectorization with word and character n-grams, as in Track I. The semantic and lexical similarities are combined with a weighted average (alpha=0.6 on the semantic score, 1-alpha on the lexical score), with alpha chosen for its optimal BLEU score. This hybrid approach improves the accuracy of identifying the most relevant response by leveraging both semantic and lexical features.

In [8]:
# --- Track III: weighted blend of sentence-embedding and TF-IDF similarity ---

# Labelled prompts (dev + train) and the unlabelled test prompts.
combined_data = load_data()
test_prompts = pd.read_csv('test_prompts.csv')

# Coerce the text columns to str: a missing value is read by pandas as a float
# NaN, and preprocess_text calls str methods on its input. The test prompts
# now get the same treatment as the training prompts (NaN becomes 'nan').
combined_data['user_prompt'] = combined_data['user_prompt'].astype(str)
combined_data['model_response'] = combined_data['model_response'].astype(str)
test_prompts['user_prompt'] = test_prompts['user_prompt'].astype(str)

combined_data['processed_prompt'] = combined_data['user_prompt'].apply(preprocess_text)
test_prompts['processed_prompt'] = test_prompts['user_prompt'].apply(preprocess_text)

# Semantic side: sentence embeddings from a pre-trained SentenceTransformer.
bert_model = SentenceTransformer('all-mpnet-base-v2')

train_embeddings = bert_model.encode(combined_data['processed_prompt'].tolist(), convert_to_tensor=True)
test_embeddings = bert_model.encode(test_prompts['processed_prompt'].tolist(), convert_to_tensor=True)

# Tensors are moved to CPU and converted to numpy for sklearn.
# similarities[i, j] = semantic similarity of test prompt i and training prompt j.
similarities = cosine_similarity(test_embeddings.cpu().numpy(), train_embeddings.cpu().numpy())

# Lexical side: word + character TF-IDF, same settings as Track I.
ngram_range_w = (1, 2)  # word unigrams and bigrams
ngram_range_c = (2, 4)  # character 2-, 3- and 4-grams
max_df = 0.80           # ignore terms present in more than 80% of the prompts
min_df = 1              # keep terms present in at least one prompt

word_vectorizer = TfidfVectorizer(ngram_range=ngram_range_w, max_df=max_df, min_df=min_df, analyzer='word', sublinear_tf=True)
char_vectorizer = TfidfVectorizer(ngram_range=ngram_range_c, max_df=max_df, min_df=min_df, analyzer='char', sublinear_tf=True)
# FeatureUnion concatenates the two feature spaces side by side.
vectorizer = FeatureUnion([("word_tfidf", word_vectorizer), ("char_tfidf", char_vectorizer)])

# Vocabulary and IDF weights are fitted on the training prompts only.
train_tfidf = vectorizer.fit_transform(combined_data['processed_prompt'])
test_tfidf = vectorizer.transform(test_prompts['processed_prompt'])
lexical_similarities = cosine_similarity(test_tfidf, train_tfidf)

# alpha weights the semantic score; (1 - alpha) weights the lexical score.
alpha=0.6
combined_similarities = alpha * similarities + (1 - alpha) * lexical_similarities
top_indices = np.argmax(combined_similarities, axis=1)
# top_indices are row positions in combined_data, hence iloc.
retrieved_responses = combined_data.iloc[top_indices]['conversation_id'].values

answers=pd.DataFrame({})
answers['conversation_id']=test_prompts['conversation_id']
answers['response_id']=retrieved_responses

answers.reset_index(drop=True, inplace=True)
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if the expected submission has only the two id columns.
answers.to_csv('track_3_test.csv')