# DEVELOPMENT

## Track I

In [42]:
import pandas as pd
import numpy as np
import re
import string
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion

def load_data():
    """Read the dev and train response CSVs and stack them into one frame.

    Both files are looked up in the current working directory. Dev rows come
    first and the combined frame is given a fresh 0..n-1 index.
    """
    csv_names = ('dev_responses.csv', 'train_responses.csv')
    frames = [pd.read_csv(name) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

def preprocess_text(text):
    """Normalise a prompt: lowercase it, trim both ends, drop ASCII punctuation.

    Digits and internal whitespace are left exactly as they are.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    trimmed = text.lower().strip()
    return trimmed.translate(punctuation_remover)

# Load everything, force the two text columns to str, and add the cleaned prompt column.
original_data = load_data().astype({'user_prompt': str, 'model_response': str})
original_data['processed_prompt'] = original_data['user_prompt'].map(preprocess_text)

# Fixed-seed 80/20 split of the combined data.
X_train, X_test = train_test_split(original_data, random_state=42, test_size=0.2)

def compute_tfidf(X_train, X_test, ngram_range_word, ngram_range_char, max_df, min_df):
    """Fit a combined word + character TF-IDF model on the train prompts.

    Args:
        X_train, X_test: frames with a 'processed_prompt' column.
        ngram_range_word: n-gram range for the word-level vectorizer.
        ngram_range_char: n-gram range for the character-level vectorizer.
        max_df, min_df: document-frequency cut-offs, applied to both vectorizers.

    Returns:
        (vectorizer, tfidf_train, tfidf_test): the fitted FeatureUnion, the
        matrix it produced for the train prompts, and the test prompts
        transformed with that same fitted union.
    """
    # Options common to both vectorizers; only analyzer and n-gram range differ.
    shared_options = dict(max_df=max_df, min_df=min_df, sublinear_tf=True)
    vectorizer = FeatureUnion([
        ("word_tfidf", TfidfVectorizer(ngram_range=ngram_range_word, analyzer='word', **shared_options)),
        ("char_tfidf", TfidfVectorizer(ngram_range=ngram_range_char, analyzer='char', **shared_options)),
    ])

    train_matrix = vectorizer.fit_transform(X_train['processed_prompt'])
    test_matrix = vectorizer.transform(X_test['processed_prompt'])
    return vectorizer, train_matrix, test_matrix

def find_best_responses(tfidf_train, tfidf_test, X_train):
    """Return, for each test row, the response of its most similar train prompt.

    Similarity is the cosine between the test and train TF-IDF rows; the
    winning column position is used to pick the 'model_response' from X_train
    by position. The result is an array with one response per test row.
    """
    score_matrix = cosine_similarity(tfidf_test, tfidf_train)
    nearest_positions = score_matrix.argmax(axis=1)
    return X_train['model_response'].iloc[nearest_positions].values

# Smoothing shared by every BLEU call in this cell.
smoothing_function = SmoothingFunction().method3

def compute_bleu(ref_text, hyp_text):
    """Smoothed sentence BLEU of hyp_text against the single reference ref_text.

    Both strings are tokenised on whitespace; unigram and bigram precision
    are weighted 0.5 each, higher orders get weight 0.
    """
    reference_tokens = ref_text.split()
    hypothesis_tokens = hyp_text.split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothing_function,
    )

# Hyper-parameter grid for the TF-IDF retriever. Every combination of the four
# lists is evaluated, including each entry of ngram_ranges_char.
ngram_ranges_word = [(1,2)]
ngram_ranges_char = [(2,4)]
max_df_values = [0.8]
min_df_values = [1]

param_grid = [
    (word_range, char_range, max_df_value, min_df_value)
    for word_range in ngram_ranges_word
    for char_range in ngram_ranges_char
    for max_df_value in max_df_values
    for min_df_value in min_df_values
]

# Start below any attainable score so best_params is filled in even when every
# configuration averages a BLEU of exactly 0.
best_score = float('-inf')
best_params = None

# Copy once: the retrieval and score columns are overwritten on this frame each iteration.
X_test = X_test.copy()

for ngram_range, char_range, max_df, min_df in param_grid:
    vectorizer, tfidf_train, tfidf_test = compute_tfidf(X_train, X_test, ngram_range, char_range, max_df, min_df)

    X_test['retrieved_response'] = find_best_responses(tfidf_train, tfidf_test, X_train)

    # Score each retrieved response against the true response of the same test row.
    X_test['bleu_score'] = X_test.apply(lambda row: compute_bleu(row['model_response'], row['retrieved_response']), axis=1)
    avg_bleu = X_test['bleu_score'].mean()
    print(f'BLEU (w_gram: {ngram_range}, c_gram: {char_range}, max_df={max_df}, min_df={min_df}): {avg_bleu}')

    if avg_bleu > best_score:
        best_score = avg_bleu
        best_params = (ngram_range, char_range, max_df, min_df)

# Note: X_test now holds the columns of the last configuration tried, not necessarily the best.
print(f'Best Params: {best_params} with BLEU Score: {best_score}')

BLEU (w_gram: (1, 2), max_df=0.8, min_df=1): 0.08962108211013144
Best Params: ((1, 2), 0.8, 1) with BLEU Score: 0.08962108211013144


## Track II

In [51]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.model_selection import train_test_split
import gensim.downloader as api

# Load Data
def load_data():
    """Load the dev and train response CSVs and return them as one DataFrame.

    Both files are read from the current working directory and stacked with
    the dev rows first.

    Returns:
        pandas.DataFrame: the combined rows, re-indexed 0..n-1. Without
        ``ignore_index=True`` each file's own 0-based labels would be kept,
        leaving duplicate index labels in the result.
    """
    dev_responses = pd.read_csv('dev_responses.csv')
    train_responses = pd.read_csv('train_responses.csv')
    return pd.concat([dev_responses, train_responses], ignore_index=True)

def preprocess_text(text):
    """Lowercase and trim ``text``, then delete every character in ``string.punctuation``.

    Digits and whitespace inside the text are not altered.
    """
    drop_table = {ord(mark): None for mark in string.punctuation}
    cleaned = text.lower()
    cleaned = cleaned.strip()
    return cleaned.translate(drop_table)

original_data = load_data()

# Cast the text columns to str BEFORE preprocessing: pandas reads an empty CSV
# field as float NaN, and preprocess_text calls str methods on its argument.
original_data['user_prompt'] = original_data['user_prompt'].astype(str)
original_data['model_response'] = original_data['model_response'].astype(str)
original_data['preprocess_prompt'] = original_data['user_prompt'].apply(preprocess_text)

# Fixed-seed 80/20 split. Each part is copied so that columns assigned to
# X_train / X_test afterwards are written to independent frames rather than to
# slices of original_data (avoids SettingWithCopyWarning).
X_train, X_test = (
    part.copy()
    for part in train_test_split(original_data, test_size=0.2, random_state=21)
)

# Word-vector model fetched by name through gensim.downloader. get_embedding
# below reads key_to_index, vector_size and per-token lookups from this object.
text_model = api.load("word2vec-google-news-300")
# text_model = api.load("fasttext-wiki-news-subwords-300")

# Get embedding for a single prompt
def get_embedding(prompt, model):
    """Average the word vectors of the tokens in ``prompt`` that ``model`` knows.

    The prompt is split on whitespace and tokens missing from
    ``model.key_to_index`` are skipped. If no token is found, a zero vector of
    length ``model.vector_size`` is returned instead.
    """
    known_tokens = model.key_to_index
    vectors = [model[word] for word in prompt.split() if word in known_tokens]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# One embedding row per prompt, built with get_embedding on the cleaned prompt text.
X_train_embeddings = np.stack([get_embedding(prompt, text_model) for prompt in X_train['preprocess_prompt']])
X_test_embeddings = np.stack([get_embedding(prompt, text_model) for prompt in X_test['preprocess_prompt']])

# Cosine score of every test prompt (rows) against every train prompt (columns).
similarities = cosine_similarity(X_test_embeddings, X_train_embeddings)

# Position of the best-scoring train prompt per test prompt, then that row's response.
top_indices = similarities.argmax(axis=1)
retrieved_responses = X_train['model_response'].iloc[top_indices].values

X_test['retrieved_response'] = retrieved_responses

# Smoothing method reused by compute_bleu.
smoothing_function = SmoothingFunction().method3

def compute_bleu(ref_text, hyp_text):
    """Score ``hyp_text`` against ``ref_text`` with smoothed sentence-level BLEU.

    Inputs are split on whitespace. Only 1-gram and 2-gram precision carry
    weight (0.5 each); the 3-gram and 4-gram weights are 0.
    """
    references = [ref_text.split()]
    hypothesis = hyp_text.split()
    ngram_weights = (0.5, 0.5, 0, 0)
    return sentence_bleu(references, hypothesis,
                         weights=ngram_weights,
                         smoothing_function=smoothing_function)

# BLEU of each retrieved response against the true response of the same test row.
X_test['bleu_score'] = [
    compute_bleu(reference, retrieved)
    for reference, retrieved in zip(X_test['model_response'], X_test['retrieved_response'])
]

# Mean over the test split.
average_bleu = X_test['bleu_score'].mean()
print(f'Average BLEU Score: {average_bleu}')

Average BLEU Score: 0.0892002592467896


## Track III

In [38]:
import pandas as pd
import numpy as np
import re
import string
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

# Load Data
def load_data():
    """Load the dev and train response CSVs and return them as one DataFrame.

    Both files are read from the current working directory and stacked with
    the dev rows first.

    Returns:
        pandas.DataFrame: the combined rows, re-indexed 0..n-1. Without
        ``ignore_index=True`` each file's own 0-based labels would be kept,
        leaving duplicate index labels in the result.
    """
    dev_responses = pd.read_csv('dev_responses.csv')
    train_responses = pd.read_csv('train_responses.csv')
    return pd.concat([dev_responses, train_responses], ignore_index=True)

# Preprocess Text: lowercase, strip, remove punctuation, and collapse whitespace runs (digits are kept)
def preprocess_text(text):
    """Lowercase, trim, strip ASCII punctuation and collapse whitespace runs.

    Every run of whitespace becomes a single space and the result has no
    leading or trailing whitespace. Digits are kept.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    without_punctuation = text.lower().strip().translate(punctuation_remover)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Load data and preprocess prompts
original_data = load_data()

# Ensure the text columns are strings BEFORE preprocessing: pandas reads an
# empty CSV field as float NaN, and preprocess_text calls str methods on its argument.
original_data['user_prompt'] = original_data['user_prompt'].astype(str)
original_data['model_response'] = original_data['model_response'].astype(str)
original_data['preprocess_prompt'] = original_data['user_prompt'].apply(preprocess_text)

# Split into train and test sets (80% train, 20% test). Each part is copied so
# that columns assigned to X_train / X_test afterwards are written to
# independent frames rather than to slices of original_data
# (avoids SettingWithCopyWarning).
X_train, X_test = (
    part.copy()
    for part in train_test_split(original_data, test_size=0.2, random_state=100)
)

# Load the SentenceTransformer model used for the semantic embeddings
# bert_model = SentenceTransformer('all-MiniLM-L12-v2')
bert_model = SentenceTransformer('all-mpnet-base-v2')

# Compute sentence embeddings for the train and the test prompts (requested as tensors)
X_train_embeddings = bert_model.encode(X_train['preprocess_prompt'].tolist(), convert_to_tensor=True)
X_test_embeddings = bert_model.encode(X_test['preprocess_prompt'].tolist(), convert_to_tensor=True)

# Compute cosine similarities between test and train embeddings (semantic similarity).
# The tensors are moved to CPU and converted to NumPy first; test embeddings are the
# first argument and train embeddings the second.
similarities = cosine_similarity(X_test_embeddings.cpu().numpy(), X_train_embeddings.cpu().numpy())

# Compute lexical similarities using TF-IDF
ngram_range_w = (1, 2)  # n-gram range given to the analyzer='word' vectorizer
ngram_range_c = (2, 4)  # n-gram range given to the analyzer='char' vectorizer
max_df = 0.9  # document-frequency cut-offs, shared by both vectorizers
min_df = 1

word_vectorizer = TfidfVectorizer(ngram_range=ngram_range_w, max_df=max_df, min_df=min_df, analyzer='word', sublinear_tf=True)
char_vectorizer = TfidfVectorizer(ngram_range=ngram_range_c, max_df=max_df, min_df=min_df, analyzer='char', sublinear_tf=True)
# Word-level and character-level TF-IDF features combined through one FeatureUnion
tfidf = FeatureUnion([("word_tfidf", word_vectorizer), ("char_tfidf", char_vectorizer)])

# tfidf = TfidfVectorizer()
# Fit on the train prompts only; the test prompts are transformed with the fitted union
X_train_tfidf = tfidf.fit_transform(X_train['preprocess_prompt'])
X_test_tfidf = tfidf.transform(X_test['preprocess_prompt'])
# Same argument order as `similarities` (test first, train second), so the two can be blended element-wise
lexical_similarities = cosine_similarity(X_test_tfidf, X_train_tfidf)

# BLEU scoring helpers. They do not depend on alpha, so they are defined once
# here rather than being rebuilt on every pass of the sweep.
smoothing_function = SmoothingFunction().method3

def compute_bleu(ref_text, hyp_text):
    """Smoothed sentence-level BLEU of hyp_text against the single reference ref_text.

    Both strings are tokenised on whitespace; unigram and bigram precision are
    weighted 0.5 each, higher orders get weight 0.
    """
    return sentence_bleu([ref_text.split()], hyp_text.split(),
                         weights=(0.5, 0.5, 0, 0),
                         smoothing_function=smoothing_function)

# Combine lexical and semantic similarities, sweeping alpha over 0.1, 0.2, ..., 1.0
for i in range(1, 11):
    alpha = i/10  # Weight for semantic similarity; the lexical score gets 1 - alpha

    combined_similarities = alpha * similarities + (1 - alpha) * lexical_similarities

    # Find the position of the most similar train prompt for each test prompt
    top_indices = np.argmax(combined_similarities, axis=1)

    # Retrieve the corresponding responses from the train set
    X_test['retrieved_response'] = X_train.iloc[top_indices]['model_response'].values

    # BLEU of each retrieved response against the true response of the same row
    X_test['bleu_score'] = X_test.apply(lambda row: compute_bleu(row['model_response'], row['retrieved_response']), axis=1)

    # Average BLEU over the test split for this alpha
    average_bleu = X_test['bleu_score'].mean()
    print(f'Average BLEU for alpha={alpha} is: {average_bleu}')

# Note: after the loop X_test holds the retrievals and scores of the last alpha (1.0).

Average BLEU for alpha=0.1 is: 0.09765004822105479
Average BLEU for alpha=0.2 is: 0.10226067224053291
Average BLEU for alpha=0.3 is: 0.1062946072944321
Average BLEU for alpha=0.4 is: 0.1083045350373584
Average BLEU for alpha=0.5 is: 0.10973773194184341
Average BLEU for alpha=0.6 is: 0.11079854422692353
Average BLEU for alpha=0.7 is: 0.1101675598143283
Average BLEU for alpha=0.8 is: 0.11022807233311652
Average BLEU for alpha=0.9 is: 0.11005694972271751
Average BLEU for alpha=1.0 is: 0.10856402947082004


## TRIAL II

In [52]:
import pandas as pd
import numpy as np
import re
import string
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

def load_data():
    """Load the dev and train response CSVs and return them as one DataFrame.

    Both files are read from the current working directory and stacked with
    the dev rows first.

    Returns:
        pandas.DataFrame: the combined rows, re-indexed 0..n-1. Without
        ``ignore_index=True`` each file's own 0-based labels would be kept,
        leaving duplicate index labels in the result.
    """
    dev_responses = pd.read_csv('dev_responses.csv')
    train_responses = pd.read_csv('train_responses.csv')
    return pd.concat([dev_responses, train_responses], ignore_index=True)

def preprocess_text(text):
    """Return ``text`` lowercased, trimmed at both ends and without ASCII punctuation.

    Whitespace inside the text is not normalised and digits are kept.
    """
    lowered = text.lower().strip()
    return ''.join(char for char in lowered if char not in string.punctuation)

original_data = load_data()

# Cast the text columns to str BEFORE preprocessing: pandas reads an empty CSV
# field as float NaN, and preprocess_text calls str methods on its argument.
original_data['user_prompt'] = original_data['user_prompt'].astype(str)
original_data['model_response'] = original_data['model_response'].astype(str)
original_data['preprocess_prompt'] = original_data['user_prompt'].apply(preprocess_text)

# Fixed-seed 80/20 split. Each part is copied so that columns assigned to
# X_train / X_test afterwards are written to independent frames rather than to
# slices of original_data (avoids SettingWithCopyWarning).
X_train, X_test = (
    part.copy()
    for part in train_test_split(original_data, test_size=0.2, random_state=100)
)

# SentenceTransformer model used for the semantic embeddings
# bert_model = SentenceTransformer('all-MiniLM-L12-v2')
bert_model = SentenceTransformer('all-mpnet-base-v2')

# Sentence embeddings for the train and the test prompts (requested as tensors)
X_train_embeddings = bert_model.encode(X_train['preprocess_prompt'].tolist(), convert_to_tensor=True)
X_test_embeddings = bert_model.encode(X_test['preprocess_prompt'].tolist(), convert_to_tensor=True)

# Semantic similarity: cosine between test embeddings (first argument) and train
# embeddings (second argument), after moving the tensors to CPU and converting to NumPy
similarities = cosine_similarity(X_test_embeddings.cpu().numpy(), X_train_embeddings.cpu().numpy())

# Lexical similarity: word-level + character-level TF-IDF over the cleaned prompts
ngram_range_w = (1, 2)  # n-gram range given to the analyzer='word' vectorizer
ngram_range_c = (2, 4)  # n-gram range given to the analyzer='char' vectorizer
max_df = 0.8  # document-frequency cut-offs, shared by both vectorizers
min_df = 1

word_vectorizer = TfidfVectorizer(ngram_range=ngram_range_w, max_df=max_df, min_df=min_df, analyzer='word', sublinear_tf=True)
char_vectorizer = TfidfVectorizer(ngram_range=ngram_range_c, max_df=max_df, min_df=min_df, analyzer='char', sublinear_tf=True)
# Both vectorizers combined through one FeatureUnion
tfidf = FeatureUnion([("word_tfidf", word_vectorizer), ("char_tfidf", char_vectorizer)])

# Fit on the train prompts only; the test prompts are transformed with the fitted union
X_train_tfidf = tfidf.fit_transform(X_train['preprocess_prompt'])
X_test_tfidf = tfidf.transform(X_test['preprocess_prompt'])
# Same argument order as `similarities` (test first, train second), so the two can be blended element-wise
lexical_similarities = cosine_similarity(X_test_tfidf, X_train_tfidf)

# BLEU scoring helpers. They do not depend on alpha, so they are defined once
# here rather than being rebuilt on every pass of the sweep.
smoothing_function = SmoothingFunction().method3

def compute_bleu(ref_text, hyp_text):
    """Smoothed sentence-level BLEU of hyp_text against the single reference ref_text.

    Both strings are tokenised on whitespace; unigram and bigram precision are
    weighted 0.5 each, higher orders get weight 0.
    """
    return sentence_bleu([ref_text.split()], hyp_text.split(),
                         weights=(0.5, 0.5, 0, 0),
                         smoothing_function=smoothing_function)

# Blend semantic and lexical similarity, sweeping alpha over 0.1, 0.2, ..., 1.0
for i in range(1, 11):
    alpha = i/10  # weight of the semantic score; the lexical score gets 1 - alpha

    combined_similarities = alpha * similarities + (1 - alpha) * lexical_similarities

    # Position of the best-scoring train prompt for each test prompt
    top_indices = np.argmax(combined_similarities, axis=1)

    X_test['retrieved_response'] = X_train.iloc[top_indices]['model_response'].values

    # BLEU of each retrieved response against the true response of the same row
    X_test['bleu_score'] = X_test.apply(lambda row: compute_bleu(row['model_response'], row['retrieved_response']), axis=1)

    average_bleu = X_test['bleu_score'].mean()
    print(f'Average BLEU for alpha={alpha} is: {average_bleu}')

# Note: after the loop X_test holds the retrievals and scores of the last alpha (1.0).

Average BLEU for alpha=0.1 is: 0.09763591275616758
Average BLEU for alpha=0.2 is: 0.10228284042124845
Average BLEU for alpha=0.3 is: 0.10635380183359866
Average BLEU for alpha=0.4 is: 0.10819860105729108
Average BLEU for alpha=0.5 is: 0.10971284244252758
Average BLEU for alpha=0.6 is: 0.110796625131974
Average BLEU for alpha=0.7 is: 0.1101675598143283
Average BLEU for alpha=0.8 is: 0.11022807233311652
Average BLEU for alpha=0.9 is: 0.11005694972271751
Average BLEU for alpha=1.0 is: 0.10856402947082004


In [45]:
# !pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [21]:
# !pip uninstall torch transformers sentence-transformers -y

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: transformers 4.50.0
Uninstalling transformers-4.50.0:
  Successfully uninstalled transformers-4.50.0
Found existing installation: sentence-transformers 3.4.1
Uninstalling sentence-transformers-3.4.1:
  Successfully uninstalled sentence-transformers-3.4.1


You can safely remove it manually.


In [23]:
# !pip cache purge

Files removed: 0




In [24]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl.metadata (28 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl (206.5 MB)
   ---------------------------------------- 0.0/206.5 MB ? eta -:--:--
   ---------------------------------------- 0.8/206.5 MB 4.8 MB/s eta 0:00:44
    --------------------------------------- 2.6/206.5 MB 6.9 MB/s eta 0:00:30
    --------------------------------------- 3.7/206.5 MB 6.2 MB/s eta 0:00:33
    --------------------------------------- 5.0/206.5 MB 6.4 MB/s eta 0:00:32
   - -------------------------------------- 6.6/206.5 MB 6.8 MB/s eta 0:00:30
   - -------------------------------------- 8.1/206.5 MB 6.9 MB/s eta 0:00:29
   - -------------------------------------- 9.2/206.5 MB 7.0 MB/s eta 0:00:29
   -- ------------------------------------- 11.3/206.5 MB 6.8 MB/s eta 0:00:29
   -- --------

In [25]:
# !pip install transformers sentence-transformers

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
   ---------------------------------------- 0.0/10.2 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.2 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/10.2 MB 3.9 MB/s eta 0:00:03
   ------- -------------------------------- 1.8/10.2 MB 4.0 MB/s eta 0:00:03
   ---------- ----------------------------- 2.6/10.2 MB 4.0 MB/s eta 0:00:02
   ------------- -------------------------- 3.4/10.2 MB 3.9 MB/s eta 0:00:02
   -------------------- ------------------- 5.2/10.2 MB 4.7 MB/s eta 0:00:02
   ----------------------- ---------------- 6.0/10.2 MB 4.7 MB/s eta 0:00:01
   --------------------------- ------------ 7.1/10.2 MB 4.7 MB/s eta 0:00:01
   ------------------------------------- -- 9.4/10.2 MB 5.4 M