In [52]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import pearsonr

In [53]:
# Load the retrieval dataset; later cells read its 'text' and 'question' columns.
vi_data_df = pd.read_csv('vi_text_retrieval.csv')

# Lowercased copy of every document. Built in one step so `context` is always
# a plain list of strings rather than a Series that is later rebound to a list.
context = [doc.lower() for doc in vi_data_df['text']]

# Fit the vocabulary / IDF weights on the corpus and keep the sparse
# document-term matrix for inspection.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(context)

# Peek at one weight (document 7, vocabulary column 0). Index the sparse matrix
# directly: calling .toarray() would materialise the full dense
# (n_docs x vocabulary) array just to read a single cell.
tfidf[7, 0]

0.31126580760710637

In [54]:
def tfidf_search(question, tfidf_vectorizer, top_docs=5, documents=None):
    """Rank documents against a question by TF-IDF cosine similarity.

    Parameters
    ----------
    question : str
        Query text; it is lowercased before being vectorised.
    tfidf_vectorizer : object
        Vectorizer exposing ``fit_transform`` and ``transform``. It is
        re-fitted on the documents on every call, so any previous fit held
        by the passed-in object is replaced.
    top_docs : int, default 5
        Maximum number of hits to return. Values <= 0 yield an empty list.
    documents : iterable of str, optional
        Corpus to search. Defaults to ``vi_data_df['text']`` so existing
        three-argument calls keep working.

    Returns
    -------
    list of dict
        Best match first. Each entry has ``'id'`` (row position of the
        document) and ``'cosine_similarity'`` (the one-element score row
        taken from the similarity matrix for that document).
    """
    if documents is None:
        documents = vi_data_df['text']
    context = [doc.lower() for doc in documents]

    # Fit on the corpus, then project the question into the same vocabulary.
    tfidf = tfidf_vectorizer.fit_transform(context)
    question_tfidf = tfidf_vectorizer.transform([question.lower()])

    # One row per document, one column for the single question.
    cosine_scores = cosine_similarity(tfidf, question_tfidf)

    # The scores form a column, so they must be flattened before sorting:
    # argsort along the last axis of an (n_docs, 1) array returns all zeros,
    # and indexing that with [0] then always selects document 0 only.
    # Negating gives descending order; the stable sort keeps the lower row
    # index first among ties.
    ranking = np.argsort(-cosine_scores.ravel(), kind='stable')
    limit = max(top_docs, 0)

    return [
        {'id': idx, 'cosine_similarity': cosine_scores[idx]}
        for idx in ranking[:limit]
    ]

In [55]:
# Query the cosine-similarity ranker with the question stored in the first row.
question = vi_data_df['question'].iloc[0]
results = tfidf_search(question, vectorizer, top_docs=5)

# Show the score attached to the first returned hit.
results[0]['cosine_similarity']

array([0.62799105])

In [60]:
def _pearson_or_zero(query_vector, doc_vector):
    """Return Pearson's r for the two vectors, or 0.0 when it is undefined."""
    # A constant vector (all zeros included) has zero variance, for which
    # pearsonr yields NaN; NaN scores would make the later sort unreliable.
    if np.ptp(doc_vector) == 0:
        return 0.0
    corr, _ = pearsonr(query_vector, doc_vector)
    return 0.0 if np.isnan(corr) else float(corr)


def corr_search(question, tfidf_vectorizer, documents, top_docs=5):
    """Rank documents against a question by Pearson correlation of TF-IDF vectors.

    Parameters
    ----------
    question : str
        Query text; it is lowercased before being vectorised.
    tfidf_vectorizer : object
        Vectorizer exposing ``fit_transform`` and ``transform``. It is
        re-fitted on ``documents`` on every call.
    documents : iterable of str
        Corpus to search; each document is lowercased first.
    top_docs : int, default 5
        Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of dict
        Best match first. Each entry has ``'id'`` (position of the document
        in ``documents``) and ``'corr_score'`` (Pearson r, or 0.0 where the
        correlation is undefined). Ties keep their original document order.
        An empty corpus returns an empty list.
    """
    context = [doc.lower() for doc in documents]
    if not context:
        # Nothing to rank; avoid asking the vectorizer to fit on no data.
        return []

    tfidf_matrix = tfidf_vectorizer.fit_transform(context)
    query_vector = tfidf_vectorizer.transform([question.lower()]).toarray().ravel()
    n_docs = tfidf_matrix.shape[0]

    # Whether the query can be correlated at all does not depend on the
    # document, so decide once instead of on every loop iteration. pearsonr
    # needs at least two components and a non-constant vector.
    query_usable = query_vector.size >= 2 and np.ptp(query_vector) > 0

    if query_usable:
        # Densify one row at a time to keep memory proportional to the
        # vocabulary size rather than to the whole corpus.
        corr_scores = [
            (i, _pearson_or_zero(query_vector, tfidf_matrix[i].toarray().ravel()))
            for i in range(n_docs)
        ]
    else:
        corr_scores = [(i, 0.0) for i in range(n_docs)]

    # sorted() is stable, so equal scores stay in document order.
    sorted_scores = sorted(corr_scores, key=lambda pair: pair[1], reverse=True)
    limit = max(top_docs, 0)
    return [{'id': idx, 'corr_score': score} for idx, score in sorted_scores[:limit]]

In [65]:
# Send the first row's question through the Pearson-correlation ranker,
# this time asking for ten hits over the 'text' column.
question = vi_data_df['question'].iloc[0]
results = corr_search(
    question,
    vectorizer,
    documents=vi_data_df['text'],
    top_docs=10,
)

# Show the score of the second-ranked hit.
results[1]['corr_score']

0.20734246471972492

In [66]:
# Display the whole ranked list currently bound to `results`
# (a list of {'id', 'corr_score'} dicts, as the echoed output shows).
results

[{'id': 0, 'corr_score': 0.6259599752568686},
 {'id': 97, 'corr_score': 0.20734246471972492},
 {'id': 136, 'corr_score': 0.1712461552016509},
 {'id': 384, 'corr_score': 0.1552034605469958},
 {'id': 118, 'corr_score': 0.1544453199666376},
 {'id': 88, 'corr_score': 0.13898841948520774},
 {'id': 440, 'corr_score': 0.0917184643138468},
 {'id': 102, 'corr_score': 0.08894581424330283},
 {'id': 424, 'corr_score': 0.08699546869428038},
 {'id': 405, 'corr_score': 0.08508517160933964}]