In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import time
# Wall-clock timestamp taken before any work starts; the end of the script
# subtracts it to report total runtime. Because main() prompts with input(),
# the reported figure also includes the time spent waiting for the user.
start_time = time.time()
def load_data(file_path, max_rows=100):
    """Load at most ``max_rows`` data rows of a CSV file into a DataFrame.

    Args:
        file_path: Path, URL or file-like object accepted by
            ``pandas.read_csv``.
        max_rows: Maximum number of data rows to read. Defaults to 100, the
            limit this script has always applied. ``None`` reads every row.

    Returns:
        A DataFrame holding the first ``max_rows`` rows of the file.
    """
    # nrows makes the parser stop at the limit instead of reading the whole
    # file and throwing most of it away with head().
    return pd.read_csv(file_path, nrows=max_rows)
def lowercase_all(df):
    """Return a new DataFrame with every string cell lower-cased.

    Non-string cells (numbers, NaN, ...) are passed through unchanged.
    Column names and the index are not modified, and ``df`` itself is left
    untouched.

    Args:
        df: DataFrame whose string cells should be lower-cased.

    Returns:
        A DataFrame of the same shape with lower-cased string cells.
    """
    # DataFrame.map was introduced in pandas 2.1; earlier releases expose the
    # same element-wise operation as applymap.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(
        lambda value: value.lower() if isinstance(value, str) else value
    )
def generate_embeddings(model, df):
    """Encode every column of ``df`` and return the vectors keyed by column.

    Each cell is converted to ``str`` before being passed to
    ``model.encode``. The tensor the encoder returns is moved to the CPU and
    converted to a NumPy array.

    Args:
        model: Encoder exposing ``encode(texts, convert_to_tensor=True)``.
        df: DataFrame whose columns are encoded one at a time.

    Returns:
        Dict mapping each column name to that column's embedding array
        (presumably one vector per row; depends on the encoder).
    """
    def encode_column(name):
        texts = df[name].astype(str).tolist()
        tensor = model.encode(texts, convert_to_tensor=True)
        return tensor.cpu().numpy()

    return {name: encode_column(name) for name in df.columns}
def compute_cosine_similarity(query_embedding, entry_embeddings):
    """Return the highest cosine similarity between the query and any entry.

    Both inputs are L2-normalised before the dot product, so the result is a
    true cosine similarity in [-1, 1] even when the embeddings are not
    unit-length. For inputs that are already normalised this equals the
    plain dot product.

    Args:
        query_embedding: Array of shape ``(d,)`` or ``(q, d)``.
        entry_embeddings: Array of shape ``(d,)`` or ``(n, d)``.

    Returns:
        The maximum cosine similarity over all entry/query pairs, as a NumPy
        scalar.

    Raises:
        ValueError: If either input contains no vectors (``max`` of an empty
            array).
    """
    query = np.atleast_2d(np.asarray(query_embedding))
    entries = np.atleast_2d(np.asarray(entry_embeddings))

    query_norms = np.linalg.norm(query, axis=1, keepdims=True)
    entry_norms = np.linalg.norm(entries, axis=1, keepdims=True)

    # A zero vector has no direction; dividing by 1 keeps it all-zero so its
    # similarity comes out as 0 instead of NaN.
    query_unit = query / np.where(query_norms == 0, 1, query_norms)
    entry_unit = entries / np.where(entry_norms == 0, 1, entry_norms)

    return np.dot(entry_unit, query_unit.T).max()
def calculate_similarities(df, query, model):
    """Score every row of ``df`` against ``query``.

    Every cell is embedded (via ``generate_embeddings``) and compared with
    the query embedding; a row's score is the best similarity among its
    cells.

    Args:
        df: DataFrame to search.
        query: Query string to embed and compare against each cell.
        model: Encoder exposing ``encode(texts, convert_to_tensor=True)``.

    Returns:
        List of ``(row_position, best_similarity)`` tuples, one per row, in
        row order. ``row_position`` is the 0-based position of the row
        (suitable for ``df.iloc``); for a default RangeIndex it is identical
        to the index label. An empty DataFrame yields ``[]``.
    """
    # Nothing to score: skip the (expensive) encoder calls entirely.
    if df.empty:
        return []

    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()
    embeddings = generate_embeddings(model, df)

    results = []
    # The embedding arrays are ordered by row position, so iterate positions
    # rather than index labels; labels only coincide with positions for a
    # default RangeIndex.
    for position in range(len(df)):
        best_similarity = max(
            compute_cosine_similarity(
                query_embedding,
                embeddings[column][position].reshape(1, -1),
            )
            for column in df.columns
        )
        results.append((position, best_similarity))
    return results
def find_best_match(similarities):
    """Pick the entry with the highest score from ``similarities``.

    Args:
        similarities: Iterable of ``(index, score)`` pairs.

    Returns:
        The pair with the largest score (the first one on ties), or ``None``
        when ``similarities`` is empty.
    """
    best_match = None
    for candidate in similarities:
        # Strict '>' keeps the earliest entry when scores tie.
        if best_match is None or candidate[1] > best_match[1]:
            best_match = candidate
    return best_match
def process_data(file_path, query, model_name='all-MiniLM-L6-v2'):
    """Load a CSV, lower-case it, and find the row that best matches ``query``.

    Args:
        file_path: Path of the CSV file to search.
        query: Query string to match against the cells of the file.
        model_name: Name or path of the SentenceTransformer model to load.
            Defaults to ``'all-MiniLM-L6-v2'``, the model this script has
            always used.

    Returns:
        Tuple ``(best_match, df)`` where ``best_match`` is whatever
        ``find_best_match`` returns for the computed similarities (``None``
        when there are none) and ``df`` is the lower-cased DataFrame that was
        searched.
    """
    df = lowercase_all(load_data(file_path))
    model = SentenceTransformer(model_name)
    similarities = calculate_similarities(df, query, model)
    return find_best_match(similarities), df
def main():
    """Prompt for a CSV path and a query, then print the best-matching row.

    Prints the inputs, the index and similarity score of the best match and
    the matching row itself, or "No match found" when there is no match.
    """
    file_path = input("Enter the path to the CSV file: ")
    user_query = input("Enter the query to match: ")
    best_match, df = process_data(file_path, user_query)

    # Guard clause: nothing to report when no row was scored.
    if best_match is None:
        print("No match found")
        return

    row_index, score = best_match
    print("File path:", file_path)
    print("Query given:", user_query)
    print("Best Match Found at index:", row_index)
    print("Similarity Score:", score)
    print(df.iloc[row_index])
if __name__ == "__main__":
    main()
    # Report the elapsed wall-clock time only when executed as a script, so
    # that importing this module does not print a runtime line. The figure
    # is measured from start_time and therefore includes time spent waiting
    # on the input() prompts in main().
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Runtime: {runtime} seconds")



File path: /Users/haresh.sam/Documents/tempV/table.csv
Query given: ethnic
Best Match Found at index: 33
Similarity Score: 0.6264677
title              ethnicity in the ancient world - did it matter?
author                                              erich s. gruen
category                                              anthropology
publish_year                               first published in 2020
title_id                                        /works/ol22153255w
author_id                        /authors/ol398860a/erich_s._gruen
cover_url                         /images/icons/avatar_book-sm.png
book_stats       publish date 2020|publisher  de gruyter gmbh, ...
descriptions     this edition doesn't have a description yet. c...
reading_stats       0 want to read|0 currently reading|0 have read
Name: 33, dtype: object
Runtime: 55.36014485359192 seconds
