In [4]:
import ast
import numpy as np
import torch
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, util
import gc

# ✅ Optimized model list (lightweight and performant)
model_list = [
    "all-MiniLM-L6-v2", "all-MiniLM-L12-v2", "all-mpnet-base-v2",
    "LaBSE", "multi-qa-MiniLM-L6-cos-v1"
]

# 📥 Get user input
user_input = input("Enter a list of sentences (e.g., ['What is AI?', 'Tell me a joke']): ")
try:
    answerlist = ast.literal_eval(user_input)
    if not isinstance(answerlist, list) or not all(isinstance(x, str) for x in answerlist):
        raise ValueError
except Exception:
    print("❌ Invalid input. Please enter a valid Python list of strings.")
    exit()

# 🔍 Select best model by PCA variance
def get_best_model_by_variance(answerlist, model_list):
    best_model = None
    best_variance = 0.0

    for model_name in model_list:
        try:
            print(f"\n🔄 Evaluating model: {model_name}")
            model = SentenceTransformer(model_name)
            embeddings = model.encode(answerlist, normalize_embeddings=False)
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(embeddings)
            total_var = sum(pca.explained_variance_ratio_)
            print(f"📊 {model_name}: Total PCA variance = {total_var * 100:.2f}%")

            if total_var > best_variance:
                best_variance = total_var
                best_model = model_name

            # 🚮 Cleanup
            del model, embeddings, reduced, pca
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"⚠️ {model_name}: Skipped due to error: {e}")

    return best_model
    
print((get_best_model_by_variance(answerlist, model_list)))
# 🚀 Determine best model and load it
best_model_name = get_best_model_by_variance(answerlist, model_list)
print(f"\n✅ Best model based on PCA variance: {best_model_name}")
model = SentenceTransformer(best_model_name)

# ⚡ Precompute embeddings
real_embeddings = model.encode(answerlist, convert_to_tensor=True)

# 🔎 Single query function
def answer(query):
    query_embedding = model.encode(query, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, real_embeddings)[0]
    best_idx = torch.argmax(similarities).item()
    best_score = similarities[best_idx].item()
    best_match = answerlist[best_idx]
    print(f"✅ Best Match: '{best_match}' (score: {best_score:.4f})")
    return best_match

# 🔄 Batch query function
def batch_answer(queries):
    if isinstance(queries, str):
        queries = [queries]
    query_embeddings = model.encode(queries, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embeddings, real_embeddings)
    results = []
    for i in range(len(queries)):
        sim_row = similarities[i]
        best_idx = torch.argmax(sim_row).item()
        best_match = answerlist[best_idx]
        results.append(best_match)
    return results


Enter a list of sentences (e.g., ['What is AI?', 'Tell me a joke']):  [         "LaBSE", "paraphrase-multilingual-mpnet-base-v2", "paraphrase-multilingual-MiniLM-L12-v2",         "all-mpnet-base-v2", "all-MiniLM-L12-v2", "all-MiniLM-L6-v2", "static-similarity-mrl-multilingual-v1",         "sentence-t5-xl", "sentence-t5-xxl", "sentence-t5-large", "sentence-t5-base",         "static-retrieval-mrl-en-v1", "multi-qa-MiniLM-L6-cos-v1", "gtr-t5-xxl", "gtr-t5-xl", "gtr-t5-base", "gtr-t5-large"     ]



🔄 Evaluating model: all-MiniLM-L6-v2
📊 all-MiniLM-L6-v2: Total PCA variance = 42.28%

🔄 Evaluating model: all-MiniLM-L12-v2
📊 all-MiniLM-L12-v2: Total PCA variance = 41.34%

🔄 Evaluating model: all-mpnet-base-v2
📊 all-mpnet-base-v2: Total PCA variance = 46.18%

🔄 Evaluating model: LaBSE
📊 LaBSE: Total PCA variance = 47.44%

🔄 Evaluating model: multi-qa-MiniLM-L6-cos-v1
📊 multi-qa-MiniLM-L6-cos-v1: Total PCA variance = 41.95%
LaBSE

🔄 Evaluating model: all-MiniLM-L6-v2
📊 all-MiniLM-L6-v2: Total PCA variance = 42.28%

🔄 Evaluating model: all-MiniLM-L12-v2
📊 all-MiniLM-L12-v2: Total PCA variance = 41.34%

🔄 Evaluating model: all-mpnet-base-v2
📊 all-mpnet-base-v2: Total PCA variance = 46.18%

🔄 Evaluating model: LaBSE
📊 LaBSE: Total PCA variance = 47.44%

🔄 Evaluating model: multi-qa-MiniLM-L6-cos-v1
📊 multi-qa-MiniLM-L6-cos-v1: Total PCA variance = 41.95%

✅ Best model based on PCA variance: LaBSE
