In [9]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

def calculate_avg_similarity(model, reference_sentences, model_outputs) -> float:
    """
    Return the mean cosine similarity between index-aligned sentence pairs.

    Both lists are embedded with ``model.encode(..., convert_to_tensor=True)``
    and sentence ``i`` of ``reference_sentences`` is compared only with
    sentence ``i`` of ``model_outputs``.

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``
            (in this file, a ``SentenceTransformer`` instance).
        reference_sentences: Sentences to compare against.
        model_outputs: Candidate sentences, one per reference, in the same
            order.

    Returns:
        The average of the per-pair cosine similarities as a plain ``float``.
        ``0.0`` is returned, after printing an error message, when the two
        lists differ in length or when there are no pairs to compare.
    """
    if len(reference_sentences) != len(model_outputs):
        print(f"Error: Mismatched list lengths. {len(reference_sentences)} references vs {len(model_outputs)} outputs.")
        return 0.0

    # With zero pairs the mean is undefined: np.mean([]) yields NaN and emits
    # a RuntimeWarning. Report it the same way as the length mismatch above.
    if len(reference_sentences) == 0:
        print("Error: No sentence pairs to compare.")
        return 0.0

    reference_embeddings = model.encode(reference_sentences, convert_to_tensor=True)
    output_embeddings = model.encode(model_outputs, convert_to_tensor=True)

    # cos_sim returns the full references-by-outputs matrix; only the diagonal
    # (reference i vs. output i) holds the matched pairs.
    cosine_scores = util.cos_sim(reference_embeddings, output_embeddings)
    paired_scores = cosine_scores.diagonal().tolist()

    # Convert to a built-in float so both the error and success paths return
    # the same type.
    return float(np.mean(paired_scores))


# --- 1. DEFINE YOUR DATA ---
# Three parallel lists of seven sentences each. Entry i of every output list
# is a rewrite of entry i of the reference list, so the lists must stay the
# same length and in the same order.

# Reference sentences that the candidate outputs are compared against.
reference_style_sentences = [
    "This imagined story of one is the real experience of many. It is rarely a surprise to see the campus buzzing with energy past midnight, whether it's the chatter at Aromas or the tables at the Reading Hall",
    "The institute has a sense of beauty even in those wee hours before sunrise, as one appreciates the thriving student culture that has existed at IITB since time immemorial",
    "That nighttime grind, however, furthers a vicious cycle of sleep deprivation passed down to each batch",
    "There could be a multitude of reasons for participating in this cycle and losing sleep",
    "A good percentage of us suffer from this pattern quite regularly, sometimes for days or even weeks on end. As a result, a lot of the instizens seem visibly afflicted with chronic fatigue",
    "Don’t get us wrong, unlike some other nosy figures in the institute, the intent here is not to push the agenda of ‘students must sleep on time,’ and in the process, dismantle all of the spotty yet admirable legacy we have built, particularly in freedom",
    "This article is not a lecture on sleeping on time - it is rather an attempt to understand if the issue is purely conceived out of choice, or systematic problems beyond our control"
]

# Candidate rewrites, one per reference sentence.
# NOTE(review): presumably produced by few-shot (three-shot) prompting, going
# by the variable name - confirm the source of these outputs.
three_shot_outputs = [
    "What may appear as an individual eccentricity is, in truth, a collective choreography. The campus, after dusk, hums with a peculiar brilliance—from the soft murmur at Aromas to the unbroken stillness of the Reading Hall—an order so ingrained it transcends mere habit, becoming heritage.",
    "There exists an ineffable gravitas in IIT Bombay’s pre-dawn hours—a quiet sanctity preserved through generations, where sleeplessness has been reimagined not as deprivation, but as devotion.",
    "Yet this ritual of wakefulness, noble in its intent, sustains a lineage of exhaustion—a cycle handed down, batch after batch, its toll etched into weary eyes and unspoken resignation.",
    "The impulses sustaining this rhythm are manifold: ambition cloaked as purpose, anxiety disguised as diligence, and the silent companionship of those unwilling to be left behind in the race for significance.",
    "A considerable portion of the populace remains ensnared in this cadence for months, even years—until fatigue itself becomes a badge, worn with the quiet pride of endurance.",
    "Let it be said—this is no moral plea for early rest, nor an assault on the sanctified autonomy of the nocturnal scholar. The right to sculpt one’s own rhythm remains a freedom too dearly earned to be tampered with.",
    "This meditation, then, is not on sleep, but on agency. Whether this sleepless fervor is an act of will or a symptom of unseen design remains the question that shadows every glowing screen at 3 a.m."
]

# Candidate rewrites, one per reference sentence.
# NOTE(review): presumably generated by a PEFT fine-tuned Phi model, going by
# the variable name - confirm the source of these outputs.
PEFT_phi_outputs = [
    "This is the kind of experience that many students go through, with them often being found hanging out at places like Aromas or the Reading Hall past midnight",
    "This charm is especially evident in the pre-dawn hours when one can sense the long-standing culture of IITB where students go beyond just learning to code.",
    "While this kind of nocturnal study also perpetuates the cycle of Sleep deprivation, feeding into the very problem.",
    "The reasons for the same are manifold.",
    "This is a problem that a significant portion of the population faces, often for extended periods, leading to an epidemic of chronic fatigue.",
    "It’s not about imposing a specific bedtime, akin to other such attempts to control student life.",
    "This is not just a column on sleep schedules, but rather a question of whether this is a matter of choice or a symptom of a larger problem."
]

# --- 2. LOAD MODEL ---
# Name of the sentence-embedding model handed to SentenceTransformer.
# NOTE(review): the name suggests a paraphrase-oriented model, so similarity
# scores likely reflect shared meaning as much as writing style - confirm this
# is the intended metric.
model_name = 'paraphrase-mpnet-base-v2'
# Progress messages bracket the constructor call so the user can see when
# loading starts and when it has finished.
print(f"Loading model: {model_name}...")
model = SentenceTransformer(model_name)
print("Model loaded.")

# --- 3. CALCULATE SIMILARITIES ---
print("Calculating similarities...")
# Score the output lists defined in this file. Names that are not defined here
# would either raise NameError on a fresh run or silently reuse whatever an
# earlier notebook cell left in the session.
# Model A = few-shot prompted outputs; Model B = fine-tuned (PEFT) outputs.
avg_sim_model_a = calculate_avg_similarity(model, reference_style_sentences, three_shot_outputs)
avg_sim_model_b = calculate_avg_similarity(model, reference_style_sentences, PEFT_phi_outputs)

# --- 4. DISPLAY RESULTS ---
# Display label -> average similarity. Defined once so the table rows and the
# "best model" selection below cannot drift apart.
models = {
    "few shot prompting chat gpt 5": avg_sim_model_a,
    "Fine-tuned Model phi 3.5": avg_sim_model_b
}

# Size the label column to the longest label (never narrower than 20) so the
# "|" separators line up; a fixed width shorter than the labels pushes the
# score column out of alignment.
name_width = max(20, len("Model"), *(len(name) for name in models))
score_width = 25

print("\n--- Style Evaluation Results ---")
print("Average cosine similarity of each model's output vs reference style sentences:\n")
print(f"{'Model':<{name_width}} | {'Avg. Cosine Similarity':<{score_width}}")
# Rule spans both columns plus the 3-character " | " separator.
print("-" * (name_width + 3 + score_width))
for name, score in models.items():
    print(f"{name:<{name_width}} | {score:<{score_width}.4f}")

print("\n--- Interpretation ---")
# max() over the dict iterates its keys; on a tie the first-listed model wins.
best_model = max(models, key=models.get)
best_score = models[best_model]

print(f"✅ {best_model} (Score: {best_score:.4f}) shows the highest similarity.")
print("This indicates its style is the closest to your reference sentences.")
print("\nA higher score means the model's output style is more similar to your reference style.")


Loading model: paraphrase-mpnet-base-v2...
Model loaded.
Calculating similarities...

--- Style Evaluation Results ---
Average cosine similarity of each model's output vs reference style sentences:

Model                | Avg. Cosine Similarity   
-----------------------------------------------
few shot prompting chat gpt 5 | 0.5874                   
Fine-tuned Model phi 3.5 | 0.7020                   

--- Interpretation ---
✅ Fine-tuned Model phi 3.5 (Score: 0.7020) shows the highest similarity.
This indicates its style is the closest to your reference sentences.

A higher score means the model's output style is more similar to your reference style.
