In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util

# Fetch the NLTK data used below: the sentence/word tokenizer, the POS tagger
# and the VADER sentiment lexicon.
# NLTK >= 3.8.2 / 3.9 looks up the tokenizer and tagger under the new ids
# "punkt_tab" and "averaged_perceptron_tagger_eng"; older releases still use
# the legacy ids. Requesting both keeps the script working on either version.
# nltk.download() reports a failure by returning False rather than raising,
# so an id unknown to a given setup does not abort the run.
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "vader_lexicon",
):
    nltk.download(_resource)

# Table of captions: one column per model listed in `models`, read from disk.
df = pd.read_csv("generated_caption_model.csv")

# Display name of each model -> name of the CSV column with its captions.
models = dict(zip(
    ("Model M1", "Model M2", "Model M3"),
    ("M1_caption", "M2_caption", "M3_caption"),
))

# Shared, expensive-to-build scorers: created once and reused by the helpers.
sia = SentimentIntensityAnalyzer()
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Filled in below: display name -> {metric name: raw score}.
eda_results = dict()

# ---------- Helper Functions ----------

def vocabulary_richness(captions):
    """Return the type-token ratio of a collection of captions.

    Every caption is lower-cased and tokenized with ``word_tokenize``; the
    result is the number of distinct tokens divided by the total number of
    tokens across all captions.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The ratio as a float in (0, 1], or NaN when the captions yield no
        tokens at all (the ratio is undefined in that case).
    """
    tokens = [w for cap in captions for w in word_tokenize(cap.lower())]
    if not tokens:
        # Avoid ZeroDivisionError; NaN mirrors what the mean-based metrics
        # in this file produce for empty input.
        return float("nan")
    return len(set(tokens)) / len(tokens)

def avg_caption_length(captions):
    """Return the mean number of ``word_tokenize`` tokens per caption.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The arithmetic mean of the per-caption token counts, as computed
        by ``np.mean``.
    """
    token_counts = []
    for caption in captions:
        token_counts.append(len(word_tokenize(caption)))
    return np.mean(token_counts)

def sentiment_score(captions):
    """Return the mean "compound" polarity score over all captions.

    Each caption is scored with the module-level ``sia`` analyzer and the
    "compound" entry of its result is averaged with ``np.mean``.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The mean compound score.
    """
    compound_scores = []
    for caption in captions:
        polarity = sia.polarity_scores(caption)
        compound_scores.append(polarity["compound"])
    return np.mean(compound_scores)

def pos_diversity(captions):
    """Return the ratio of distinct part-of-speech tags to total tags.

    Each caption is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; the tags of all captions are pooled before the ratio is
    taken.

    Args:
        captions: Iterable of caption strings.

    Returns:
        ``distinct tags / total tags`` as a float, or NaN when the captions
        yield no tokens at all (the ratio is undefined in that case).
    """
    tags = [
        tag
        for caption in captions
        for _, tag in pos_tag(word_tokenize(caption))
    ]
    if not tags:
        # Avoid ZeroDivisionError; NaN mirrors what the mean-based metrics
        # in this file produce for empty input.
        return float("nan")
    return len(set(tags)) / len(tags)

def semantic_similarity(refs, cands):
    """Return the mean cosine similarity between aligned caption pairs.

    ``refs[i]`` is compared only with ``cands[i]``. Both lists are embedded
    with the module-level ``embedder`` and the per-pair cosine similarities
    are averaged.

    Args:
        refs: List of reference caption strings.
        cands: List of candidate caption strings, aligned with ``refs``.

    Returns:
        The mean pairwise cosine similarity as a Python float, or NaN when
        both lists are empty.

    Raises:
        ValueError: If ``refs`` and ``cands`` differ in length, since the
            pairing would then be undefined.
    """
    if len(refs) != len(cands):
        raise ValueError(
            f"refs and cands must have the same length "
            f"(got {len(refs)} and {len(cands)})"
        )
    if not refs:
        # Nothing to compare; skip the encoder entirely.
        return float("nan")

    ref_emb = embedder.encode(refs, convert_to_tensor=True)
    cand_emb = embedder.encode(cands, convert_to_tensor=True)
    # util.cos_sim would build the full N x N matrix and its mean would mix
    # in every mismatched reference/candidate pair. pairwise_cos_sim scores
    # row i against row i only, which is the per-caption similarity wanted.
    return util.pairwise_cos_sim(ref_emb, cand_emb).mean().item()

# ---------- Compute EDA ----------

# The reference captions are the same for every model, so convert the column
# to a list of strings once instead of on every loop iteration.
references = df["reference_caption"].astype(str).tolist()

for model, col in models.items():
    # astype(str) makes sure every cell is a string before it reaches the
    # tokenizer, sentiment analyzer and sentence encoder.
    captions = df[col].astype(str).tolist()

    # Raw (un-normalized) metric values for this model.
    eda_results[model] = {
        "Vocabulary": vocabulary_richness(captions),
        "Readability": avg_caption_length(captions),
        "Sentiment": sentiment_score(captions),
        "POS Diversity": pos_diversity(captions),
        "Similarity": semantic_similarity(references, captions)
    }

# ---------- Normalize (0–1 scale) ----------

# Rows = models, columns = metrics.
eda_df = pd.DataFrame(eda_results).T

# Min-max scale each metric column to the 0-1 range.
_col_min = eda_df.min()
_col_range = eda_df.max() - _col_min
# When every model has the same value for a metric the range is 0 and the
# division would turn the whole column into NaN (0 / 0). Dividing by 1
# instead maps such a constant column to 0.0.
_col_range = _col_range.replace(0, 1)
eda_df = (eda_df - _col_min) / _col_range

# Turn the model names from the index into a regular "Model" column.
eda_df.reset_index(inplace=True)
eda_df.rename(columns={"index": "Model"}, inplace=True)

eda_df.to_csv("eda_results_real.csv", index=False)

print("✅ REAL EDA computed and saved to eda_results_real.csv")
print(eda_df)
