In [1]:
#install dependencies
!pip install -q sentence-transformers transformers torch yake openpyxl
!pip install -q yake
!pip install -q sentence-transformers yake
!pip install -q tqdm
!pip install -q xlsxwriter



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#  Imports
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import yake
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm




In [3]:
from google.colab import files
uploaded = files.upload()

Saving gea_statements.xlsx to gea_statements.xlsx


In [5]:
# load data
df = pd.read_excel("/content/gea_statements.xlsx")

# clean type column (strip spaces, lowercase)
df['Type'] = df['Type'].str.strip().str.lower()

# fix common typos
df['Type'] = df['Type'].replace({
    'core alue': 'core value',
    'steategy': 'strategy',
    'goals': 'goal'
})

# create lists
missions   = df.loc[df['Type'] == 'mission', 'Statement'].dropna().tolist()
visions    = df.loc[df['Type'] == 'vision', 'Statement'].dropna().tolist()
values     = df.loc[df['Type'] == 'core value', 'Statement'].dropna().tolist()
goals      = df.loc[df['Type'] == 'goal', 'Statement'].dropna().tolist()
strategies = df.loc[df['Type'] == 'strategy', 'Statement'].dropna().tolist()

print("Loaded:")
print("Missions:", len(missions))
print("Visions:", len(visions))
print("Values:", len(values))
print("Goals:", len(goals))
print("Strategies:", len(strategies))

KeyError: 'Type'

In [None]:
# embeddings
EMB_MODEL_NAME = "all-mpnet-base-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
emb_model = SentenceTransformer(EMB_MODEL_NAME, device="cuda" if torch.cuda.is_available() else "cpu")

def embed(texts):
    if isinstance(texts, str):
        texts = [texts]
    return emb_model.encode(texts, normalize_embeddings=True, batch_size=32, show_progress_bar=False)

In [None]:
#  NLI / entailment
NLI_MODEL = "roberta-large-mnli"
tok = AutoTokenizer.from_pretrained(NLI_MODEL)
nli = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
nli.eval()
CONTR_IDX, NEUTRAL_IDX, ENTAIL_IDX = 0, 1, 2

@torch.inference_mode()
def nli_probs(premises, hypotheses, batch_size=16):
    device = next(nli.parameters()).device
    c_all, n_all, e_all = [], [], []
    for i in range(0, len(premises), batch_size):
        p = premises[i:i+batch_size]
        h = hypotheses[i:i+batch_size]
        enc = tok(p, h, padding=True, truncation=True, return_tensors="pt").to(device)
        logits = nli(**enc).logits
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        c_all.append(probs[:, CONTR_IDX])
        n_all.append(probs[:, NEUTRAL_IDX])
        e_all.append(probs[:, ENTAIL_IDX])
    return np.concatenate(c_all), np.concatenate(n_all), np.concatenate(e_all)



In [None]:
# keyword extraction using YAKE
def extract_keywords(text, max_keywords=5):
    kw_extractor = yake.KeywordExtractor(lan="en", n=3, top=max_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, score in keywords] or [text]


In [None]:
# precompute embeddings & keywords for speed
def precompute_embeddings(texts, use_keywords=False, max_kw=5):
    """
    Returns:
        items: list[str or list[str]]  -> text or keywords
        embeddings: list[np.array]    -> embeddings for each text/keyword
    """
    items = []
    embeddings = []

    for t in texts:
        # extract keywords if requested
        if use_keywords:
            kws = extract_keywords(t, max_keywords=max_kw)
            if not kws:
                kws = [t]  # fallback to full text
        else:
            kws = [t]

        items.append(kws)

        # embed each keyword and take mean for multi-keyword entries
        kw_embs = embed(kws)
        mean_emb = np.mean(kw_embs, axis=0)
        embeddings.append(mean_emb)

    return items, embeddings

# optimized scoring using precomputed embeddings
def build_matrix_fast(rows, cols, use_keywords=False, max_kw=5):
    # precompute embeddings
    rows_items, rows_embs = precompute_embeddings(rows, use_keywords=False)
    cols_items, cols_embs = precompute_embeddings(cols, use_keywords=use_keywords, max_kw=max_kw)

    mat = []
    debug_rows = []

    for i, r_emb in enumerate(rows_embs):
        row_scores = []
        for j, c_emb in enumerate(cols_embs):
            # cosine similarity
            sim = float(np.dot(r_emb, c_emb))

            # NLI on full texts (for more precision, you can also do batch NLI)
            c, n, e = nli_probs([rows[i]], [cols[j]])
            c, n, e = float(c[0]), float(n[0]), float(e[0])

            # apply scoring rules
            if c >= 0.60 and sim >= 0.20: score = -2
            elif c >= 0.40 and sim >= 0.20: score = -1
            elif sim >= 0.55: score = 3
            elif sim >= 0.42: score = 2
            elif sim >= 0.22: score = 1
            else: score = 0

            row_scores.append(score)
            debug_rows.append({
                "premise": rows[i],
                "hypothesis": cols[j],
                "score": score,
                "keywords": cols_items[j]
            })

        mat.append(row_scores)

    df_matrix = pd.DataFrame(mat, columns=cols)
    df_matrix.insert(0, "Premise", rows)
    df_debug = pd.DataFrame(debug_rows)
    return df_matrix, df_debug


In [None]:
# scoring pairs
def score_pair(premise, hypothesis):
    s_vec = embed(premise)[0]
    h_vec = embed(hypothesis)[0]
    sim = float(np.dot(s_vec, h_vec))  # cosine similarity, embeddings normalized

    c, n, e = nli_probs([premise], [hypothesis])
    c, n, e = float(c[0]), float(n[0]), float(e[0])

    # scoring rules
    if c >= 0.60 and sim >= 0.20: return -2, sim, c, n, e
    if c >= 0.40 and sim >= 0.20: return -1, sim, c, n, e
    if sim >= 0.55: return 3, sim, c, n, e
    if sim >= 0.42: return 2, sim, c, n, e
    if sim >= 0.22: return 1, sim, c, n, e
    return 0, sim, c, n, e


In [None]:
# plot matrix
def plot_matrix(df_matrix, title="Coherence Matrix"):
    plt.figure(figsize=(10,6))
    sns.heatmap(df_matrix.iloc[:,1:], annot=True, fmt="d", cmap="coolwarm", cbar=True)
    plt.title(title)
    plt.ylabel("Premise")
    plt.xlabel("Hypothesis")
    plt.show()


In [None]:
from tqdm.notebook import tqdm

matrices = {}

companies = df['Company'].unique()[:4]  # only first 4 companies

for company in tqdm(companies, desc="Processing first 4 companies"):
    df_c = df[df['Company'] == company]

    missions   = df_c.loc[df_c['Type'] == 'mission', 'Statement'].dropna().tolist()
    visions    = df_c.loc[df_c['Type'] == 'vision', 'Statement'].dropna().tolist()
    values     = df_c.loc[df_c['Type'] == 'core value', 'Statement'].dropna().tolist()
    goals      = df_c.loc[df_c['Type'] == 'goal', 'Statement'].dropna().tolist()
    strategies = df_c.loc[df_c['Type'] == 'strategy', 'Statement'].dropna().tolist()

    if len(missions)==0 and len(visions)==0 and len(values)==0 and len(goals)==0:
        continue  # skip empty companies

    matrices[company] = {}

    # build matrices per company
    matrices[company]['Mission_vs_Vision'] = build_matrix_fast(missions, visions, use_keywords=True)
    # matrices[company]['Vision_vs_Mission'] = build_matrix_fast(visions, missions, use_keywords=True)
    matrices[company]['Values_vs_Mission'] = build_matrix_fast(values, missions, use_keywords=False)
    matrices[company]['Values_vs_Vision'] = build_matrix_fast(values, visions, use_keywords=False)
    matrices[company]['Goals_vs_Mission'] = build_matrix_fast(goals, missions, use_keywords=False)
    matrices[company]['Goals_vs_Vision'] = build_matrix_fast(goals, visions, use_keywords=False)
    matrices[company]['Goals_vs_Strategy'] = build_matrix_fast(goals, strategies, use_keywords=False)


In [None]:
from IPython.display import display
import ipywidgets as widgets
import os


# function to save matrices per company

def save_company_matrices(matrices_dict, folder="company_excels"):
    """
    Save each company's matrices to a separate Excel file.
    Returns dict of filenames.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

    saved_files = {}

    for company, comps_matrices in matrices_dict.items():
        filename = f"{folder}/{company.replace(' ','_')}_matrices.xlsx"
        with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
            for name, (df_matrix, df_debug) in comps_matrices.items():
                df_matrix.to_excel(writer, sheet_name=name[:31], index=False)
        saved_files[company] = filename

    return saved_files

# function to create download buttons

def create_download_buttons(saved_files):
    """
    Create clickable download buttons in Colab for each file.
    """
    for company, file_path in saved_files.items():
        button = widgets.Button(description=f"Download {company}")
        output = widgets.Output()

        def on_button_clicked(b, path=file_path):
            with output:
                files.download(path)

        button.on_click(on_button_clicked)
        display(button, output)

# save & create buttons

saved_files = save_company_matrices(matrices)
create_download_buttons(saved_files)


for company, company_matrices in matrices.items():
    print(f"\n=== {company} ===")
    for matrix_name, (df_matrix, df_debug) in company_matrices.items():
        print(f"\n--- {matrix_name} ---")
        display(df_matrix)
        # plot_matrix(df_matrix, title=f"{company} - {matrix_name}")


