In [None]:
!pip install pandas scikit-learn sentence-transformers openpyxl  #if needed

In [None]:
# =============== Imports ===============
import random
import re

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Seed the Python, NumPy and PyTorch random number generators with one shared
# value so repeated runs start from the same RNG state.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# =============== 1. Config section (edit according to your setup) ===============

# --- Input / output workbooks ---
ABSTRACT_FILE = "YOUR FILE NAME.xlsx"      # Workbook holding the abstracts
SDG_KEYWORD_FILE = "SDG_keywords.xlsx"     # Workbook holding the SDG keyword lists
OUTPUT_FILE = "YOUR_FILE_NAME.xlsx"        # Workbook the results are written to

# --- Input column names (must match the headers in your own Excel files) ---
ABSTRACT_COL = "abstract"     # Abstract text
SDG_COL = "goal"              # SDG label, e.g. "Goal 1" ... "Goal 17"
KEYWORDS_COL = "keywords"     # Comma-separated keyword phrases

# --- Output column names ---
PRIMARY_SDG_COL = "Primary_SDG"
PRIMARY_SCORE_COL = "Primary_SDG_Score"
MATCHED_KEYWORDS_COL = "Primary_SDG_Semantic_Matched_Keywords"

# --- Thresholds ---
# TF-IDF relevance cut-off. It only drives the "relevant or not" flag; the
# choice of Primary SDG itself is unaffected. Tune as needed (e.g. 0.05 or 0.10).
RELEVANCE_THRESHOLD = 0.05

# Minimum semantic similarity for a keyword to count as a hit (higher = stricter).
# Suggested starting point is 0.60; drop to 0.55 if too few keywords match.
KEYWORD_SIM_THRESHOLD = 0.60


# =============== 2. Read data ===============

df_abs = pd.read_excel(ABSTRACT_FILE)
df_sdg = pd.read_excel(SDG_KEYWORD_FILE)

# Missing abstracts become empty strings so every later text step sees a str
df_abs[ABSTRACT_COL] = df_abs[ABSTRACT_COL].fillna("").astype(str)

# SDG table: discard rows lacking a goal label or keywords, then make sure
# both columns hold strings
df_sdg = (
    df_sdg
    .dropna(subset=[SDG_COL, KEYWORDS_COL])
    .astype({SDG_COL: str, KEYWORDS_COL: str})
)


# =============== 3. Build one "keyword document" per SDG (for TF-IDF) ===============

def _join_keywords(keyword_cells):
    """Join the non-missing keyword cells of a single SDG into one string."""
    return ", ".join(str(cell) for cell in keyword_cells if pd.notna(cell))


# An SDG may span several rows; all of its keyword cells are merged into one
# long string, e.g. Goal 1 -> "extreme poverty, income inequality, ..."
sdg_doc_series = df_sdg.groupby(SDG_COL)[KEYWORDS_COL].apply(_join_keywords)

sdg_ids = sdg_doc_series.index.tolist()   # SDG labels, e.g. ["Goal 1", "Goal 2", ...]
sdg_texts = sdg_doc_series.tolist()       # Matching keyword document per SDG

print(f"Detected {len(sdg_ids)} SDGs: {sdg_ids}")


# =============== 4. TF-IDF: compute Primary SDG and Score ===============

abstract_texts = df_abs[ABSTRACT_COL].tolist()
n_sdg = len(sdg_ids)

# One shared vocabulary is fitted over the SDG keyword documents followed by
# the abstracts, so the first n_sdg rows of the matrix are the SDG vectors.
corpus = sdg_texts + abstract_texts

vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),   # single words and two-word phrases
    max_df=0.95,
    min_df=1,
)
tfidf_matrix = vectorizer.fit_transform(corpus)

sdg_matrix = tfidf_matrix[:n_sdg]        # rows for the SDG documents
abstract_matrix = tfidf_matrix[n_sdg:]   # rows for the abstracts

# One row per abstract, one column per SDG
similarity_matrix = cosine_similarity(abstract_matrix, sdg_matrix)

# Best-scoring SDG for every abstract, together with that score
primary_sdg_indices = similarity_matrix.argmax(axis=1)
primary_scores = similarity_matrix.max(axis=1)
primary_sdgs = [sdg_ids[col] for col in primary_sdg_indices]

# Store the results next to the abstracts
df_abs[PRIMARY_SDG_COL] = primary_sdgs
df_abs[PRIMARY_SCORE_COL] = primary_scores

# Boolean flag: does the best score reach the relevance threshold?
df_abs["Primary_SDG_Relevant"] = df_abs[PRIMARY_SCORE_COL].ge(RELEVANCE_THRESHOLD)


# =============== 5. Sentence-level semantic matching + fallback keywords ===============
import re
import torch
from sentence_transformers import SentenceTransformer, util

print("Loading sentence embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded.")

# KEYWORD_SIM_THRESHOLD is intentionally NOT re-assigned here. It is defined in
# the config section at the top of the file; re-assigning it at this point
# silently discarded whatever value was configured there. Tune it in the
# config section only.

# Fallback: only if Primary_SDG_Score >= this value do we force at least one
# keyword when no phrase reaches KEYWORD_SIM_THRESHOLD.
FALLBACK_PRIMARY_SCORE = 0.05

# Sentence boundary used by split_sentences:
#   * runs of '!' / '?' and their full-width CJK forms ('。', '！', '？') always
#     end a sentence;
#   * a run of ASCII periods ends a sentence only when it is followed, possibly
#     through closing quotes/brackets, by whitespace or the end of the text.
#     This keeps decimals and version numbers such as "3.5" in one piece.
_SENTENCE_BOUNDARY = re.compile(r'[。！？!?]+|\.+(?=["\'”’)\]]*(?:\s|$))')


def split_sentences(text: str):
    """Split *text* into a list of trimmed, non-empty sentences.

    The input is coerced with ``str()`` and stripped first. The terminating
    punctuation itself is removed from the returned sentences.

    Args:
        text: Text to split; any object is accepted and converted via ``str``.

    Returns:
        A list of sentence strings in their original order; an empty list when
        the text is empty or whitespace only.
    """
    text = str(text).strip()
    if not text:
        return []
    parts = _SENTENCE_BOUNDARY.split(text)
    # Splitting can leave empty or whitespace-only fragments (e.g. after the
    # final terminator); drop them.
    return [p.strip() for p in parts if p.strip()]

# 1) Build two lookups keyed by SDG label: the list of keyword phrases and the
#    embeddings of those phrases
sdg_phrase_map = {}
sdg_phrase_emb_map = {}

for sdg_label, keyword_doc in sdg_doc_series.items():
    # Break the comma-separated document into trimmed, non-empty phrases
    trimmed = (piece.strip() for piece in str(keyword_doc).split(","))
    keyword_list = [kw for kw in trimmed if kw]

    # SDGs without any usable phrase get no entry in either lookup
    if keyword_list:
        sdg_phrase_map[sdg_label] = keyword_list
        sdg_phrase_emb_map[sdg_label] = model.encode(
            keyword_list,
            convert_to_tensor=True,
            show_progress_bar=False,
        )

print(f"Computed keyword embeddings for {len(sdg_phrase_map)} SDGs.")

# Pull the per-abstract inputs back out of the DataFrame
abstract_texts = df_abs[ABSTRACT_COL].fillna("").astype(str).tolist()
primary_sdgs = df_abs[PRIMARY_SDG_COL].tolist()
primary_scores_list = df_abs[PRIMARY_SCORE_COL].tolist()

semantic_matched_keywords = []

# 2) Per abstract: sentence-level semantic matching against the keyword
#    phrases of its Primary SDG, with a single-keyword fallback
for abs_text, sdg_id, primary_score in zip(abstract_texts, primary_sdgs, primary_scores_list):
    abs_text = abs_text.strip()

    # Nothing to match when the abstract is blank, its SDG has no keyword
    # phrases, or no sentence survives splitting -> record an empty result
    can_match = bool(abs_text) and sdg_id in sdg_phrase_map
    sentences = split_sentences(abs_text) if can_match else []
    if not sentences:
        semantic_matched_keywords.append("")
        continue

    # All sentences of the abstract are embedded in a single call
    sent_embs = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    phrases = sdg_phrase_map[sdg_id]
    phrase_embs = sdg_phrase_emb_map[sdg_id]

    # Rows correspond to phrases, columns to sentences
    sim_matrix = util.cos_sim(phrase_embs, sent_embs)

    # Best similarity each phrase reaches over all sentences
    phrase_best_sims = [float(torch.max(row)) for row in sim_matrix]

    # Normal hits: every phrase whose best similarity reaches the threshold
    matched = [
        ph for ph, sim in zip(phrases, phrase_best_sims)
        if sim >= KEYWORD_SIM_THRESHOLD
    ]

    # Fallback: without any normal hit, but with a high enough primary score,
    # keep the single best phrase (the first one on ties) so that at least one
    # keyword is reported. The fallback is not tagged in the output.
    if not matched and primary_score >= FALLBACK_PRIMARY_SCORE:
        best_phrase, best_sim = None, -1.0
        for ph, sim in zip(phrases, phrase_best_sims):
            if sim > best_sim:
                best_phrase, best_sim = ph, sim
        if best_phrase is not None:
            matched = [best_phrase]

    # De-duplicate, sort alphabetically and join into a single cell value
    semantic_matched_keywords.append("; ".join(sorted(set(matched))))

# Store the matched keywords next to the abstracts
df_abs[MATCHED_KEYWORDS_COL] = semantic_matched_keywords

# =============== 6. Export results ===============

# Write the annotated abstracts to the output workbook, leaving out the
# DataFrame index, then confirm on the console.
df_abs.to_excel(excel_writer=OUTPUT_FILE, index=False)
print(f"All done! Results saved to: {OUTPUT_FILE}")
