# Compare the initial prompt outputs. Before running, set the API key and update the output file names as needed.

In [None]:
import os
from getpass import getpass

# The original assignment had no right-hand side (a syntax error left as a
# placeholder). Ask for the key at run time so it is never stored in the file;
# a GOOGLE_API_KEY that is already set in the environment is left untouched.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 1-3, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output1L.txt", "Output1M.txt", "Output1D.txt", "Output1G.txt"]
    + ["Output2L.txt", "Output2M.txt", "Output2D.txt", "Output2G.txt"]
    + ["Output3L.txt", "Output3M.txt", "Output3D.txt", "Output3G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash")


# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats

# === 8. Aggregate Statistics (Frequency + Rank) ===
# Per normalized feature: the set of files mentioning it and every 1-based
# position at which it appears.
feature_stats = defaultdict(lambda: {"files": set(), "ranks": []})

for fname, features in normalized_outputs.items():
    for rank, feat in enumerate(features, start=1):
        clean_feat = feat.lower().strip()
        feature_stats[clean_feat]["files"].add(fname)
        feature_stats[clean_feat]["ranks"].append(rank)

# === 9. Compute Aggregates ===
summary = []
for feat, data in feature_stats.items():
    num_files = len(data["files"])
    avg_rank = statistics.mean(data["ranks"])
    median_rank = statistics.median(data["ranks"])
    most_common_rank = Counter(data["ranks"]).most_common(1)[0][0]
    summary.append({
        "Feature": feat,
        "Files Mentioned": num_files,
        "Average Rank": round(avg_rank, 2),
        "Median Rank": round(median_rank, 2),
        "Most Common Rank": most_common_rank,
        "Files": "; ".join(sorted(data["files"]))
    })

# === 10. Sort & Save ===
# Most widely mentioned first; ties broken by the lower average rank.
summary_sorted = sorted(summary, key=lambda x: (-x["Files Mentioned"], x["Average Rank"]))
csv_path = os.path.join(folder_path, "Feature_Frequency_and_Rank_Normalized.csv")

# Spell the columns out rather than reading them from the first row, which
# raised IndexError when no features were loaded at all.
csv_columns = [
    "Feature", "Files Mentioned", "Average Rank",
    "Median Rank", "Most Common Rank", "Files",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(summary_sorted)

print(f"\n‚úÖ Saved normalized feature frequency analysis: {csv_path}")

# === 11. Preview Top 10 ===
print("\n=== Top 10 Most Common (Normalized) Features ===")
for item in summary_sorted[:10]:
    print(f"- {item['Feature']} ‚Üí in {item['Files Mentioned']} files (avg rank {item['Average Rank']})")


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 4-6, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output4L.txt", "Output4M.txt", "Output4D.txt", "Output4G.txt"]
    + ["Output5L.txt", "Output5M.txt", "Output5D.txt", "Output5G.txt"]
    + ["Output6L.txt", "Output6M.txt", "Output6D.txt", "Output6G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash")


# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats

# === 8. Aggregate Statistics (Frequency + Rank) ===
# Per normalized feature: the set of files mentioning it and every 1-based
# position at which it appears.
feature_stats = defaultdict(lambda: {"files": set(), "ranks": []})

for fname, features in normalized_outputs.items():
    for rank, feat in enumerate(features, start=1):
        clean_feat = feat.lower().strip()
        feature_stats[clean_feat]["files"].add(fname)
        feature_stats[clean_feat]["ranks"].append(rank)

# === 9. Compute Aggregates ===
summary = []
for feat, data in feature_stats.items():
    num_files = len(data["files"])
    avg_rank = statistics.mean(data["ranks"])
    median_rank = statistics.median(data["ranks"])
    most_common_rank = Counter(data["ranks"]).most_common(1)[0][0]
    summary.append({
        "Feature": feat,
        "Files Mentioned": num_files,
        "Average Rank": round(avg_rank, 2),
        "Median Rank": round(median_rank, 2),
        "Most Common Rank": most_common_rank,
        "Files": "; ".join(sorted(data["files"]))
    })

# === 10. Sort & Save ===
# Most widely mentioned first; ties broken by the lower average rank.
summary_sorted = sorted(summary, key=lambda x: (-x["Files Mentioned"], x["Average Rank"]))
csv_path = os.path.join(folder_path, "Feature_Frequency_and_Rank_Normalized4-6.csv")

# Spell the columns out rather than reading them from the first row, which
# raised IndexError when no features were loaded at all.
csv_columns = [
    "Feature", "Files Mentioned", "Average Rank",
    "Median Rank", "Most Common Rank", "Files",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(summary_sorted)

print(f"\n‚úÖ Saved normalized feature frequency analysis: {csv_path}")

# === 11. Preview Top 10 ===
print("\n=== Top 10 Most Common (Normalized) Features ===")
for item in summary_sorted[:10]:
    print(f"- {item['Feature']} ‚Üí in {item['Files Mentioned']} files (avg rank {item['Average Rank']})")

In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 7-9, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output7L.txt", "Output7M.txt", "Output7D.txt", "Output7G.txt"]
    + ["Output8L.txt", "Output8M.txt", "Output8D.txt", "Output8G.txt"]
    + ["Output9L.txt", "Output9M.txt", "Output9D.txt", "Output9G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash")


# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats

# === 8. Aggregate Statistics (Frequency + Rank) ===
# Per normalized feature: the set of files mentioning it and every 1-based
# position at which it appears.
feature_stats = defaultdict(lambda: {"files": set(), "ranks": []})

for fname, features in normalized_outputs.items():
    for rank, feat in enumerate(features, start=1):
        clean_feat = feat.lower().strip()
        feature_stats[clean_feat]["files"].add(fname)
        feature_stats[clean_feat]["ranks"].append(rank)

# === 9. Compute Aggregates ===
summary = []
for feat, data in feature_stats.items():
    num_files = len(data["files"])
    avg_rank = statistics.mean(data["ranks"])
    median_rank = statistics.median(data["ranks"])
    most_common_rank = Counter(data["ranks"]).most_common(1)[0][0]
    summary.append({
        "Feature": feat,
        "Files Mentioned": num_files,
        "Average Rank": round(avg_rank, 2),
        "Median Rank": round(median_rank, 2),
        "Most Common Rank": most_common_rank,
        "Files": "; ".join(sorted(data["files"]))
    })

# === 10. Sort & Save ===
# Most widely mentioned first; ties broken by the lower average rank.
summary_sorted = sorted(summary, key=lambda x: (-x["Files Mentioned"], x["Average Rank"]))
csv_path = os.path.join(folder_path, "Feature_Frequency_and_Rank_Normalized7-9.csv")

# Spell the columns out rather than reading them from the first row, which
# raised IndexError when no features were loaded at all.
csv_columns = [
    "Feature", "Files Mentioned", "Average Rank",
    "Median Rank", "Most Common Rank", "Files",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(summary_sorted)

print(f"\n‚úÖ Saved normalized feature frequency analysis: {csv_path}")

# === 11. Preview Top 10 ===
print("\n=== Top 10 Most Common (Normalized) Features ===")
for item in summary_sorted[:10]:
    print(f"- {item['Feature']} ‚Üí in {item['Files Mentioned']} files (avg rank {item['Average Rank']})")

In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 10-12, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output10L.txt", "Output10M.txt", "Output10D.txt", "Output10G.txt"]
    + ["Output11L.txt", "Output11M.txt", "Output11D.txt", "Output11G.txt"]
    + ["Output12L.txt", "Output12M.txt", "Output12D.txt", "Output12G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats

# === 8. Aggregate Statistics (Frequency + Rank) ===
# Per normalized feature: the set of files mentioning it and every 1-based
# position at which it appears.
feature_stats = defaultdict(lambda: {"files": set(), "ranks": []})

for fname, features in normalized_outputs.items():
    for rank, feat in enumerate(features, start=1):
        clean_feat = feat.lower().strip()
        feature_stats[clean_feat]["files"].add(fname)
        feature_stats[clean_feat]["ranks"].append(rank)

# === 9. Compute Aggregates ===
summary = []
for feat, data in feature_stats.items():
    num_files = len(data["files"])
    avg_rank = statistics.mean(data["ranks"])
    median_rank = statistics.median(data["ranks"])
    most_common_rank = Counter(data["ranks"]).most_common(1)[0][0]
    summary.append({
        "Feature": feat,
        "Files Mentioned": num_files,
        "Average Rank": round(avg_rank, 2),
        "Median Rank": round(median_rank, 2),
        "Most Common Rank": most_common_rank,
        "Files": "; ".join(sorted(data["files"]))
    })

# === 10. Sort & Save ===
# Most widely mentioned first; ties broken by the lower average rank.
summary_sorted = sorted(summary, key=lambda x: (-x["Files Mentioned"], x["Average Rank"]))
csv_path = os.path.join(folder_path, "Feature_Frequency_and_Rank_Normalized10-12.csv")

# Spell the columns out rather than reading them from the first row, which
# raised IndexError when no features were loaded at all.
csv_columns = [
    "Feature", "Files Mentioned", "Average Rank",
    "Median Rank", "Most Common Rank", "Files",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(summary_sorted)

print(f"\n‚úÖ Saved normalized feature frequency analysis: {csv_path}")

# === 11. Preview Top 10 ===
print("\n=== Top 10 Most Common (Normalized) Features ===")
for item in summary_sorted[:10]:
    print(f"- {item['Feature']} ‚Üí in {item['Files Mentioned']} files (avg rank {item['Average Rank']})")

In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 13-15, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output13L.txt", "Output13M.txt", "Output13D.txt", "Output13G.txt"]
    + ["Output14L.txt", "Output14M.txt", "Output14D.txt", "Output14G.txt"]
    + ["Output15L.txt", "Output15M.txt", "Output15D.txt", "Output15G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats

# === 8. Aggregate Statistics (Frequency + Rank) ===
# Per normalized feature: the set of files mentioning it and every 1-based
# position at which it appears.
feature_stats = defaultdict(lambda: {"files": set(), "ranks": []})

for fname, features in normalized_outputs.items():
    for rank, feat in enumerate(features, start=1):
        clean_feat = feat.lower().strip()
        feature_stats[clean_feat]["files"].add(fname)
        feature_stats[clean_feat]["ranks"].append(rank)

# === 9. Compute Aggregates ===
summary = []
for feat, data in feature_stats.items():
    num_files = len(data["files"])
    avg_rank = statistics.mean(data["ranks"])
    median_rank = statistics.median(data["ranks"])
    most_common_rank = Counter(data["ranks"]).most_common(1)[0][0]
    summary.append({
        "Feature": feat,
        "Files Mentioned": num_files,
        "Average Rank": round(avg_rank, 2),
        "Median Rank": round(median_rank, 2),
        "Most Common Rank": most_common_rank,
        "Files": "; ".join(sorted(data["files"]))
    })

# === 10. Sort & Save ===
# Most widely mentioned first; ties broken by the lower average rank.
summary_sorted = sorted(summary, key=lambda x: (-x["Files Mentioned"], x["Average Rank"]))
csv_path = os.path.join(folder_path, "Feature_Frequency_and_Rank_Normalized13-15.csv")

# Spell the columns out rather than reading them from the first row, which
# raised IndexError when no features were loaded at all.
csv_columns = [
    "Feature", "Files Mentioned", "Average Rank",
    "Median Rank", "Most Common Rank", "Files",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(summary_sorted)

print(f"\n‚úÖ Saved normalized feature frequency analysis: {csv_path}")

# === 11. Preview Top 10 ===
print("\n=== Top 10 Most Common (Normalized) Features ===")
for item in summary_sorted[:10]:
    print(f"- {item['Feature']} ‚Üí in {item['Files Mentioned']} files (avg rank {item['Average Rank']})")

In [None]:
# Switched to a different Gemini model to avoid timing out.
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 1-3, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output1L.txt", "Output1M.txt", "Output1D.txt", "Output1G.txt"]
    + ["Output2L.txt", "Output2M.txt", "Output2D.txt", "Output2G.txt"]
    + ["Output3L.txt", "Output3M.txt", "Output3D.txt", "Output3G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash-lite")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the JSON array that starts at the first ``[`` in ``text``.

    The model is asked for a bare JSON array but may wrap it in markdown
    fences or add commentary. The JSON decoder is started at the first ``[``
    and decides where the array ends, so text after the array (even text
    containing ``]``) no longer makes a valid array unparseable.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded list, or ``None`` when there is no ``[`` or the text
        starting there is not valid JSON (a warning is printed in that case).
    """
    start = text.find("[")
    if start == -1:
        return None
    try:
        # raw_decode parses one JSON value beginning at `start` and ignores
        # whatever follows it.
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
        return None
    return parsed

mapping = extract_json(gemini_output)
if not mapping:
    # Keep the raw reply on disk so the failure can be inspected by hand.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The mapping is model-generated, so its entries are not guaranteed to have the
# requested shape. Entries that are not objects with string "original" and
# "normalized" values are skipped instead of raising KeyError/AttributeError.
norm_dict = {
    m["original"].lower().strip(): m["normalized"].strip()
    for m in mapping
    if isinstance(m, dict)
    and isinstance(m.get("original"), str)
    and isinstance(m.get("normalized"), str)
}

# Look features up case-insensitively; unmapped features keep their own text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats
    
# === 8. Rank-Weighted Similarity Function ===
def rank_weighted_similarity(list1, list2):
    """Score how similar two ordered feature lists are.

    Every pair of equal entries (one from each list) contributes
    ``1 / (1 + |i - j|)``, where ``i`` and ``j`` are the entries' positions:
    a feature at the same position in both lists adds 1, and the contribution
    shrinks as the positions drift apart. The sum is divided by the length of
    the longer list.

    For lists without repeated entries the result lies in [0, 1] and
    identical lists score 1. Repeated entries are matched against every
    occurrence in the other list, so such lists can score above 1.

    Args:
        list1: First ordered list of feature names.
        list2: Second ordered list of feature names.

    Returns:
        The similarity as a float; 0.0 when both lists are empty.
    """
    max_len = max(len(list1), len(list2))
    if max_len == 0:
        # Nothing to compare; dividing by the zero length used to raise
        # ZeroDivisionError.
        return 0.0
    score = sum(
        1 / (1 + abs(i - j))
        for i, f1 in enumerate(list1)
        for j, f2 in enumerate(list2)
        if f1 == f2
    )
    # Normalize by the longer list's length.
    return score / max_len

# === 9. Compute Similarity Matrix ===
# One row per file (sorted by name): the file name followed by its similarity,
# rounded to 3 decimals, to every file in the same order.
file_list = sorted(normalized_outputs)
similarity_matrix = [
    [row_file] + [
        round(rank_weighted_similarity(normalized_outputs[row_file], normalized_outputs[col_file]), 3)
        for col_file in file_list
    ]
    for row_file in file_list
]

# === 10. Save Similarity Matrix to CSV ===
csv_path = os.path.join(folder_path, "Feature_List_Similarity_Matrix1-3.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row first, then the matrix rows.
    writer.writerows([["File", *file_list], *similarity_matrix])

print(f"\n‚úÖ Saved rank-weighted similarity matrix: {csv_path}")

# === 11. Preview Top Similarities ===
# Show the first five rows, cut to the file name plus five scores.
print("\n=== Sample Similarities ===")
for preview_row in similarity_matrix[:5]:
    print(preview_row[:6])


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Folder that holds the saved model outputs listed below.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Outputs 4-6, one row per output, with the L/M/D/G variants of each.
file_names = (
    ["Output4L.txt", "Output4M.txt", "Output4D.txt", "Output4G.txt"]
    + ["Output5L.txt", "Output5M.txt", "Output5D.txt", "Output5G.txt"]
    + ["Output6L.txt", "Output6M.txt", "Output6D.txt", "Output6G.txt"]
)

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable (None if unset).
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("models/gemini-2.5-flash-lite")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read each output file and return its bulleted/numbered lines in order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, the bullet character U+2022, or digits
    followed by a dot (``1.``). The marker and the whitespace after it are
    removed; file order is preserved.

    Args:
        folder_path: Directory that contains the output files.
        file_names: File names (relative to ``folder_path``) to load.

    Returns:
        dict mapping each file name that exists to its list of feature
        strings. Missing files are reported on stdout and left out.
    """
    # The pattern used to contain only the mis-encoded text "‚Ä¢" (the UTF-8
    # bytes of U+2022 decoded with the wrong codec), so real bullets read from
    # UTF-8 files never matched. Accept the real character, and keep the
    # mis-encoded sequence for files that were themselves saved that way.
    marker = re.compile(r"^(?:\*|-|\u2022|‚Ä¢|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        features = []
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if marker.match(line):
                # Drop the leading bullet or number, keep the rest of the line.
                features.append(marker.sub("", line, count=1))
        outputs[file_name] = features
    return outputs

# Load the features of every configured file; files that do not exist are skipped.
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# The {file name: feature list} dict is interpolated into the prompt as its
# Python repr. Doubled braces ({{ and }}) are literal braces in the f-string.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request covers all loaded files; the stripped reply text is parsed below.
response = model.generate_content(prompt)
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the first JSON array embedded in ``text``.

    The reply may wrap the array in markdown fences or prose, and that prose
    may itself contain square brackets. Each ``[`` is therefore tried in turn
    as the start of an array and the first one that decodes is returned;
    anything after the decoded array is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded list, or None when no ``[`` starts a valid JSON array.
        A warning is printed when brackets were present but none decoded.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    saw_bracket = start != -1
    while start != -1:
        try:
            # raw_decode parses one JSON value and tolerates trailing text.
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("[", start + 1)
        else:
            return value
    if saw_bracket:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
    return None

# Decode the reply into a list of {"original": ..., "normalized": ...} records.
mapping = extract_json(gemini_output)
if not mapping:
    # Keep the unparsed reply on disk so the failure can be inspected later.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The records are model-generated, so any of them may be malformed (not a
# dict, a missing key, a non-string value). Those are skipped and counted
# rather than aborting the run with KeyError/TypeError/AttributeError.
norm_dict = {}
skipped_records = 0
for m in mapping:
    original = m.get("original") if isinstance(m, dict) else None
    normalized = m.get("normalized") if isinstance(m, dict) else None
    if isinstance(original, str) and isinstance(normalized, str):
        # Lookup keys ignore case and surrounding whitespace.
        norm_dict[original.lower().strip()] = normalized.strip()
    else:
        skipped_records += 1
if skipped_records:
    print(f"Skipped {skipped_records} malformed mapping record(s).")

# Features that have no mapping keep their own (stripped) text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats
    
# === 8. Rank-Weighted Similarity Function ===
def rank_weighted_similarity(list1, list2):
    """Score how similar two ordered feature lists are.

    Every pair of equal items (one from each list) contributes
    ``1 / (1 + |i - j|)``, where ``i`` and ``j`` are the items' positions: a
    feature at the same rank in both lists counts fully and the contribution
    shrinks as the two ranks drift apart. The total is divided by the length
    of the longer list.

    Args:
        list1: First ordered list of feature names.
        list2: Second ordered list of feature names.

    Returns:
        The similarity as a float. It is 0.0 when nothing matches or when
        both lists are empty, and 1.0 for two identical duplicate-free lists.
        A feature repeated inside a list is matched once per occurrence, so
        lists containing duplicates can score above 1.0.
    """
    max_len = max(len(list1), len(list2))
    if max_len == 0:
        # Two empty lists share no features; also avoids dividing by zero.
        return 0.0

    score = 0.0
    for i, f1 in enumerate(list1):
        for j, f2 in enumerate(list2):
            if f1 == f2:
                # Weight depends only on how far apart the two ranks are.
                score += 1 / (1 + abs(i - j))
    # Normalize by the longer list so missing features lower the score.
    return score / max_len

# === 9. Compute Similarity Matrix ===
# One row per file: the file name followed by its rounded similarity to every
# file (itself included), with rows and columns in the same sorted order.
file_list = sorted(normalized_outputs)
similarity_matrix = [
    [row_name] + [
        round(
            rank_weighted_similarity(
                normalized_outputs[row_name], normalized_outputs[col_name]
            ),
            3,
        )
        for col_name in file_list
    ]
    for row_name in file_list
]

# === 10. Save Similarity Matrix to CSV ===
csv_path = os.path.join(folder_path, "Feature_List_Similarity_Matrix4-6.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then the matrix rows.
    writer.writerows([["File", *file_list], *similarity_matrix])

print(f"\n‚úÖ Saved rank-weighted similarity matrix: {csv_path}")

# === 11. Preview Top Similarities ===
# Show at most the first five rows, each cut to the name plus five scores.
print("\n=== Sample Similarities ===")
for preview_row in similarity_matrix[:5]:
    print(preview_row[:6])


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Directory the model output files are read from; the result files produced
# further down in this cell are written into the same folder.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Output files numbered 7-9 are compared in this cell.
# NOTE(review): the L/M/D/G suffix presumably identifies the model that
# produced each file — confirm against the generation step.
file_names = [
    "Output7L.txt", "Output7M.txt", "Output7D.txt", "Output7G.txt",
    "Output8L.txt", "Output8M.txt", "Output8D.txt", "Output8G.txt",
    "Output9L.txt", "Output9M.txt", "Output9D.txt", "Output9G.txt"
]

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable; os.getenv
# returns None when it is unset, and that value is passed on unchanged.
API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)  # key comes from the environment, nothing to edit here
# Model handle used for the feature-name normalization request further down.
model = genai.GenerativeModel("models/gemini-2.5-flash-lite")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read bullet/numbered list items from each file, preserving their order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, a bullet character or ``<digits>.``. The
    marker and any whitespace after it are removed from the stored text.

    Args:
        folder_path: Directory that contains the files.
        file_names: File names (relative to ``folder_path``) to read.

    Returns:
        A dict mapping each file name that could be opened to its list of
        feature strings in file order. Files that do not exist are reported
        on stdout and left out of the dict.
    """
    # "\u2022" is the real bullet character; the "‚Ä¢" alternative is the
    # mis-decoded form of the same bullet and is kept so files that contain
    # that literal text keep matching.
    marker = re.compile(r"^(\*|-|‚Ä¢|\u2022|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
        except FileNotFoundError:
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        features = []
        for ln in lines:
            stripped = ln.strip()
            found = marker.match(stripped)
            if found is None:
                continue
            feature = stripped[found.end():]
            # A bare marker ("-" or "*" alone) carries no feature text.
            if feature:
                features.append(feature)
        outputs[file_name] = features
    return outputs

# Maps each file that was found to its ordered feature list. Missing files are
# skipped by the loader, so the count printed here can be below len(file_names).
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# Doubled braces ({{ }}) are literal braces inside the f-string; the only
# interpolated value is {llm_outputs}, inserted as the dict's Python str() form.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request for the whole batch of files.
response = model.generate_content(prompt)
# NOTE(review): response.text presumably raises when the reply has no text
# part (e.g. a blocked response) — confirm against the SDK documentation.
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the first JSON array embedded in ``text``.

    The reply may wrap the array in markdown fences or prose, and that prose
    may itself contain square brackets. Each ``[`` is therefore tried in turn
    as the start of an array and the first one that decodes is returned;
    anything after the decoded array is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded list, or None when no ``[`` starts a valid JSON array.
        A warning is printed when brackets were present but none decoded.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    saw_bracket = start != -1
    while start != -1:
        try:
            # raw_decode parses one JSON value and tolerates trailing text.
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("[", start + 1)
        else:
            return value
    if saw_bracket:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
    return None

# Decode the reply into a list of {"original": ..., "normalized": ...} records.
mapping = extract_json(gemini_output)
if not mapping:
    # Keep the unparsed reply on disk so the failure can be inspected later.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The records are model-generated, so any of them may be malformed (not a
# dict, a missing key, a non-string value). Those are skipped and counted
# rather than aborting the run with KeyError/TypeError/AttributeError.
norm_dict = {}
skipped_records = 0
for m in mapping:
    original = m.get("original") if isinstance(m, dict) else None
    normalized = m.get("normalized") if isinstance(m, dict) else None
    if isinstance(original, str) and isinstance(normalized, str):
        # Lookup keys ignore case and surrounding whitespace.
        norm_dict[original.lower().strip()] = normalized.strip()
    else:
        skipped_records += 1
if skipped_records:
    print(f"Skipped {skipped_records} malformed mapping record(s).")

# Features that have no mapping keep their own (stripped) text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats
    
# === 8. Rank-Weighted Similarity Function ===
def rank_weighted_similarity(list1, list2):
    """Score how similar two ordered feature lists are.

    Every pair of equal items (one from each list) contributes
    ``1 / (1 + |i - j|)``, where ``i`` and ``j`` are the items' positions: a
    feature at the same rank in both lists counts fully and the contribution
    shrinks as the two ranks drift apart. The total is divided by the length
    of the longer list.

    Args:
        list1: First ordered list of feature names.
        list2: Second ordered list of feature names.

    Returns:
        The similarity as a float. It is 0.0 when nothing matches or when
        both lists are empty, and 1.0 for two identical duplicate-free lists.
        A feature repeated inside a list is matched once per occurrence, so
        lists containing duplicates can score above 1.0.
    """
    max_len = max(len(list1), len(list2))
    if max_len == 0:
        # Two empty lists share no features; also avoids dividing by zero.
        return 0.0

    score = 0.0
    for i, f1 in enumerate(list1):
        for j, f2 in enumerate(list2):
            if f1 == f2:
                # Weight depends only on how far apart the two ranks are.
                score += 1 / (1 + abs(i - j))
    # Normalize by the longer list so missing features lower the score.
    return score / max_len

# === 9. Compute Similarity Matrix ===
# One row per file: the file name followed by its rounded similarity to every
# file (itself included), with rows and columns in the same sorted order.
file_list = sorted(normalized_outputs)
similarity_matrix = [
    [row_name] + [
        round(
            rank_weighted_similarity(
                normalized_outputs[row_name], normalized_outputs[col_name]
            ),
            3,
        )
        for col_name in file_list
    ]
    for row_name in file_list
]

# === 10. Save Similarity Matrix to CSV ===
csv_path = os.path.join(folder_path, "Feature_List_Similarity_Matrix7-9.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then the matrix rows.
    writer.writerows([["File", *file_list], *similarity_matrix])

print(f"\n‚úÖ Saved rank-weighted similarity matrix: {csv_path}")

# === 11. Preview Top Similarities ===
# Show at most the first five rows, each cut to the name plus five scores.
print("\n=== Sample Similarities ===")
for preview_row in similarity_matrix[:5]:
    print(preview_row[:6])


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Directory the model output files are read from; the result files produced
# further down in this cell are written into the same folder.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Output files numbered 10-12 are compared in this cell.
# NOTE(review): the L/M/D/G suffix presumably identifies the model that
# produced each file — confirm against the generation step.
file_names = [
    "Output10L.txt", "Output10M.txt", "Output10D.txt", "Output10G.txt",
    "Output11L.txt", "Output11M.txt", "Output11D.txt", "Output11G.txt",
    "Output12L.txt", "Output12M.txt", "Output12D.txt", "Output12G.txt"
]

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable; os.getenv
# returns None when it is unset, and that value is passed on unchanged.
API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)  # key comes from the environment, nothing to edit here
# Model handle used for the feature-name normalization request further down.
model = genai.GenerativeModel("models/gemini-2.5-flash-lite")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read bullet/numbered list items from each file, preserving their order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, a bullet character or ``<digits>.``. The
    marker and any whitespace after it are removed from the stored text.

    Args:
        folder_path: Directory that contains the files.
        file_names: File names (relative to ``folder_path``) to read.

    Returns:
        A dict mapping each file name that could be opened to its list of
        feature strings in file order. Files that do not exist are reported
        on stdout and left out of the dict.
    """
    # "\u2022" is the real bullet character; the "‚Ä¢" alternative is the
    # mis-decoded form of the same bullet and is kept so files that contain
    # that literal text keep matching.
    marker = re.compile(r"^(\*|-|‚Ä¢|\u2022|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
        except FileNotFoundError:
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        features = []
        for ln in lines:
            stripped = ln.strip()
            found = marker.match(stripped)
            if found is None:
                continue
            feature = stripped[found.end():]
            # A bare marker ("-" or "*" alone) carries no feature text.
            if feature:
                features.append(feature)
        outputs[file_name] = features
    return outputs

# Maps each file that was found to its ordered feature list. Missing files are
# skipped by the loader, so the count printed here can be below len(file_names).
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# Doubled braces ({{ }}) are literal braces inside the f-string; the only
# interpolated value is {llm_outputs}, inserted as the dict's Python str() form.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request for the whole batch of files.
response = model.generate_content(prompt)
# NOTE(review): response.text presumably raises when the reply has no text
# part (e.g. a blocked response) — confirm against the SDK documentation.
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the first JSON array embedded in ``text``.

    The reply may wrap the array in markdown fences or prose, and that prose
    may itself contain square brackets. Each ``[`` is therefore tried in turn
    as the start of an array and the first one that decodes is returned;
    anything after the decoded array is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded list, or None when no ``[`` starts a valid JSON array.
        A warning is printed when brackets were present but none decoded.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    saw_bracket = start != -1
    while start != -1:
        try:
            # raw_decode parses one JSON value and tolerates trailing text.
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("[", start + 1)
        else:
            return value
    if saw_bracket:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
    return None

# Decode the reply into a list of {"original": ..., "normalized": ...} records.
mapping = extract_json(gemini_output)
if not mapping:
    # Keep the unparsed reply on disk so the failure can be inspected later.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The records are model-generated, so any of them may be malformed (not a
# dict, a missing key, a non-string value). Those are skipped and counted
# rather than aborting the run with KeyError/TypeError/AttributeError.
norm_dict = {}
skipped_records = 0
for m in mapping:
    original = m.get("original") if isinstance(m, dict) else None
    normalized = m.get("normalized") if isinstance(m, dict) else None
    if isinstance(original, str) and isinstance(normalized, str):
        # Lookup keys ignore case and surrounding whitespace.
        norm_dict[original.lower().strip()] = normalized.strip()
    else:
        skipped_records += 1
if skipped_records:
    print(f"Skipped {skipped_records} malformed mapping record(s).")

# Features that have no mapping keep their own (stripped) text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats
    
# === 8. Rank-Weighted Similarity Function ===
def rank_weighted_similarity(list1, list2):
    """Score how similar two ordered feature lists are.

    Every pair of equal items (one from each list) contributes
    ``1 / (1 + |i - j|)``, where ``i`` and ``j`` are the items' positions: a
    feature at the same rank in both lists counts fully and the contribution
    shrinks as the two ranks drift apart. The total is divided by the length
    of the longer list.

    Args:
        list1: First ordered list of feature names.
        list2: Second ordered list of feature names.

    Returns:
        The similarity as a float. It is 0.0 when nothing matches or when
        both lists are empty, and 1.0 for two identical duplicate-free lists.
        A feature repeated inside a list is matched once per occurrence, so
        lists containing duplicates can score above 1.0.
    """
    max_len = max(len(list1), len(list2))
    if max_len == 0:
        # Two empty lists share no features; also avoids dividing by zero.
        return 0.0

    score = 0.0
    for i, f1 in enumerate(list1):
        for j, f2 in enumerate(list2):
            if f1 == f2:
                # Weight depends only on how far apart the two ranks are.
                score += 1 / (1 + abs(i - j))
    # Normalize by the longer list so missing features lower the score.
    return score / max_len

# === 9. Compute Similarity Matrix ===
# One row per file: the file name followed by its rounded similarity to every
# file (itself included), with rows and columns in the same sorted order.
file_list = sorted(normalized_outputs)
similarity_matrix = [
    [row_name] + [
        round(
            rank_weighted_similarity(
                normalized_outputs[row_name], normalized_outputs[col_name]
            ),
            3,
        )
        for col_name in file_list
    ]
    for row_name in file_list
]

# === 10. Save Similarity Matrix to CSV ===
csv_path = os.path.join(folder_path, "Feature_List_Similarity_Matrix10-12.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then the matrix rows.
    writer.writerows([["File", *file_list], *similarity_matrix])

print(f"\n‚úÖ Saved rank-weighted similarity matrix: {csv_path}")

# === 11. Preview Top Similarities ===
# Show at most the first five rows, each cut to the name plus five scores.
print("\n=== Sample Similarities ===")
for preview_row in similarity_matrix[:5]:
    print(preview_row[:6])


In [None]:
# === 1. Imports & Configuration ===
import os
import re
import csv
import json
import statistics
from collections import defaultdict, Counter
import google.generativeai as genai

# === 2. Configuration ===
# Directory the model output files are read from; the result files produced
# further down in this cell are written into the same folder.
folder_path = r"C:\PhD\Prompt_Engineering\Step1_Local_Outputs"
# Output files numbered 13-15 are compared in this cell.
# NOTE(review): the L/M/D/G suffix presumably identifies the model that
# produced each file — confirm against the generation step.
file_names = [
    "Output13L.txt", "Output13M.txt", "Output13D.txt", "Output13G.txt",
    "Output14L.txt", "Output14M.txt", "Output14D.txt", "Output14G.txt",
    "Output15L.txt", "Output15M.txt", "Output15D.txt", "Output15G.txt"
]

# === 3. Gemini API Configuration ===
# The key is taken from the GOOGLE_API_KEY environment variable; os.getenv
# returns None when it is unset, and that value is passed on unchanged.
API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)  # key comes from the environment, nothing to edit here
# Model handle used for the feature-name normalization request further down.
model = genai.GenerativeModel("models/gemini-2.5-flash-lite")

# === 4. Load Ordered Features ===
def load_ordered_features(folder_path, file_names):
    """Read bullet/numbered list items from each file, preserving their order.

    A line counts as a feature when, after stripping surrounding whitespace,
    it starts with ``*``, ``-``, a bullet character or ``<digits>.``. The
    marker and any whitespace after it are removed from the stored text.

    Args:
        folder_path: Directory that contains the files.
        file_names: File names (relative to ``folder_path``) to read.

    Returns:
        A dict mapping each file name that could be opened to its list of
        feature strings in file order. Files that do not exist are reported
        on stdout and left out of the dict.
    """
    # "\u2022" is the real bullet character; the "‚Ä¢" alternative is the
    # mis-decoded form of the same bullet and is kept so files that contain
    # that literal text keep matching.
    marker = re.compile(r"^(\*|-|‚Ä¢|\u2022|\d+\.)\s*")

    outputs = {}
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
        except FileNotFoundError:
            print(f"‚ö†Ô∏è File not found: {file_name}")
            continue

        features = []
        for ln in lines:
            stripped = ln.strip()
            found = marker.match(stripped)
            if found is None:
                continue
            feature = stripped[found.end():]
            # A bare marker ("-" or "*" alone) carries no feature text.
            if feature:
                features.append(feature)
        outputs[file_name] = features
    return outputs

# Maps each file that was found to its ordered feature list. Missing files are
# skipped by the loader, so the count printed here can be below len(file_names).
llm_outputs = load_ordered_features(folder_path, file_names)
print(f"‚úÖ Loaded {len(llm_outputs)} files with ordered features.")

# === 5. Ask Gemini to Normalize Feature Names ===
# Doubled braces ({{ }}) are literal braces inside the f-string; the only
# interpolated value is {llm_outputs}, inserted as the dict's Python str() form.
prompt = f"""
You will be given multiple lists of biochemical or chemical features extracted from different models.

For each unique feature string, normalize it to a single canonical concept name.
Keep a mapping in JSON format like this:

[
  {{
    "original": "Mol weight",
    "normalized": "Molecular weight"
  }},
  {{
    "original": "MW",
    "normalized": "Molecular weight"
  }},
  ...
]

Respond ONLY with valid JSON (a single array, no markdown or explanations).

Here are the lists:
{llm_outputs}
"""

print("üîÑ Sending normalization request to Gemini (may take a minute)...")
# One request for the whole batch of files.
response = model.generate_content(prompt)
# NOTE(review): response.text presumably raises when the reply has no text
# part (e.g. a blocked response) — confirm against the SDK documentation.
gemini_output = response.text.strip()

# === 6. Parse Gemini JSON Output Safely ===
def extract_json(text):
    """Return the first JSON array embedded in ``text``.

    The reply may wrap the array in markdown fences or prose, and that prose
    may itself contain square brackets. Each ``[`` is therefore tried in turn
    as the start of an array and the first one that decodes is returned;
    anything after the decoded array is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded list, or None when no ``[`` starts a valid JSON array.
        A warning is printed when brackets were present but none decoded.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    saw_bracket = start != -1
    while start != -1:
        try:
            # raw_decode parses one JSON value and tolerates trailing text.
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find("[", start + 1)
        else:
            return value
    if saw_bracket:
        print("‚ö†Ô∏è Gemini output not valid JSON.")
    return None

# Decode the reply into a list of {"original": ..., "normalized": ...} records.
mapping = extract_json(gemini_output)
if not mapping:
    # Keep the unparsed reply on disk so the failure can be inspected later.
    raw_path = os.path.join(folder_path, "Gemini_Normalization_RAW.txt")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(gemini_output)
    raise SystemExit(f"‚ùå Could not parse Gemini output. Raw text saved at {raw_path}")

print(f"‚úÖ Parsed {len(mapping)} normalized mappings from Gemini.")

# === 7. Apply Normalization ===
# The records are model-generated, so any of them may be malformed (not a
# dict, a missing key, a non-string value). Those are skipped and counted
# rather than aborting the run with KeyError/TypeError/AttributeError.
norm_dict = {}
skipped_records = 0
for m in mapping:
    original = m.get("original") if isinstance(m, dict) else None
    normalized = m.get("normalized") if isinstance(m, dict) else None
    if isinstance(original, str) and isinstance(normalized, str):
        # Lookup keys ignore case and surrounding whitespace.
        norm_dict[original.lower().strip()] = normalized.strip()
    else:
        skipped_records += 1
if skipped_records:
    print(f"Skipped {skipped_records} malformed mapping record(s).")

# Features that have no mapping keep their own (stripped) text.
normalized_outputs = {}
for fname, feats in llm_outputs.items():
    normalized_feats = [norm_dict.get(f.lower().strip(), f.strip()) for f in feats]
    normalized_outputs[fname] = normalized_feats
    
# === 8. Rank-Weighted Similarity Function ===
def rank_weighted_similarity(list1, list2):
    """Score how similar two ordered feature lists are.

    Every pair of equal items (one from each list) contributes
    ``1 / (1 + |i - j|)``, where ``i`` and ``j`` are the items' positions: a
    feature at the same rank in both lists counts fully and the contribution
    shrinks as the two ranks drift apart. The total is divided by the length
    of the longer list.

    Args:
        list1: First ordered list of feature names.
        list2: Second ordered list of feature names.

    Returns:
        The similarity as a float. It is 0.0 when nothing matches or when
        both lists are empty, and 1.0 for two identical duplicate-free lists.
        A feature repeated inside a list is matched once per occurrence, so
        lists containing duplicates can score above 1.0.
    """
    max_len = max(len(list1), len(list2))
    if max_len == 0:
        # Two empty lists share no features; also avoids dividing by zero.
        return 0.0

    score = 0.0
    for i, f1 in enumerate(list1):
        for j, f2 in enumerate(list2):
            if f1 == f2:
                # Weight depends only on how far apart the two ranks are.
                score += 1 / (1 + abs(i - j))
    # Normalize by the longer list so missing features lower the score.
    return score / max_len

# === 9. Compute Similarity Matrix ===
# One row per file: the file name followed by its rounded similarity to every
# file (itself included), with rows and columns in the same sorted order.
file_list = sorted(normalized_outputs)
similarity_matrix = [
    [row_name] + [
        round(
            rank_weighted_similarity(
                normalized_outputs[row_name], normalized_outputs[col_name]
            ),
            3,
        )
        for col_name in file_list
    ]
    for row_name in file_list
]

# === 10. Save Similarity Matrix to CSV ===
csv_path = os.path.join(folder_path, "Feature_List_Similarity_Matrix13-15.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then the matrix rows.
    writer.writerows([["File", *file_list], *similarity_matrix])

print(f"\n‚úÖ Saved rank-weighted similarity matrix: {csv_path}")

# === 11. Preview Top Similarities ===
# Show at most the first five rows, each cut to the name plus five scores.
print("\n=== Sample Similarities ===")
for preview_row in similarity_matrix[:5]:
    print(preview_row[:6])