In [None]:
# =============================================
# Step 0: Setup
# =============================================
import warnings

# Install a blanket "ignore" filter so that no warning of any category is
# displayed from this point on.
warnings.filterwarnings(action="ignore")

# =============================================
# Step 1: Preprocess ICD-10 Metadata from .txt
# =============================================
import pandas as pd
import json
import os

# Read the raw ICD-10 listing from the working directory. The path is written
# without a backslash: ".\codes.txt" contains "\c", which is not a valid
# escape sequence (Python emits a warning for it) and which only resolves to
# the intended file on Windows.
with open("codes.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Each usable line is split on runs of whitespace into at most five fields.
# Fields 1 and 3 are discarded, field 2 is the code, field 4 is stored as the
# short description and the remainder of the line as the long description.
# NOTE(review): because the split is whitespace-based, "short_description" is
# always a single token and everything after it lands in "long_description".
# If the source file uses fixed-width columns this mis-assigns the two
# description fields -- confirm against the actual file layout.
entries = []
skipped_lines = 0
for line in lines:
    parts = line.strip().split(None, 4)
    if len(parts) != 5:
        # Blank lines are expected; anything else is a record we could not
        # parse and is worth surfacing instead of dropping silently.
        if parts:
            skipped_lines += 1
        continue
    _, code, _, short_desc, long_desc = parts
    entries.append({
        "code": code,
        "short_description": short_desc.strip(),
        "long_description": long_desc.strip()
    })

# Three parallel lists (same order, same length): the text that gets
# embedded, the bare codes, and the bare long descriptions.
chunks = [f"ICD Code: {e['code']}. Description: {e['long_description']}" for e in entries]
codes = [e['code'] for e in entries]
descriptions = [e['long_description'] for e in entries]

# Persist the lists so later steps can reload them without re-parsing.
os.makedirs("icd10_kb", exist_ok=True)
with open("icd10_kb/icd_chunks_3.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2)
with open("icd10_kb/icd_codes_3.json", "w", encoding="utf-8") as f:
    json.dump(codes, f, indent=2)
with open("icd10_kb/icd_descriptions_3.json", "w", encoding="utf-8") as f:
    json.dump(descriptions, f, indent=2)

if skipped_lines:
    print(f"⚠️ Skipped {skipped_lines} non-empty line(s) with fewer than 5 fields.")
print("✅ ICD-10 metadata processed.")

# =======================================================
# Step 2: Create Embedding Index using Sentence-BERT + FAISS
# =======================================================
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Fail with a clear message instead of an IndexError further down when the
# preprocessing step produced nothing to embed.
if not chunks:
    raise ValueError("No ICD-10 chunks to embed; check the Step 1 parsing output.")

# Encode every chunk into one embedding row. FAISS works on float32 matrices,
# so the dtype and memory layout are made explicit here rather than relying
# on whatever the encoder happens to return.
embeddings = np.ascontiguousarray(
    embedder.encode(chunks, show_progress_bar=True), dtype=np.float32
)

# Build an exact (brute-force) L2 index over the embeddings and persist it.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
faiss.write_index(index, "icd10_index_3.faiss")
print("✅ FAISS index created.")

# ===================================================
# Step 3: Hybrid Keyword + Semantic Retriever Mapping
# ===================================================
import ast
from rapidfuzz import fuzz, process

# Load assets with _3 suffix. They were written as UTF-8, so read them back
# with the same encoding instead of the platform default.
with open("icd10_kb/icd_chunks_3.json", "r", encoding="utf-8") as f:
    icd_chunks = json.load(f)
with open("icd10_kb/icd_codes_3.json", "r", encoding="utf-8") as f:
    icd_codes = json.load(f)
with open("icd10_kb/icd_descriptions_3.json", "r", encoding="utf-8") as f:
    icd_descs = json.load(f)
index = faiss.read_index("icd10_index_3.faiss")

# Load patient diagnoses file. The path is written without a backslash:
# ".\Diagnoses_list.xlsx" contains the invalid escape sequence "\D" and only
# resolves to the intended file on Windows.
df = pd.read_excel("Diagnoses_list.xlsx")
results = []

KEYWORD_SCORE_CUTOFF = 85      # minimum fuzzy score to accept a keyword match
HIGH_CONFIDENCE_SCORE = 95     # fuzzy score at or above which a match is "High"
SEMANTIC_TOP_K = 3             # best hit plus up to two alternatives
LOW_CONFIDENCE_DISTANCE = 0.8  # semantic distance above which a match is "Low"
# NOTE(review): IndexFlatL2 is documented as returning squared L2 distances
# and the embeddings are not normalised in this script, so the 0.8 threshold
# may be far too strict -- confirm against the observed distance range.

# NOTE(review): row_id counts positions *after* dropna(), so it drifts away
# from the spreadsheet row number once an empty cell is skipped -- confirm
# which numbering downstream consumers expect before changing it.
for row_id, raw in enumerate(df["Diagnoses_list"].dropna()):
    # Each cell is expected to hold a Python-literal collection of diagnosis
    # strings. Cells that cannot be parsed are skipped (best effort), but only
    # the errors literal_eval is documented to raise are swallowed, so
    # Ctrl-C and genuine bugs are no longer hidden.
    try:
        diagnoses = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        continue

    if isinstance(diagnoses, str):
        # A single quoted diagnosis: treat it as one item instead of
        # iterating over its characters.
        diagnoses = [diagnoses]
    elif not isinstance(diagnoses, (list, tuple, set, frozenset, dict)):
        # Numbers, None, bytes, ... carry no diagnosis text.
        continue

    for diag in diagnoses:
        if not isinstance(diag, str):
            # Neither the fuzzy matcher nor the encoder can handle non-text.
            continue

        # --- Step 1: Keyword Match ---
        # Returns (matched description, score, position) or None when no
        # description reaches the cutoff. processor=None compares the raw
        # strings as-is.
        keyword_result = process.extractOne(
            diag,
            icd_descs,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=KEYWORD_SCORE_CUTOFF,
            processor=None,
        )

        if keyword_result:
            _, score, idx = keyword_result
            code = icd_codes[idx]
            confidence = "High" if score >= HIGH_CONFIDENCE_SCORE else "Medium"
            justification = f"Keyword match with score {score}/100. Confidence: {confidence}. Found similar phrase in ICD description."
            alternatives = ""
        else:
            # --- Step 2: Semantic Search ---
            emb = embedder.encode([diag])
            D, I = index.search(np.asarray(emb, dtype=np.float32), k=SEMANTIC_TOP_K)
            # FAISS pads missing neighbours with label -1 when the index
            # holds fewer than k vectors; drop those so -1 is never used as a
            # (wrap-around) list index.
            hits = [(int(i), float(d)) for i, d in zip(I[0], D[0]) if i >= 0]
            if not hits:
                continue
            idx, best_distance = hits[0]
            code = icd_codes[idx]
            confidence = "Low" if best_distance > LOW_CONFIDENCE_DISTANCE else "Medium"
            justification = f"Semantic match using Sentence-BERT. Distance: {best_distance:.3f}. Confidence: {confidence}. No direct keyword match found above threshold."
            alternatives = ", ".join(
                f"{icd_codes[i]} (distance: {d:.3f})" for i, d in hits[1:]
            )

        results.append({
            "row_id": row_id,
            "original_diagnosis": diag,
            "matched_icd_code": code,
            "matched_description": icd_descs[idx],
            "confidence_level": confidence,
            "justification": justification,
            "alternative_codes": alternatives
        })

# ===========================================
# Step 4: Save Mapped Results with Justification
# ===========================================
# One output row per mapped diagnosis.
df_results = pd.DataFrame(results)
df_results.to_excel("final_icd10_mapped_output_3.xlsx", index=False)

# Generate summary report.
# An empty result set produces a DataFrame without a "confidence_level"
# column, so only count when there is something to count.
total_mappings = len(df_results)
confidence_counts = (
    df_results["confidence_level"].value_counts() if total_mappings else {}
)
high_confidence = int(confidence_counts.get("High", 0))
medium_confidence = int(confidence_counts.get("Medium", 0))
low_confidence = int(confidence_counts.get("Low", 0))


def _percent_of_total(count):
    """Return *count* as a percentage of all mappings (0.0 when there are none)."""
    return count / total_mappings * 100 if total_mappings else 0.0


print("✅ ICD-10 mapping with hybrid retrieval complete!")
print("📊 SUMMARY REPORT:")
print(f"   Total diagnoses mapped: {total_mappings}")
print(f"   High confidence matches: {high_confidence} ({_percent_of_total(high_confidence):.1f}%)")
print(f"   Medium confidence matches: {medium_confidence} ({_percent_of_total(medium_confidence):.1f}%)")
print(f"   Low confidence matches: {low_confidence} ({_percent_of_total(low_confidence):.1f}%)")
print("📁 File saved → final_icd10_mapped_output_3.xlsx")


  with open(".\codes.txt", "r", encoding="utf-8") as f:
  df = pd.read_excel(".\Diagnoses_list.xlsx")


✅ ICD-10 metadata processed.



Batches:   0%|          | 0/2942 [00:00<?, ?it/s]

✅ FAISS index created.
✅ ICD-10 mapping with hybrid retrieval complete!
📊 SUMMARY REPORT:
   Total diagnoses mapped: 3171
   High confidence matches: 111 (3.5%)
   Medium confidence matches: 49 (1.5%)
   Low confidence matches: 3011 (95.0%)
📁 File saved → final_icd10_mapped_output_3.xlsx
