## Dev file by Logan Kelsch
### This file is used for initial instantiation of model ideas, initial runs and tests, and temp code storage

### Deprecated Train and test for Neural-Net

In [1]:
import sspec_nn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Load the training table: every column except the last goes into X, the last column is y.
data = pd.read_csv("temp_data/train.csv")
table = data.values
X, y = table[:, :-1], table[:, -1]

# Hold out 20% of the rows for testing, stratified on y.
split = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_test, y_train, y_test = split
print(*(part.shape for part in split))

# Train through the project's sspec_nn module, then predict on the held-out rows.
model = sspec_nn.sspec_nn_train(
    X_train,
    y_train,
    batch_size=16,
    epochs=250,
    optimizer='adam',
)
y_proba, y_pred = sspec_nn.sspec_nn_predict(model, X_test)

# Render the test-set confusion matrix as an image.
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm)
plt.show()

ModuleNotFoundError: No module named 'tensorflow'

### Softmax (medical) condition classifier training function

In [None]:
# Update function to do per-class splits guaranteeing at least 1 sample in test and 1 in val for every class.
from pathlib import Path
import json
import numpy as np
import pandas as pd
from joblib import dump, load
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, top_k_accuracy_score
from sklearn.linear_model import SGDClassifier

# Default artifact directory. Kept as a module-level name for any code that
# refers to it; the training function writes to its own ``save_dir`` argument.
SAVE_DIR = Path("./symptoms/model/")

def train_softmax_classifier_classwise(
    data_path: str,
    save_dir: str,
    random_state: int=42,
    epochs: int=6,
    batch_size: int=128,
    lr_alpha: float=1e-5,
    learning_rate: str="optimal",
    eta0: float=0.0,
    ngram_range_word=(1,2),
    ngram_range_char=(3,5),
    max_features_word: int=40000,
    max_features_char: int=50000,
    min_df: int=2,
    early_stopping: bool=True,
    patience: int=2,
    verbose: bool=True
):
    """Train a TF-IDF + SGD (log-loss) condition classifier with a class-wise split.

    The CSV at ``data_path`` must have ``user_input`` (text) and
    ``target_condition_id`` (integer label) columns. Classes with fewer than
    three rows are dropped. For every remaining class one random row goes to
    the test set, one to the validation set, and the rest to training, so each
    class is represented in all three splits.

    Word- and character-level TF-IDF features are fitted on the training text
    and stacked. The classifier is trained with mini-batch ``partial_fit``
    using inverse-frequency sample weights; after each epoch the validation
    macro-F1 is computed and, when it improves, the classifier, both
    vectorizers and the label map are written to ``save_dir``.

    Args:
        data_path: Path of the training CSV.
        save_dir: Directory that receives every artifact (created if missing).
        random_state: Seed for the split/shuffle generator and the classifier.
        epochs: Maximum number of passes over the training set.
        batch_size: Rows per ``partial_fit`` call.
        lr_alpha: Passed to ``SGDClassifier`` as ``alpha``.
        learning_rate: Passed to ``SGDClassifier`` as ``learning_rate``.
        eta0: Passed to ``SGDClassifier`` as ``eta0``.
        ngram_range_word: n-gram range of the word-level vectorizer.
        ngram_range_char: n-gram range of the character-level vectorizer.
        max_features_word: Vocabulary cap of the word-level vectorizer.
        max_features_char: Vocabulary cap of the character-level vectorizer.
        min_df: Minimum document frequency for both vectorizers.
        early_stopping: Stop once ``epoch - best_epoch >= patience``.
        patience: Number of non-improving epochs tolerated before stopping.
        verbose: Print per-epoch validation metrics and the stop message.

    Returns:
        ``{"report": <test metrics dict>, "paths": <artifact path strings>}``.
    """
    rng = np.random.default_rng(random_state)
    data_path = Path(data_path)
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # Every artifact lives under the caller-supplied directory. Previously the
    # module-level SAVE_DIR was used here, silently ignoring ``save_dir``.
    clf_path = save_dir / "sgd_softmax_best.joblib"
    word_path = save_dir / "tfidf_word.joblib"
    char_path = save_dir / "tfidf_char.joblib"
    label_map_path = save_dir / "label_map.json"
    report_path = save_dir / "report.json"
    infer_path = save_dir / "inference.py"

    df = pd.read_csv(data_path)
    texts_all = df["user_input"].astype(str).to_numpy()
    y_all = df["target_condition_id"].astype(int).to_numpy()

    # keep classes with >=3
    uniq, counts = np.unique(y_all, return_counts=True)
    keep = uniq[counts >= 3]
    mask = np.isin(y_all, keep)
    texts = texts_all[mask]
    y = y_all[mask]

    # map to indices (position in the sorted list of kept condition ids)
    unique_ids = np.array(sorted(np.unique(y).tolist()), dtype=int)
    id_to_idx = {cid: i for i, cid in enumerate(unique_ids)}
    y_idx = np.array([id_to_idx[int(v)] for v in y], dtype=int)

    # build per-class indices and split: 1 to test, 1 to val, rest to train
    by_class = {c: np.where(y_idx == c)[0] for c in range(len(unique_ids))}
    train_idx, val_idx, test_idx = [], [], []
    for c, idxs in by_class.items():
        idxs = idxs.copy()
        rng.shuffle(idxs)
        if len(idxs) >= 3:
            test_idx.append(idxs[0])
            val_idx.append(idxs[1])
            train_idx.extend(idxs[2:])
        else:
            # shouldn't happen (we filtered), but fallback: 1 test, rest train
            test_idx.append(idxs[0])
            train_idx.extend(idxs[1:])

    X_train_text = texts[train_idx].tolist()
    X_val_text = texts[val_idx].tolist()
    X_test_text = texts[test_idx].tolist()
    y_train_idx = y_idx[train_idx]
    y_val_idx = y_idx[val_idx]
    y_test_idx = y_idx[test_idx]

    # vectorizers: fitted on training text only, then applied to val/test
    v_word = TfidfVectorizer(analyzer="word", ngram_range=ngram_range_word, min_df=min_df, max_features=max_features_word, lowercase=True)
    v_char = TfidfVectorizer(analyzer="char", ngram_range=ngram_range_char, min_df=min_df, max_features=max_features_char, lowercase=True)
    Xw_tr = v_word.fit_transform(X_train_text)
    Xc_tr = v_char.fit_transform(X_train_text)
    X_tr = hstack([Xw_tr, Xc_tr], format="csr")
    Xw_va = v_word.transform(X_val_text)
    Xc_va = v_char.transform(X_val_text)
    X_va = hstack([Xw_va, Xc_va], format="csr")
    Xw_te = v_word.transform(X_test_text)
    Xc_te = v_char.transform(X_test_text)
    X_te = hstack([Xw_te, Xc_te], format="csr")

    # class weights: inverse training frequency, rescaled so the weights sum
    # to the number of classes present in the training split
    classes, counts_tr = np.unique(y_train_idx, return_counts=True)
    freq = counts_tr / counts_tr.sum()
    w = 1.0 / np.maximum(freq, 1e-12)
    weight_map = {int(c): w[i]*(len(w)/w.sum()) for i,c in enumerate(classes)}

    clf = SGDClassifier(loss="log_loss", alpha=lr_alpha, learning_rate=learning_rate, eta0=eta0, random_state=random_state)
    classes_all = np.arange(len(unique_ids), dtype=int)
    n_train = X_tr.shape[0]
    best_val_f1 = -1.0
    best_epoch = -1

    for epoch in range(1, epochs+1):
        order = np.arange(n_train)
        rng.shuffle(order)
        for start in range(0, n_train, batch_size):
            end = min(start + batch_size, n_train)
            idx = order[start:end]
            xb = X_tr[idx]
            yb = y_train_idx[idx]
            sw = np.array([weight_map[int(c)] for c in yb], dtype=float)
            if epoch == 1 and start == 0:
                # the very first partial_fit call must be told the full label set
                clf.partial_fit(xb, yb, classes=classes_all, sample_weight=sw)
            else:
                clf.partial_fit(xb, yb, sample_weight=sw)
        # validation
        proba_va = clf.predict_proba(X_va)
        pred_va = np.argmax(proba_va, axis=1)
        acc = accuracy_score(y_val_idx, pred_va)
        macro_f1 = f1_score(y_val_idx, pred_va, average="macro")
        top3 = top_k_accuracy_score(y_val_idx, proba_va, k=min(3, proba_va.shape[1]))
        top5 = top_k_accuracy_score(y_val_idx, proba_va, k=min(5, proba_va.shape[1]))
        if verbose:
            print(f"[Epoch {epoch}] val_acc={acc:.3f} macroF1={macro_f1:.3f} top3={top3:.3f} top5={top5:.3f}")
        if macro_f1 > best_val_f1:
            # new best epoch: checkpoint everything inference needs
            best_val_f1 = macro_f1
            best_epoch = epoch
            dump(clf, clf_path)
            dump(v_word, word_path)
            dump(v_char, char_path)
            with open(label_map_path, "w") as f:
                json.dump([int(x) for x in unique_ids], f)
        if early_stopping and (epoch - best_epoch) >= patience:
            if verbose:
                print(f"[EarlyStopping] no improvement for {patience} epochs (best epoch {best_epoch}). Stop.")
            break

    # test eval on the best checkpoint, not on the last-epoch weights
    clf = load(clf_path)
    proba_te = clf.predict_proba(X_te)
    pred_te = np.argmax(proba_te, axis=1)
    acc_te = accuracy_score(y_test_idx, pred_te)
    macro_f1_te = f1_score(y_test_idx, pred_te, average="macro")
    micro_f1_te = f1_score(y_test_idx, pred_te, average="micro")
    top3_te = top_k_accuracy_score(y_test_idx, proba_te, k=min(3, proba_te.shape[1]))
    top5_te = top_k_accuracy_score(y_test_idx, proba_te, k=min(5, proba_te.shape[1]))

    report = {
        "classes_kept_ge3": int(len(unique_ids)),
        "best_epoch": int(best_epoch),
        "test_accuracy": float(acc_te),
        "test_macro_f1": float(macro_f1_te),
        "test_micro_f1": float(micro_f1_te),
        "test_top3_accuracy": float(top3_te),
        "test_top5_accuracy": float(top5_te),
    }
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)

    # inference script: a standalone file with the artifact paths baked in
    infer_code = f"""#!/usr/bin/env python3
import json, sys
from joblib import load
from scipy.sparse import hstack
word = load(r"{word_path.as_posix()}")
char = load(r"{char_path.as_posix()}")
clf = load(r"{clf_path.as_posix()}")
with open(r"{label_map_path.as_posix()}") as f:
    label_map = json.load(f)
def predict_topk(text, k=5):
    X = hstack([word.transform([text]), char.transform([text])], format='csr')
    p = clf.predict_proba(X)[0]
    idxs = p.argsort()[::-1][:k]
    return [(int(label_map[i]), float(p[i])) for i in idxs]
if __name__ == "__main__":
    txt = " ".join(sys.argv[1:]) or "pelvic pressure and heavy flow"
    print(predict_topk(txt, k=5))
"""
    infer_path.write_text(infer_code)

    return {"report": report, "paths": {
        "report": report_path.as_posix(),
        "clf": clf_path.as_posix(),
        "tfidf_word": word_path.as_posix(),
        "tfidf_char": char_path.as_posix(),
        "label_map": label_map_path.as_posix(),
        "inference_script": infer_path.as_posix(),
    }}

# Run training with a longer schedule than the function defaults
# (25 epochs, patience 10), passing ./symptoms/model/ as save_dir.
out2 = train_softmax_classifier_classwise(
    "./symptoms/training_dataset.csv",
    "./symptoms/model/",
    epochs=25,
    early_stopping=True,
    patience=10,
    verbose=True,
)

# Echo the returned report/paths dict as the cell output.
out2


### Condition classifier inference functionality

In [None]:
import model_inference as mi

# Drive the inference routine by hand: one opening call flagged first_call=True
# with an empty user_text, followed by ten calls that each pass last_ans=False
# (the first of them again with an empty user_text).
# NOTE(review): presumably this simulates a session in which every follow-up
# question is answered "no" -- confirm against model_inference.inference.
mi.inference(user_text="",first_call=True)
mi.inference(user_text="", last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)
mi.inference(last_ans=False)

### Script to generate synthetic data (1000 samples default)

In [None]:
import pandas as pd
import numpy as np
import random
import re
import json
import csv
from pathlib import Path

# Seed both generators with the same value so the generated data is repeatable.
SEED = 8081
random.seed(SEED)
np.random.seed(SEED)

# ---------- Load source ----------
src = Path("./symptoms/symptoms_full.csv")
df = pd.read_csv(src)

# Normalize headers: trim surrounding whitespace and lower-case every name.
df.columns = [name.strip().lower() for name in df.columns]


def _first_matching(columns, predicate):
    """Return the first column name for which predicate is true, else None."""
    for name in columns:
        if predicate(name):
            return name
    return None


# Identify columns by name pattern; the condition-name column is optional.
cond_id_col = _first_matching(df.columns, lambda c: "condition" in c and "id" in c)
cond_name_col = _first_matching(
    df.columns, lambda c: c in ("condition","diagnosis","name","condition_name")
)
subspec_col = _first_matching(df.columns, lambda c: "subspecial" in c)
keywords_col = _first_matching(df.columns, lambda c: "keyword" in c)

if not (cond_id_col and subspec_col and keywords_col):
    raise RuntimeError(f"Required columns not found. Got: {df.columns.tolist()}")

# Clean + helpers: rows lacking an id or subspecialty are dropped, ids become ints.
df = df.dropna(subset=[cond_id_col, subspec_col]).copy()
df[cond_id_col] = df[cond_id_col].astype(int)

def split_keywords(x):
    """Split a delimited keyword cell into a de-duplicated list of keywords.

    ``|`` and ``;`` are treated like commas. Each piece is stripped, empty
    pieces are discarded, and duplicates are removed case-insensitively while
    keeping the first spelling and the original order. A missing value (as
    judged by ``pd.isna``) yields an empty list.
    """
    if pd.isna(x):
        return []
    unified = str(x).translate(str.maketrans("|;", ",,"))
    first_spelling = {}
    for piece in unified.split(","):
        piece = piece.strip()
        if piece:
            # setdefault keeps the earliest spelling for each lower-cased key
            first_spelling.setdefault(piece.lower(), piece)
    return list(first_spelling.values())

# Parsed keyword list per row, plus the condition name when such a column exists.
df["_kw_list"] = df[keywords_col].apply(split_keywords)
if cond_name_col:
    df["_cond_name"] = df[cond_name_col]
else:
    df["_cond_name"] = ""

# build neighbor pools by subspecialty: subspecialty (as str) -> list of its rows
by_sub = {}
for _, row in df.iterrows():
    sub_key = str(row[subspec_col])
    if sub_key not in by_sub:
        by_sub[sub_key] = []
    by_sub[sub_key].append(row)

# ---------- Natural-English rewriting ----------

# Phrase-level paraphrases. rewrite_phrase looks each key up as a substring of
# the lower-cased, whitespace-normalized keyword and, on a match, returns one
# of the listed alternatives chosen at random.
PHRASE_REWRITES = {
    "shortness of breath": ["breathlessness", "I get winded easily", "hard to catch my breath"],
    "chest pain": ["chest tightness", "pressure in my chest", "pain in my chest"],
    "pelvic pain": ["pelvic discomfort", "an ache in my lower abdomen", "cramping in my pelvis"],
    "abdominal pain": ["belly pain", "stomach ache", "abdominal discomfort"],
    "lower back": ["ache in my lower back", "low back soreness", "low back pain"],
    "back pain": ["back ache", "ache in my back", "back soreness"],
    "heavy bleeding": ["very heavy periods", "heavy flow", "soaking pads quickly"],
    "irregular periods": ["my periods are irregular", "unpredictable periods"],
    "painful periods": ["painful cramps", "cramping with my period"],
    "vaginal bleeding": ["vaginal spotting", "unexpected vaginal bleeding"],
    "urinary frequency": ["peeing often", "going to the bathroom a lot"],
    "urinary urgency": ["a sudden urge to pee", "needing to pee right away"],
    "burning urination": ["it burns when I pee", "pain when urinating"],
    "dyspareunia": ["pain with sex", "sex is painful"],
    "nausea": ["feeling queasy", "nauseated"],
    "vomiting": ["throwing up", "vomiting"],
    "diarrhea": ["loose stools", "the runs"],
    "constipation": ["hard to pass stool", "constipation"],
    "bloating": ["feeling bloated", "abdominal swelling"],
    "fatigue": ["low energy", "feeling exhausted", "tired all the time"],
    "dizziness": ["feeling lightheaded", "woozy spells"],
    "headache": ["head pain", "throbbing headaches", "migraine-like pain"],
    "itching": ["itchiness", "persistent itching"],
    "discharge": ["unusual discharge", "unusual fluid"],
    "incontinence": ["leaking urine", "urine leakage"],
    "hematuria": ["blood in my urine", "pink or red urine"],
    "overactive bladder": ["urgent and frequent urination", "OAB-like symptoms"],
    "urinary incontinence": ["leaking urine", "stress incontinence"],
}

# Single-token synonyms. rewrite_phrase consults this table only when no
# PHRASE_REWRITES key matched, and replaces a matching token with probability 0.6.
TOKEN_SWAPS = {
    "pain": ["ache", "discomfort", "soreness"],
    "bleeding": ["spotting", "blood loss"],
    "cramps": ["cramping", "spasms"],
    "swelling": ["bloating", "puffiness"],
    "burning": ["stinging", "burning sensation"],
    "discharge": ["fluid", "secretions"],
    "urgency": ["urgent need", "urge"],
    "frequency": ["frequent", "often"],
}

# Time phrases substituted for the {duration} placeholder of a sentence template;
# one is drawn at random per generated sentence.
DURATIONS = [
    "for a few days", "for two weeks", "for several months", "on and off",
    "since yesterday", "recently", "over the past month"
]
# Context phrases substituted for the {mod} placeholder of a sentence template;
# one is drawn at random per generated sentence.
MODIFIERS = [
    "worse at night", "after exercise", "during my period", "after intercourse",
    "with a low-grade fever", "with nausea", "with low back ache",
    "especially when standing", "when I cough or sneeze", "worse in the morning"
]

# Sentence skeletons for the synthetic user input. Each is filled through
# str.format with kws (the joined keyword phrases), duration (a time phrase)
# and mod (a context phrase); a template may ignore any of the three.
# Duration phrases already carry their own preposition ("for two weeks",
# "since yesterday"), so no template may put another preposition in front of
# {duration} -- the former "For {duration}, ..." produced "For for a few days".
TEMPLATES = [
    "I've been dealing with {kws} {duration}, {mod}.",
    "Lately I've noticed {kws} {duration}.",
    "{kws} {duration}. Should I be concerned?",
    "It's been {kws} {duration}. Any guidance?",
    "I keep getting {kws} {duration}, {mod}.",
    "Having {kws} {duration}. What could this be?",
    "Symptoms include {kws} {duration}.",
    "It's mostly {kws} {duration}, {mod}.",
    "Recently it's been {kws}.",
    "Persistent {kws} {duration}.",
]

# Body locations. rewrite_phrase scans this list in order, for keywords that
# contain "pain", and uses the first location found as a substring of the
# keyword to build phrasings such as "pain in my <loc>".
ANAT_LOCS = [
    "lower back", "pelvis", "pelvic area", "abdomen", "stomach", "chest",
    "throat", "groin", "hip", "knee", "shoulder", "neck", "flank", "side"
]

def _norm(s: str) -> str:
    """Return ``s`` as text, trimmed, lower-cased, with whitespace runs collapsed to one space."""
    cleaned = str(s).strip().lower()
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", cleaned)

def rewrite_phrase(phrase: str) -> str:
    """Return a more conversational variant of a symptom keyword.

    Three strategies are tried in order; the first that applies wins:

    1. Phrase level: if a ``PHRASE_REWRITES`` key occurs in the normalized
       phrase, return a random alternative for it. Keys are considered in
       table order, but a key is skipped when a longer matching key contains
       it, so "urinary incontinence" is not swallowed by "incontinence".
    2. Token level: each whitespace-separated token whose cleaned form is a
       ``TOKEN_SWAPS`` key is replaced with probability 0.6.
    3. Grammar tweak: a phrase containing "pain" is rephrased around the first
       ``ANAT_LOCS`` entry it mentions, or made generic when none is found.

    If nothing applies, the stripped input is returned unchanged.
    """
    base = phrase.strip()
    low = _norm(base)
    # phrase-level
    matches = [k for k in PHRASE_REWRITES if k in low]
    for k in matches:
        # a shorter key embedded in another matching key is the less specific one
        if any(k != other and k in other for other in matches):
            continue
        return random.choice(PHRASE_REWRITES[k])
    # token-level
    toks = base.split()
    changed = False
    for i, tok in enumerate(toks):
        # compare on the lower-cased token with punctuation (except "-") removed
        core = re.sub(r"[^\w-]", "", tok.lower())
        if core in TOKEN_SWAPS and random.random() < 0.6:
            toks[i] = random.choice(TOKEN_SWAPS[core])
            changed = True
    if changed:
        return " ".join(toks)
    # grammar tweak: "<loc> pain" -> "pain in my <loc>"
    if "pain" in low:
        for loc in ANAT_LOCS:
            if loc in low:
                return random.choice([f"pain in my {loc}", f"an ache in my {loc}", f"{loc} pain"])
        return random.choice(["ongoing pain", "persistent pain", base])
    return base

def list_join(items):
    """Join phrases into an English list.

    Returns "" for an empty sequence, the item itself for one item,
    "a and b" for two, and "a, b, and c" (serial comma) for three or more.
    """
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        # no comma before "and" when there are only two items
        return f"{items[0]} and {items[1]}"
    return ", ".join(items[:-1]) + f", and {items[-1]}"

def choose_keywords(base_list, neighbor_pool, kmin=3, kmax=6):
    """Pick a random subset of keywords from a row's own and neighbouring keywords.

    ``base_list`` (the row's keywords) and ``neighbor_pool`` (extra keywords)
    are concatenated and de-duplicated in order; an empty result falls back to
    ``["fatigue", "bloating"]``. A size is drawn uniformly from
    ``[kmin, kmax]``, capped at the number of candidates and raised to 1 if it
    comes out as 0. The candidates are then shuffled and the first ``size``
    are returned.
    """
    candidates = list(dict.fromkeys(base_list + neighbor_pool))
    if len(candidates) == 0:
        candidates = ["fatigue", "bloating"]
    # draw the size before shuffling so the random call order stays the same
    size = min(len(candidates), random.randint(kmin, kmax)) or 1
    random.shuffle(candidates)
    return candidates[:size]

# Build a mapping subspecialty -> pool of other keywords
# subspecialty -> every keyword used by any of its rows, de-duplicated
# case-insensitively with the first spelling and first-seen order kept
sub_kw_pool = {}
for sub, rows in by_sub.items():
    first_spelling = {}
    for rr in rows:
        for word in (rr["_kw_list"] or []):
            first_spelling.setdefault(word.lower(), word)
    sub_kw_pool[sub] = list(first_spelling.values())

# ---------- Create variants: 10 per row ----------
# Number of synthetic sentences generated for each source row.
variants_per_row = 10
rows_out = []

for _idx, r in df.iterrows():
    base_id = int(r[cond_id_col])
    subspec = str(r[subspec_col])
    base_kws = r["_kw_list"] or []
    # Normalized form of this row's keywords. It depends only on the row, so it
    # is built once here rather than once per neighbor keyword and per variant.
    base_set_norm = {_norm(x) for x in base_kws}
    neighbor_pool = sub_kw_pool.get(subspec, [])
    # remove base keywords from neighbor pool to emphasize "other sets"
    neighbor_pool_filtered = [w for w in neighbor_pool if _norm(w) not in base_set_norm]

    for v in range(1, variants_per_row + 1):
        # pick a set of keywords (some from base, some from neighbor pool);
        # between 2 and 6 neighbor keywords are offered, fewer if the pool is small
        n_neighbors = min(len(neighbor_pool_filtered), random.randint(2, 6))
        chosen = choose_keywords(
            base_list=base_kws,
            neighbor_pool=random.sample(neighbor_pool_filtered, k=n_neighbors),
            kmin=3, kmax=6
        )
        # rewrite some to natural English; ensure at least one exact from original base_kws remains
        realized = []
        had_exact_from_base = False
        for j, kw in enumerate(chosen):
            # a base keyword is kept verbatim with probability 0.55 in first
            # position and 0.4 elsewhere; everything else is paraphrased
            if _norm(kw) in base_set_norm and random.random() < (0.55 if j == 0 else 0.4):
                realized.append(kw)
                had_exact_from_base = True
            else:
                realized.append(rewrite_phrase(kw))
        if not had_exact_from_base and base_kws:
            # nothing survived verbatim: force the row's first keyword into slot 0
            realized[0] = base_kws[0]

        # build user_input
        sentence = random.choice(TEMPLATES).format(
            kws=list_join(realized),
            duration=random.choice(DURATIONS),
            mod=random.choice(MODIFIERS)
        )
        rows_out.append({
            "source_condition_id": base_id,
            "source_subspecialty": subspec,
            "variant_index": v,
            "variant_keywords": json.dumps(chosen),
            "user_input": sentence
        })

# Save to CSV
out_path = Path("./symptoms/keyword_sets_user_inputs.csv")
fieldnames = [
    "source_condition_id",
    "source_subspecialty",
    "variant_index",
    "variant_keywords",
    "user_input",
]
# Every field is quoted; newline="" leaves line endings to the csv module.
with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    for record in rows_out:
        writer.writerow(record)

# Preview a slice (first 25 generated rows)
preview = pd.DataFrame(rows_out).head(25)

# Echo the output location as the cell result.
out_path.as_posix()


## MOVING FORWARD IN MORNING

TO DO FOR LOGAN OR FOR BACKEND:
- Create subspecialty to subspecialty_key csv file to force static index all the time.
- Instantiate algorithm for subspecialty selection (Vector of percent): where $c_i$ is some predicted condition from user input
$$\text{subspecialty\_chance}_j = \sum_{i=0}^{k} c_i \, \mathbb{1}\{b(i)=j\}, \quad \forall j \in [1,s]$$
- Instantiate algorithm for doctor selection (vector of percent): where $\vec{d}_i$ is the experience level of each doctor on some given condition $i$
 $$\text{doctor\_chance}_j = \ln\left(\sum_{i=0}^{k} d_j^{(c_i)}\right)$$
- Consider power sharpening these
- Make a SINGLE function that takes the user input string and converts it into a subspecialty prediction vector and a doctor prediction vector
#### Stretch goals
- MCPT on data
- various visualizations of model rate of learning
- all for the construction of "more info" page
- Build lifting stuff?

In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('./data/symptoms_full.csv')
sspec_data = pd.read_csv('./data/sspec_key_map.csv')

sympt = data.values
sspec = sspec_data.values

# Replace every value in column 1 of the symptoms table that equals a key-map
# row's column 2 with that row's column 0. All rows of the key map are applied,
# so the mapping no longer assumes exactly six subspecialties.
# NOTE(review): presumably column 2 is the subspecialty name and column 0 its
# key -- confirm against sspec_key_map.csv.
for key_row in sspec:
    rows = np.where(sympt[:, 1] == key_row[2])[0]
    sympt[rows, 1] = key_row[0]

print(sympt)

new_df = pd.DataFrame(data=sympt, columns=data.columns)
# NOTE(review): if column 1 is itself named 'subspecialty', this drop discards
# the keys written above -- confirm the column layout of symptoms_full.csv.
new_df = new_df.drop(columns=['subspecialty'])
new_df.to_csv('./data/symptoms_full_i.csv', index=False)