In [None]:
# !pip install torch
print("hello")

In [None]:
import numpy as np
import pandas as pd

# ------------- 1) pick the actions you want to compare -------------
ACTIONS = ["A_dose1", "A_dose2", "A_dose3", "A_dose4", "A_vax_any"]  # you already did A_booster

# ------------- 2) choose your outcome & alignment params -------------
OUTCOME_NAME = "pasc_score"         # or "depression_score"
OUTCOME_DATE = "date"               # the date col of the outcome file (e.g., pasc_df["date"])
OUTCOME_MODE = "delta_next"         # or "next_value"
MAX_FORWARD_DAYS = 120              # None to disable
ID_COL = "PARTICIPANT_ID"
VISIT_COL = "VISIT_START_DATE"
def zscore_Y(Y_list):
            all_y = np.concatenate([y for y in Y_list])
            mu, sd = all_y.mean(), all_y.std() or 1.0
            return [ (y - mu)/sd for y in Y_list ]
        # Then call run_cknn_numpy on the standardized Y_list_std instead of Y_list.
def build_A_list_in_same_order(X_visit_df, A_col, id_col="PARTICIPANT_ID", date_col="VISIT_START_DATE"):
    # patient order = first appearance order in X_visit_df (stable)
    ids = X_visit_df[id_col].drop_duplicates().tolist()
    g = X_visit_df[[id_col, date_col, A_col]].copy()
    g[date_col] = pd.to_datetime(g[date_col], errors="coerce")
    A_list = []
    for pid in ids:
        sub = g[g[id_col]==pid].sort_values(date_col)
        A_list.append(sub[A_col].astype("int64").to_numpy())
    return np.array(A_list, dtype=object)

# ------------- 3) seeds for mean ± CI -------------
SEEDS = [1,2,3,4,5,6,7,8,9,10]

# ------------- 4) helper: (X,A,Y) builder you used earlier -------------
# make sure this function is in scope (the cleaned, robust version you ended up with).
# def build_triplets_from_XA_and_outcome(...):  # <- from our earlier message

# ------------- 5) helper: run_cknn_numpy you already have -------------
# def run_cknn_numpy(X_list, A_list, Y_list, d_latent=16, seed=1337, k=50, temp=1.0)

# ------------- 6) main evaluation loop -------------
rows = []
for A_col in ACTIONS:
    for s in SEEDS:
        # Rebuild triplets for THIS action (robust and alignment-safe)
        X_list, A_list, Y_list, X_cols = build_triplets_from_XA_and_outcome(
            X_visit=X_visit_with_A,           # your per-visit table that contains all A_* columns & features
            outcome_df=pasc_df,               # or depression_df
            id_col=ID_COL,
            visit_col=VISIT_COL,
            A_col=A_col,                      # <-- iterate action here
            outcome_col=OUTCOME_NAME,
            outcome_date_col=OUTCOME_DATE,
            outcome_mode=OUTCOME_MODE,
            max_forward_days=MAX_FORWARD_DAYS
        )
        
        res = run_cknn_numpy(X_list, A_list, Y_list, d_latent=16, seed=s, k=50, temp=1.0)
        rows.append({"action": A_col, "seed": s, "mse": res["mse"], "dr_value": res["dr_value"]})

df = pd.DataFrame(rows)

# ------------- 7) summarize with mean ± 95% CI -------------
def summarize(group, col):
    mean = group[col].mean()
    sem  = group[col].std(ddof=1) / np.sqrt(len(group))
    ci95 = 1.96 * sem
    return pd.Series({f"{col}_mean": mean, f"{col}_ci95": ci95})

summary = (df.groupby("action")
             .apply(lambda g: pd.concat([summarize(g, "mse"), summarize(g, "dr_value")]))
             .reset_index())

print(summary.sort_values("action"))
# Optionally save:
summary.to_csv("cknn_vax_actions_summary.csv", index=False)


In [34]:
import math, random
import numpy as np

# =============== Utilities ===============

def add_bias(X):
    return np.concatenate([X, np.ones((X.shape[0],1), dtype=X.dtype)], axis=1)

def ridge_regression_fit(X, y, l2=1e-2):
    XT = X.T
    A = XT @ X
    I = np.eye(A.shape[0], dtype=X.dtype)
    I[-1, -1] = 0.0  # don't regularize bias
    w = np.linalg.solve(A + l2*I, XT @ y)
    return w

def ridge_regression_predict(W, X):
    return X @ W

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logistic_regression_fit(X, y, l2=1e-3, lr=0.1, epochs=300, batch=2048, seed=1337):
    rng = np.random.default_rng(seed)
    N, D = X.shape
    w = rng.normal(0, 0.01, size=(D,)).astype(np.float32)
    y = y.astype(np.float32)
    for ep in range(epochs):
        perm = rng.permutation(N)
        for s in range(0, N, batch):
            e = min(N, s+batch)
            xb = X[perm[s:e]]; yb = y[perm[s:e]]
            p = sigmoid(xb @ w)
            # gradient (+ L2 on non-bias)
            grad = (xb.T @ (p - yb)) / (e - s)
            grad[:-1] += l2 * w[:-1]
            w -= lr * grad
    return w

def logistic_regression_predict_proba(W, X):
    return sigmoid(X @ W)

def flatten_time(phi, A, Y):
    """phi: (n,T,d) object -> (N,d), A,Y -> (N,)"""
    n = len(phi)
    ds = [p.shape[0] for p in phi]
    d = phi[0].shape[1]
    Ntot = sum(ds)
    out_phi = np.zeros((Ntot, d), dtype=np.float32)
    out_A = np.zeros((Ntot,), dtype=np.int64)
    out_Y = np.zeros((Ntot,), dtype=np.float32)
    idx = 0
    for i in range(n):
        T = phi[i].shape[0]
        out_phi[idx:idx+T] = phi[i]
        out_A[idx:idx+T] = A[i]
        out_Y[idx:idx+T] = Y[i]
        idx += T
    return out_phi, out_A, out_Y

# =============== History features (no torch) ===============

def build_history_features(X_obj, A_obj, Y_obj, d_latent=16, seed=1337, alpha=0.5):
    """
    X_obj[i]: (T_i, d_x) features per visit *before* treatment
    Returns Phi[i]: (T_i, d_latent)
    Idea: concat [x_t, a_{t-1}, y_{t-1}, EMA(x_{<t}), cum-mean(y_{<t})] -> random projection + tanh
    """
    rng = np.random.default_rng(seed)
    N = len(X_obj)
    d_x = X_obj[0].shape[1]

    d_raw = d_x + 1 + 1 + d_x + 1
    W = rng.normal(0, 1/np.sqrt(d_raw), size=(d_raw, d_latent)).astype(np.float32)
    b = rng.normal(0, 0.1, size=(d_latent,)).astype(np.float32)

    Phi = []
    for i in range(N):
        Xi = X_obj[i]; Ai = A_obj[i]; Yi = Y_obj[i]
        T = Xi.shape[0]
        phi_i = np.zeros((T, d_latent), dtype=np.float32)
        x_ema = np.zeros((d_x,), dtype=np.float32)
        y_csum = 0.0
        for t in range(T):
            x_t = Xi[t]
            a_prev = Ai[t-1] if t>0 else 0.0
            y_prev = Yi[t-1] if t>0 else 0.0
            if t>0:
                x_ema = alpha * x_ema + (1-alpha) * Xi[t-1]
                y_csum += Yi[t-1]
                y_cmean = y_csum / t
            else:
                x_ema = np.zeros_like(x_t)
                y_cmean = 0.0
            raw = np.concatenate([x_t, [a_prev], [y_prev], x_ema, [y_cmean]]).astype(np.float32)
            phi_i[t] = np.tanh(raw @ W + b)
        Phi.append(phi_i)
    return np.array(Phi, dtype=object)

# =============== ANN (HNSW or exact kNN fallback) ===============

try:
    import hnswlib
    class ANNIndex:
        def __init__(self, dim, space="l2", ef=100, M=32):
            self.index = hnswlib.Index(space=space, dim=dim)
            self.ef = ef; self.M = M; self.built = False
        def build(self, X, ids=None):
            if ids is None: ids = np.arange(X.shape[0])
            self.index.init_index(max_elements=X.shape[0], ef_construction=self.ef, M=self.M)
            self.index.add_items(X.astype(np.float32), ids.astype(np.int64))
            self.index.set_ef(self.ef); self.built = True
        def query(self, Q, k=50):
            return self.index.knn_query(Q.astype(np.float32), k=k)
except Exception:
    # exact kNN fallback (no dependency)
    class ANNIndex:
        def __init__(self, dim, space="l2", **kwargs):
            self.X = None
        def build(self, X, ids=None):
            self.X = X.astype(np.float32)
        def query(self, Q, k=50):
            # squared l2
            d2 = ((Q[:,None,:] - self.X[None,:,:])**2).sum(axis=2)
            idx = np.argsort(d2, axis=1)[:, :k]
            d  = np.take_along_axis(d2, idx, axis=1)
            return idx, d

# =============== DR-kNN estimator ===============

def dr_knn_mu(phi_tr, A_tr, Y_tr, query_phi, a_query, out_W, prop_W, k=50, temp=1.0):
    """
    Double-robust kNN: neighbors on phi among train rows with A==a_query.
    out_W: ridge on [phi, onehot(a)](+bias)
    prop_W: logistic on phi(+bias)
    """
    mask_a = (A_tr == a_query)
    phi_a = phi_tr[mask_a]
    ann = ANNIndex(dim=phi_tr.shape[1])
    ann.build(phi_a)

    def design_outcome(phi, a_vec):
        a0 = (a_vec==0).astype(np.float32).reshape(-1,1)
        a1 = (a_vec==1).astype(np.float32).reshape(-1,1)
        Z = np.concatenate([phi, a0, a1], axis=1)
        return add_bias(Z)

    def m_pred(phi, a_scalar):
        Z = design_outcome(phi, np.full((phi.shape[0],), a_scalar, dtype=np.int64))
        return ridge_regression_predict(out_W, Z)

    def e_pred(phi):
        return logistic_regression_predict_proba(prop_W, add_bias(phi))

    labels, dists = ann.query(query_phi, k=min(k, len(phi_a)))
    Wker = np.exp(-dists / max(temp, 1e-6))
    Wker /= (Wker.sum(axis=1, keepdims=True) + 1e-8)

    # train-side predictions once
    e_train = e_pred(phi_tr)
    e_train = e_train if a_query==1 else (1.0 - e_train)
    m_train_a = m_pred(phi_tr, a_query)

    mu = np.zeros(query_phi.shape[0], dtype=np.float32)
    # indices of global train rows corresponding to A==a_query
    idx_map = np.where(mask_a)[0]
    for i in range(query_phi.shape[0]):
        local = labels[i]
        global_idx = idx_map[local]
        w = Wker[i]

        res = Y_tr[global_idx] - m_train_a[global_idx]
        e_neighbors = e_train[global_idx]
        iw = 1.0 / np.clip(e_neighbors, 1e-3, 1-1e-3)
        w_tilde = w * iw; w_tilde /= (w_tilde.sum() + 1e-8)

        m_q = m_pred(query_phi[i:i+1], a_query)[0]
        mu[i] = m_q + np.sum(w_tilde * res)
    return mu

# =============== Runner: pass your lists here ===============

def run_cknn_numpy(X_list, A_list, Y_list, d_latent=16, seed=1337, k=50, temp=1.0):
    rng = np.random.default_rng(seed)
    n = len(X_list)
    idx = np.arange(n); rng.shuffle(idx)
    n_te = max(1, int(0.2*n))
    te_idx, tr_idx = idx[:n_te], idx[n_te:]

    # object arrays (ragged ok)
    X_obj = np.array([np.asarray(x, dtype=np.float32) for x in X_list], dtype=object)
    A_obj = np.array([np.asarray(a, dtype=np.int64) for a in A_list], dtype=object)
    Y_obj = np.array([np.asarray(y, dtype=np.float32) for y in Y_list], dtype=object)

    # history embeddings
    Phi = build_history_features(X_obj, A_obj, Y_obj, d_latent=d_latent, seed=seed)
    Phi_tr = Phi[tr_idx]; A_tr = A_obj[tr_idx]; Y_tr = Y_obj[tr_idx]
    Phi_te = Phi[te_idx]; A_te = A_obj[te_idx]; Y_te = Y_obj[te_idx]

    # flatten to visit-level
    phi_tr_flat, A_tr_flat, Y_tr_flat = flatten_time(Phi_tr, A_tr, Y_tr)
    phi_te_flat, A_te_flat, Y_te_flat = flatten_time(Phi_te, A_te, Y_te)

    # fit outcome model: y ~ [phi, onehot(a)] + bias
    a0 = (A_tr_flat==0).astype(np.float32).reshape(-1,1)
    a1 = (A_tr_flat==1).astype(np.float32).reshape(-1,1)
    Z_out_tr = np.concatenate([phi_tr_flat, a0, a1], axis=1)
    out_W = ridge_regression_fit(add_bias(Z_out_tr), Y_tr_flat, l2=1e-2)

    # fit propensity: a ~ phi + bias
    prop_W = logistic_regression_fit(add_bias(phi_tr_flat), A_tr_flat.astype(np.float32),
                                     l2=1e-3, lr=0.1, epochs=300, batch=2048, seed=seed)

    # DR-kNN mu(a|H) on test
    mu0 = dr_knn_mu(phi_tr_flat, A_tr_flat, Y_tr_flat, phi_te_flat, a_query=0,
                    out_W=out_W, prop_W=prop_W, k=k, temp=temp)
    mu1 = dr_knn_mu(phi_tr_flat, A_tr_flat, Y_tr_flat, phi_te_flat, a_query=1,
                    out_W=out_W, prop_W=prop_W, k=k, temp=temp)

    # evaluation proxy: observed-treatment MSE
    y_pred_obs = np.where(A_te_flat==1, mu1, mu0)
    mse = float(np.mean((Y_te_flat - y_pred_obs)**2))

    # simple policy: treat if mu1 > mu0, DR value estimate
    policy = (mu1 > mu0).astype(np.int64)
    # propensity for observed A on test
    e_te = logistic_regression_predict_proba(prop_W, add_bias(phi_te_flat))
    e_te = np.where(A_te_flat==1, e_te, 1.0 - e_te)
    # m_obs (outcome regression at observed A)
    a0_te = (A_te_flat==0).astype(np.float32).reshape(-1,1)
    a1_te = (A_te_flat==1).astype(np.float32).reshape(-1,1)
    Z_out_te = np.concatenate([phi_te_flat, a0_te, a1_te], axis=1)
    m_obs = ridge_regression_predict(out_W, add_bias(Z_out_te))
    dr_value = float(np.mean(((policy==A_te_flat)/np.clip(e_te,1e-3,1-1e-3)) * (Y_te_flat - m_obs)
                             + (policy*mu1 + (1-policy)*mu0)))

    return {
        "mse": mse,
        "dr_value": dr_value,
        "mu0": mu0, "mu1": mu1,
        "A_te": A_te_flat, "Y_te": Y_te_flat
    }
def knn_mu(phi_tr, A_tr, Y_tr, query_phi, a_query, k=50, temp=1.0):
    mask = (A_tr==a_query)
    X = phi_tr[mask]; y = Y_tr[mask]
    # exact kNN if no hnswlib
    d2 = ((query_phi[:,None,:]-X[None,:,:])**2).sum(2)
    idx = np.argsort(d2,1)[:, :min(k, len(X))]
    d  = np.take_along_axis(d2, idx, axis=1)
    W = np.exp(-d/ max(temp,1e-6)); W /= (W.sum(1,keepdims=True)+1e-8)
    return (W * y[idx]).sum(1)
def tlearner():
    # train
    Z0, y0 = phi_tr_flat[A_tr_flat==0], Y_tr_flat[A_tr_flat==0]
    Z1, y1 = phi_tr_flat[A_tr_flat==1], Y_tr_flat[A_tr_flat==1]
    W0 = ridge_regression_fit(add_bias(Z0), y0, l2=1e-2)
    W1 = ridge_regression_fit(add_bias(Z1), y1, l2=1e-2)
    # predict potential outcomes on test
    mu0 = ridge_regression_predict(W0, add_bias(phi_te_flat))
    mu1 = ridge_regression_predict(W1, add_bias(phi_te_flat))
    # compute MSE & DR value as before
def msm():
    e = logistic_regression_predict_proba(prop_W, add_bias(phi_te_flat))
    e_obs = np.where(A_te_flat==1, e, 1-e)
    # simple (non-sequential) stabilized weight per visit
    pA = A_te_flat.mean()  # crude marginal; or per-time-bin marginal
    num = np.where(A_te_flat==1, pA, 1-pA)
    w = num / np.clip(e_obs,1e-3,1-1e-3)
    # weighted outcome difference
    tau_msm = np.average(Y_te_flat, weights=w*(A_te_flat==1)) - np.average(Y_te_flat, weights=w*(A_te_flat==0))
# ================== Example call ==================
# results = run_cknn_numpy(X_list, A_list, Y_list, d_latent=16, seed=1337, k=50, temp=1.0)
# print("Observed-treatment MSE:", results["mse"])
# print("DR value (treat if mu1>mu0):", results["dr_value"])
seeds = [1,2,3,4,5,6,7,8,9,10]
rows = []
for s in seeds:
    res = run_cknn_numpy(X_list, A_list, Y_list, d_latent=16, seed=s, k=50, temp=1.0)
    rows.append((s, res["mse"], res["dr_value"]))
    arr = np.array([[r[1], r[2]] for r in rows], dtype=float)
    m_mse, m_dr = arr.mean(0)
    s_mse, s_dr = arr.std(0, ddof=1)
    print(f"MSE: {m_mse:.4f} ± {1.96*s_mse/np.sqrt(len(seeds)):.4f}")
    print(f"DR value: {m_dr:.4f} ± {1.96*s_dr/np.sqrt(len(seeds)):.4f}")


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = um.true_divide(


MSE: 2.5270 ± nan
DR value: 0.0441 ± nan
MSE: 2.7286 ± 0.1767
DR value: 0.0966 ± 0.0461
MSE: 2.8893 ± 0.2130
DR value: 0.1570 ± 0.0726
MSE: 2.9185 ± 0.1776
DR value: 0.1746 ± 0.0631
MSE: 2.8543 ± 0.1777
DR value: 0.1484 ± 0.0656
MSE: 2.8340 ± 0.1619
DR value: 0.1602 ± 0.0614
MSE: 2.7869 ± 0.1667
DR value: 0.1831 ± 0.0674
MSE: 2.7830 ± 0.1545
DR value: 0.1893 ± 0.0634
MSE: 2.8206 ± 0.1606
DR value: 0.1843 ± 0.0600
MSE: 2.8211 ± 0.1514
DR value: 0.1837 ± 0.0566


In [3]:
import pandas as pd
import numpy as np
import re

def _to_dt(s):
    return pd.to_datetime(s, errors="coerce", utc=False).dt.tz_localize(None)

def _same_month(a, b):
    a = pd.to_datetime(a, errors="coerce"); b = pd.to_datetime(b, errors="coerce")
    return (a.dt.year == b.dt.year) & (a.dt.month == b.dt.month)

def add_vax_actions_to_visits(
    X_visit: pd.DataFrame,
    vax_dates: pd.DataFrame,
    id_col="PARTICIPANT_ID",
    visit_col="VISIT_START_DATE",
    booster_doses=(3,4,5)  # which doses count as "booster"
) -> pd.DataFrame:
    """
    vax_dates: wide table with columns like:
      PARTICIPANT_ID, vacc_vaccdt_1_date, vacc_vaccdt_2_date, ... (up to 5)
    Returns X_visit with new columns:
      A_dose1..A_doseK, A_vax_any, A_booster, months_since_last_vaccine
    """
    X = X_visit.copy()
    V = vax_dates.copy()

    # parse visit date
    X[visit_col] = _to_dt(X[visit_col])

    # find dose date columns in vax_dates
    dose_cols = []
    dose_idx = []
    for c in V.columns:
        m = re.search(r'(\d+)', c)
        if c.endswith("_date") and m:
            dose_cols.append(c)
            dose_idx.append(int(m.group(1)))
    # sort consistently by dose number
    dose_cols = [dc for _, dc in sorted(zip(dose_idx, dose_cols))]
    dose_numbers = sorted(dose_idx)

    # parse dose dates
    for c in dose_cols:
        V[c] = _to_dt(V[c])

    # merge dose dates onto each visit
    M = X.merge(V[[id_col] + dose_cols], on=id_col, how="left")

    # per-dose actions A_dosek
    for k, c in zip(dose_numbers, dose_cols):
        M[f"A_dose{k}"] = _same_month(M[c], M[visit_col]).astype("int8")

    # composites
    a_cols = [f"A_dose{k}" for k in dose_numbers]
    M["A_vax_any"] = (M[a_cols].sum(axis=1) > 0).astype("int8")

    booster_cols = [f"A_dose{k}" for k in dose_numbers if k in booster_doses]
    if booster_cols:
        M["A_booster"] = (M[booster_cols].sum(axis=1) > 0).astype("int8")
    else:
        M["A_booster"] = np.int8(0)

    # months_since_last_vaccine (<= visit date)
    def months_since(row):
        ref = row[visit_col]
        if pd.isna(ref): return np.float32(999.0)
        # consider any dose date <= visit date
        dates = [row[c] for c in dose_cols if pd.notna(row[c]) and row[c] <= ref]
        if not dates: return np.float32(999.0)
        last = max(dates)
        return np.float32((ref - last).days / 30.0)

    M["months_since_last_vaccine"] = M.apply(months_since, axis=1)

    return M
import pandas as pd
X_visit = pd.read_csv('X_visits.csv')
vax_dates = pd.read_csv('vax_dates.csv')

X_visit_with_A = add_vax_actions_to_visits(X_visit, vax_dates)
print("done")

In [None]:
# for x in X_visit_with_A.columns:
#     print(x,end=' ')
print("hello")

In [6]:
# X_visit_with_A.shape(158685, 670)
X_visit_with_A.to_csv('X_visit_with_A.csv',index=False)


In [28]:
# pasc_df = pd.read_csv('y_pasc_score2024.csv')
pasc_df.head()

Unnamed: 0,PARTICIPANT_ID,date,pasc_score
0,RA11305,2021-01-01,16.0
1,RA11305,2021-04-03,14.0
2,RA11305,2021-06-28,8.0
3,RA11305,2021-09-28,13.0
4,RA11305,2021-12-23,10.0


In [10]:
pasc_df.rename({"pasc_score_2024":"pasc_score"},axis="columns",inplace=True)

In [None]:

import os, math, random, time, json
from dataclasses import dataclass
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import hnswlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# ------------------------------
# 1) Synthetic sequential data
# ------------------------------

def make_synthetic_longitudinal(N=1000, T=5, d_x=6, seed=1337):
    """
    Generate synthetic longitudinal data with confounding.
    Each patient i has time-varying covariates X_{i,t}, treatment A_{i,t} in {0,1},
    and outcome Y_{i,t}. Treatment depends on history; outcomes depend on both
    latent health and treatment.
    """
    rng = np.random.default_rng(seed)
    # latent "health" state per patient, evolves
    Z = rng.normal(0, 1, size=(N,))  # baseline health
    
    X = []
    A = []
    Y = []
    # create d_x covariates; first few are confounders linked to Z and past Y
    for i in range(N):
        Xi = []
        Ai = []
        Yi = []
        z = Z[i]
        y_prev = 0.0
        for t in range(T):
            # covariates: some depend on z and y_prev
            x = rng.normal(0,1,size=(d_x,))
            x[0] += 0.8*z
            x[1] += 0.6*y_prev
            x[2] += 0.4*z*y_prev
            
            # clinician policy: treat if (z + y_prev + x0) high, plus noise
            logits = 0.8*z + 0.7*y_prev + 0.5*x[0] + rng.normal(0,0.5)
            a = (logits > 0.0).astype(int)
            
            # outcome model: future health depends on current treatment and covariates
            # true treatment effect heterogeneity: beneficial if z is low (sicker)
            tau = 0.8 - 0.6*max(z, 0)  # smaller effect for healthier
            noise = rng.normal(0, 0.5)
            y = (0.6*y_prev + 0.5*x[0] + 0.3*x[1] + tau*a + 0.3*z + noise)
            
            Xi.append(x.astype(np.float32))
            Ai.append(int(a))
            Yi.append(float(y))
            
            y_prev = y
        
        X.append(np.array(Xi, dtype=np.float32))  # (T, d_x)
        A.append(np.array(Ai, dtype=np.int64))    # (T,)
        Y.append(np.array(Yi, dtype=np.float32))  # (T,)
    return np.array(X, dtype=object), np.array(A, dtype=object), np.array(Y, dtype=object)

# -----------------------------------------
# 2) Models: GRU encoder, outcome, policy
# -----------------------------------------

class GRUEncoder(nn.Module):
    def __init__(self, d_x, d_hidden=64, d_latent=16):
        super().__init__()
        self.gru = nn.GRU(input_size=d_x+2, hidden_size=d_hidden, batch_first=True)
        self.proj = nn.Linear(d_hidden, d_latent)
    def forward(self, X, A, Y):
        """
        X: (B,T,d_x), A: (B,T), Y: (B,T)
        Build history embedding for each time step t using prefix 0..t-1.
        We do teacher-forcing: shift A,Y by one with zeros at t=0.
        Returns embeddings Phi of shape (B,T,d_latent).
        """
        B,T,d_x = X.shape
        A_shift = torch.zeros_like(A)
        Y_shift = torch.zeros_like(Y)
        if T>1:
            A_shift[:,1:] = A[:,:-1].float()
            Y_shift[:,1:] = Y[:,:-1].float()
        inp = torch.cat([X, A_shift.unsqueeze(-1), Y_shift.unsqueeze(-1)], dim=-1)
        h, _ = self.gru(inp)  # (B,T,d_hidden)
        phi = self.proj(h)    # (B,T,d_latent)
        return phi

class MLP(nn.Module):
    def __init__(self, d_in, d_out=1, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, d_out)
        )
    def forward(self, x):
        return self.net(x)

def outcome_forward(outcome_net, phi, a):
    """
    phi: (B,T,d_latent), a: (B,T) int {0,1}
    returns preds (B,T)
    """
    B,T,d = phi.shape
    a_one = F.one_hot(a, num_classes=2).float()  # (B,T,2)
    x = torch.cat([phi, a_one], dim=-1).view(B*T, d+2)
    y = outcome_net(x).view(B,T)
    return y

def propensity_forward(prop_net, phi):
    """
    phi: (B,T,d_latent)
    returns p(a=1 | phi) in (B,T)
    """
    B,T,d = phi.shape
    x = phi.reshape(B*T, d)
    logit = prop_net(x).view(B,T)
    p1 = torch.sigmoid(logit)
    return p1

# -------------------------------
# 3) Losses: predictive + balance
# -------------------------------

def mmd_loss(res_treated, res_control, sigma=1.0):
    """Simple RBF-kernel MMD between 1D residual samples."""
    # res_*: (n,)
    def rbf(x, y):
        xx = x.unsqueeze(1)
        yy = y.unsqueeze(0)
        return torch.exp(-(xx-yy)**2/(2*sigma**2))
    Ktt = rbf(res_treated, res_treated).mean()
    Kcc = rbf(res_control, res_control).mean()
    Ktc = rbf(res_treated, res_control).mean()
    return Ktt + Kcc - 2*Ktc

# --------------------------------------------
# 4) Training with predictive + balancing loss
# --------------------------------------------

@dataclass
class TrainConfig:
    d_x: int = 6
    T: int = 5
    d_hidden: int = 64
    d_latent: int = 16
    hidden: int = 64
    lr: float = 1e-3
    epochs: int = 10
    batch_size: int = 64
    lam_mmd: float = 0.1
    seed: int = 1337

def train_cknn_lsh(X, A, Y, cfg: TrainConfig):
    torch.manual_seed(cfg.seed); np.random.seed(cfg.seed); random.seed(cfg.seed)
    N = len(X)
    idx = np.arange(N)
    tr_idx, te_idx = train_test_split(idx, test_size=0.2, random_state=cfg.seed)
    
    # pack into tensors
    def pack(idxs):
        # pad to uniform T if variable-length; here T is fixed
        Xb = torch.tensor(np.stack([X[i] for i in idxs], axis=0)) # (B,T,d_x)
        Ab = torch.tensor(np.stack([A[i] for i in idxs], axis=0)) # (B,T)
        Yb = torch.tensor(np.stack([Y[i] for i in idxs], axis=0)) # (B,T)
        return Xb, Ab, Yb
    Xtr, Atr, Ytr = pack(tr_idx)
    Xte, Ate, Yte = pack(te_idx)
    
    enc = GRUEncoder(cfg.d_x, cfg.d_hidden, cfg.d_latent)
    out_net = MLP(cfg.d_latent+2, 1, cfg.hidden)
    prop_net = MLP(cfg.d_latent, 1, cfg.hidden)
    params = list(enc.parameters()) + list(out_net.parameters()) + list(prop_net.parameters())
    opt = optim.Adam(params, lr=cfg.lr)
    
    B = Xtr.shape[0]
    steps_per_epoch = math.ceil(B / cfg.batch_size)
    
    for epoch in range(cfg.epochs):
        perm = torch.randperm(B)
        Xtr_sh = Xtr[perm]; Atr_sh = Atr[perm]; Ytr_sh = Ytr[perm]
        epoch_loss = 0.0
        for step in range(steps_per_epoch):
            s = step*cfg.batch_size; e = min((step+1)*cfg.batch_size, B)
            x = Xtr_sh[s:e]; a = Atr_sh[s:e]; y = Ytr_sh[s:e]
            phi = enc(x,a,y)               # (b,T,d_latent)
            yhat = outcome_forward(out_net, phi, a)  # (b,T)
            p1 = propensity_forward(prop_net, phi)   # (b,T)
            
            # Predictive loss (MSE + BCE for observed actions)
            loss_pred = F.mse_loss(yhat, y)
            # action loglik
            logp = a.float()*torch.log(p1+1e-6) + (1-a).float()*torch.log(1-p1+1e-6)
            loss_act = -logp.mean()
            
            # Balancing: MMD of residuals across treatment groups in the minibatch
            res = (y - yhat).detach()
            # pool across time
            a_vec = a.reshape(-1)
            res_vec = res.reshape(-1)
            if (a_vec==1).sum()>1 and (a_vec==0).sum()>1:
                mmd = mmd_loss(res_vec[a_vec==1], res_vec[a_vec==0])
            else:
                mmd = torch.tensor(0.0)
            
            loss = loss_pred + 0.1*loss_act + cfg.lam_mmd*mmd
            opt.zero_grad()
            loss.backward()
            opt.step()
            epoch_loss += loss.item()
        print(f"[Epoch {epoch+1}/{cfg.epochs}] loss={epoch_loss/steps_per_epoch:.4f}")
    
    # Build embeddings for train and test
    with torch.no_grad():
        phi_tr = enc(Xtr, Atr, Ytr).numpy()   # (n_tr,T,d_latent)
        phi_te = enc(Xte, Ate, Yte).numpy()   # (n_te,T,d_latent)
        # also store predictions for outcome models
        yhat_tr = outcome_forward(out_net, torch.tensor(phi_tr), Atr).numpy()
        p1_tr = propensity_forward(prop_net, torch.tensor(phi_tr)).numpy()
    
    model = {"enc": enc, "out_net": out_net, "prop_net": prop_net}
    data = {"Xtr": Xtr, "Atr": Atr, "Ytr": Ytr, "Xte": Xte, "Ate": Ate, "Yte": Yte,
            "phi_tr": phi_tr, "phi_te": phi_te, "yhat_tr": yhat_tr, "p1_tr": p1_tr,
            "tr_idx": tr_idx, "te_idx": te_idx}
    return model, data

# ------------------------------------
# 5) ANN index and DR-kNN estimator
# ------------------------------------

class ANNIndex:
    def __init__(self, dim, space="l2", ef=100, M=32):
        self.index = hnswlib.Index(space=space, dim=dim)
        self.ef = ef
        self.M = M
        self.built = False
    
    def build(self, X, ids=None):
        num = X.shape[0]
        if ids is None:
            ids = np.arange(num)
        self.index.init_index(max_elements=num, ef_construction=self.ef, M=self.M)
        self.index.add_items(X.astype(np.float32), ids.astype(np.int64))
        self.index.set_ef(self.ef)
        self.built = True
    
    def query(self, q, k=50):
        labels, dists = self.index.knn_query(q.astype(np.float32), k=k)
        return labels, dists

def flatten_time(phi, A, Y):
    """
    Flatten sequences into visit-level rows.
    phi: (n,T,d), A,Y: (n,T)
    returns arrays of shape (n*T, d), (n*T,), (n*T,)
    """
    n,T,d = phi.shape
    return phi.reshape(n*T, d), A.reshape(n*T), Y.reshape(n*T)

def dr_knn_mu(enc, out_net, prop_net, phi_tr, A_tr, Y_tr, query_phi, a_query, k=50, temp=1.0):
    """
    Estimate mu(a|H) for each query row using DR-kNN.
    - Build separate ANN per treatment value for clarity.
    """
    # Split train by treatment
    mask_a = (A_tr == a_query)
    phi_a = phi_tr[mask_a]
    Y_a = Y_tr[mask_a]
    
    # ANN on phi_a
    dim = phi_tr.shape[1]
    ann = ANNIndex(dim=dim)
    ann.build(phi_a, np.arange(len(phi_a)))
    
    # outcome and propensity on train
    with torch.no_grad():
        # outcome for treatment a on train
        phi_tr_t = torch.tensor(phi_tr, dtype=torch.float32)
        a_tr = torch.tensor(A_tr, dtype=torch.int64)
        yhat_tr = outcome_forward(out_net, phi_tr_t.view(1,-1,dim), a_tr.view(1,-1)).view(-1).numpy()
        # propensity on train
        p1_tr = propensity_forward(prop_net, phi_tr_t.view(1,-1,dim)).view(-1).numpy()
        # choose propensity for a
        e_tr = p1_tr if a_query==1 else (1-p1_tr)
    
    # Precompute yhat for treatment a on train rows
    # yhat under a differs; recompute with a vector of a's
    with torch.no_grad():
        B = phi_tr.shape[0]
        # create a vector a* of length n_tr*T
        a_star = np.full((phi_tr.shape[0],), a_query, dtype=np.int64)
    
    # For simplicity at visit-level, we treat query_phi as (Q,d)
    labels, dists = ann.query(query_phi, k=min(k, len(phi_a)))
    # kernel weights
    W = np.exp(-dists / max(temp, 1e-6))
    W /= (W.sum(axis=1, keepdims=True) + 1e-8)
    
    mu_hat = np.zeros(query_phi.shape[0], dtype=np.float32)
    for i in range(query_phi.shape[0]):
        idxs = labels[i]                             # indices within phi_a
        # Map back to global indices of train where A=a
        global_idxs = np.where(mask_a)[0][idxs]
        w = W[i]
        # Compute outcome regression for neighbors under a_query
        with torch.no_grad():
            phi_neighbors = torch.tensor(phi_tr[global_idxs], dtype=torch.float32).view(1,-1,dim)
            a_neighbors = torch.full((1,len(global_idxs)), a_query, dtype=torch.int64)
            m_neighbors = outcome_forward(out_net, phi_neighbors, a_neighbors).view(-1).numpy()
        # residuals for neighbors (observed a == a_query)
        res = Y_tr[global_idxs] - m_neighbors
        # propensity weights (stabilized)
        with torch.no_grad():
            p1_neighbors = propensity_forward(prop_net, torch.tensor(phi_tr[global_idxs]).view(1,-1,dim)).view(-1).numpy()
        e_neighbors = p1_neighbors if a_query==1 else (1-p1_neighbors)
        iw = 1.0 / np.clip(e_neighbors, 1e-3, 1-1e-3)
        w_tilde = w * iw
        w_tilde = w_tilde / (w_tilde.sum() + 1e-8)
        
        # outcome regression at query
        with torch.no_grad():
            q_phi = torch.tensor(query_phi[i]).view(1,1,dim)
            q_a = torch.tensor([[a_query]], dtype=torch.int64)
            m_q = outcome_forward(out_net, q_phi, q_a).item()
        
        mu_hat[i] = m_q + np.sum(w_tilde * res)
    return mu_hat

# ---------------------------------
# 6) Main demo: train & evaluate
# ---------------------------------

def main():
    X, A, Y = make_synthetic_longitudinal(N=800, T=5, d_x=6, seed=1337)
    cfg = TrainConfig(epochs=8, batch_size=64, d_x=6, T=5, d_latent=16, lam_mmd=0.1)
    model, data = train_cknn_lsh(X, A, Y, cfg)
    
    enc = model["enc"]; out_net = model["out_net"]; prop_net = model["prop_net"]
    phi_tr, phi_te = data["phi_tr"], data["phi_te"]
    
    # Flatten to visit-level
    phi_tr_flat, A_tr_flat, Y_tr_flat = flatten_time(phi_tr, data["Atr"].numpy(), data["Ytr"].numpy())
    phi_te_flat, A_te_flat, Y_te_flat = flatten_time(phi_te, data["Ate"].numpy(), data["Yte"].numpy())
    
    # Estimate mu(a|H) on test for a in {0,1}
    mu0 = dr_knn_mu(enc, out_net, prop_net, phi_tr_flat, A_tr_flat, Y_tr_flat, phi_te_flat, a_query=0, k=50, temp=1.0)
    mu1 = dr_knn_mu(enc, out_net, prop_net, phi_tr_flat, A_tr_flat, Y_tr_flat, phi_te_flat, a_query=1, k=50, temp=1.0)
    
    # Simple evaluation proxy: If observed A_te==1, compare Y to mu1; else compare to mu0.
    y_pred_obs = np.where(A_te_flat==1, mu1, mu0)
    mse = mean_squared_error(Y_te_flat, y_pred_obs)
    print(f"Test MSE against observed-treatment counterfactual: {mse:.4f}")
    
    # Simple "policy": treat if mu1 > mu0
    policy = (mu1 > mu0).astype(int)
    # Off-policy DR value estimate on test (single-step approximation)
    # Here we just use observed one-step outcomes; for multi-step, apply sequential DR.
    # Propensity on test:
    with torch.no_grad():
        dim = phi_te.shape[-1]
        p1_te = torch.sigmoid(model["prop_net"](torch.tensor(phi_te).view(-1,dim))).view(-1).numpy()
    e_te = np.where(A_te_flat==1, p1_te, 1-p1_te)
    with torch.no_grad():
        # outcome regression at observed A
        B = phi_te_flat.shape[0]
        a_obs = torch.tensor(A_te_flat.reshape(1,-1), dtype=torch.int64)
        m_obs = outcome_forward(out_net, torch.tensor(phi_te_flat).view(1,-1,dim), a_obs).view(-1).numpy()
    dr_value = np.mean( (policy==A_te_flat)/np.clip(e_te,1e-3,1-1e-3) * (Y_te_flat - m_obs) + 
                        ( (policy)*(mu1) + (1-policy)*(mu0) ) )
    print(f"Off-policy DR value (approx.): {dr_value:.4f}")
    
if __name__ == "__main__":
    main()


In [30]:
A_col = "A_booster"      # or "A_vax_any" or "A_dose3" etc.

X_list, A_list, Y_list, X_cols = build_triplets_from_XA_and_outcome(
    X_visit=X_visit_with_A,
    outcome_df=pasc_df,                 # columns: PARTICIPANT_ID, date, pasc_score
    id_col="PARTICIPANT_ID",
    visit_col="VISIT_START_DATE",
    A_col=A_col,                        # e.g., "A_booster"
    outcome_col="pasc_score",
    outcome_date_col="date",
    outcome_mode="delta_next",          # or "next_value"
    max_forward_days=120                # optional
)


In [29]:
import numpy as np
import pandas as pd
def asof_per_patient(left_df, right_df, id_col, left_time, right_time, outcome_col,direction="forward", allow_exact_matches=True):
    out = []
    for pid, L in left_df.groupby(id_col, sort=False):
        R = right_df[right_df[id_col] == pid]
        if L.empty:
            continue
        # sort & drop NaT within this patient
        L = L.copy(); R = R.copy()
        L[left_time] = pd.to_datetime(L[left_time], errors="coerce").dt.tz_localize(None)
        R[right_time] = pd.to_datetime(R[right_time], errors="coerce").dt.tz_localize(None)
        L = L[L[left_time].notna()].sort_values(left_time)
        R = R[R[right_time].notna()].sort_values(right_time)
        if R.empty:
            # still add rows with _y_next = NaN to keep alignment if you need it later
            tmp = L.copy()
            tmp["_y_next"] = np.nan
            tmp["_y_next_date"] = pd.NaT
        else:
            tmp = pd.merge_asof(
                L,
                R[[id_col, right_time, outcome_col]],
                left_on=left_time,
                right_on=right_time,
                direction=direction,
                allow_exact_matches=allow_exact_matches,
            ).rename(columns={outcome_col: "_y_next", right_time: "_y_next_date"})
        out.append(tmp)
    return pd.concat(out, axis=0, ignore_index=True)


def _clean_for_asof(df, id_col, time_col):
    df = df.copy()
    # parse to timezone-naive datetimes
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce').dt.tz_localize(None)
    # drop rows with missing id or time (asof requires non-null)
    df = df[df[id_col].notna() & df[time_col].notna()]
    # sort by id then time (required for asof with "by")
    df = df.sort_values([id_col, time_col])
    # optional: drop duplicate rows on (id,time) if they exist
    df = df.drop_duplicates(subset=[id_col, time_col])
    return df
def build_triplets_from_XA_and_outcome(
    X_visit, outcome_df,
    id_col="PARTICIPANT_ID",
    visit_col="VISIT_START_DATE",
    A_col="A_booster",
    outcome_col="pasc_score",
    outcome_date_col="date",
    outcome_mode="next_value",
    max_forward_days=None
):
    dfX = _clean_for_asof(X_visit, id_col, visit_col)
    dfY = _clean_for_asof(outcome_df, id_col, outcome_date_col)

    # feature columns (exclude id/date/A and non-numeric)
    exclude = {id_col, visit_col, A_col}
    non_numeric = set(dfX.select_dtypes(include=["object"]).columns)
    X_cols = [c for c in dfX.columns if c not in exclude and c not in non_numeric]

    # NEXT outcome after each visit (no leakage)
    # out_next = (
    #     pd.merge_asof(
    #         dfX[[id_col, visit_col]],
    #         dfY[[id_col, outcome_date_col, outcome_col]],
    #         by=id_col,
    #         left_on=visit_col,
    #         right_on=outcome_date_col,
    #         direction="forward",
    #         allow_exact_matches=True,
    #     )
    #     .rename(columns={outcome_col: "_y_next", outcome_date_col: "_y_next_date"})
    # )
    # Use it like:
    left  = dfX[[id_col, visit_col]].copy()
    right = dfY[[id_col, outcome_date_col, outcome_col]].copy()
    out_next = asof_per_patient(
        left_df=left, right_df=right,
        id_col=id_col, left_time=visit_col, right_time=outcome_date_col,outcome_col=outcome_col,
        direction="forward", allow_exact_matches=True
    ).rename(columns={outcome_col: "_y_next", outcome_date_col: "_y_next_date"})
    


    if max_forward_days is not None:
        dt = (out_next["_y_next_date"] - dfX[visit_col]).dt.days
        out_next.loc[dt > max_forward_days, "_y_next"] = np.nan

    if outcome_mode == "delta_next":
        left  = dfX[[id_col, visit_col]].copy()
        right = dfY[[id_col, outcome_date_col, outcome_col]].copy()
        # out_prev = asof_per_patient(
        #     left_df=left, right_df=right,
        #     id_col=id_col, left_time=visit_col, right_time=outcome_date_col,outcome_col=outcome_col,
        #     direction="backward", allow_exact_matches=True
        # ).rename(columns={outcome_col: "_y_prev"})       
        # backward (previous outcome)
        out_prev = asof_per_patient(
            left_df=left, right_df=right,
            id_col=id_col, left_time=visit_col, right_time=outcome_date_col, outcome_col=outcome_col,
            direction="backward", allow_exact_matches=True
        )
        # <-- rename the columns that asof_per_patient created
        out_prev = out_prev.rename(columns={"_y_next": "_y_prev", "_y_next_date": "_y_prev_date"})
        
        # now this works:
        aligned = pd.concat([dfX[[id_col, visit_col]], out_next["_y_next"], out_prev["_y_prev"]], axis=1)
        aligned["Y_t"] = aligned["_y_next"] - aligned["_y_prev"]
        # out_prev = (
        #     pd.merge_asof(
        #         dfX[[id_col, visit_col]],
        #         dfY[[id_col, outcome_date_col, outcome_col]],
        #         by=id_col,
        #         left_on=visit_col,
        #         right_on=outcome_date_col,
        #         direction="backward",
        #         allow_exact_matches=True,
        #     ).rename(columns={outcome_col: "_y_prev"})
        # )
        # aligned = pd.concat([dfX[[id_col, visit_col]], out_next["_y_next"], out_prev["_y_prev"]], axis=1)
        # aligned["Y_t"] = aligned["_y_next"] - aligned["_y_prev"]
    elif outcome_mode == "next_value":
        aligned = pd.concat([dfX[[id_col, visit_col]], out_next["_y_next"]], axis=1)
        aligned["Y_t"] = aligned["_y_next"]
    else:
        raise ValueError("outcome_mode must be 'next_value' or 'delta_next'.")

    # Keep visits with a follow-up outcome
    keep = ~aligned["Y_t"].isna()
    dfX_kept = dfX.loc[keep].copy()
    aligned = aligned.loc[keep].copy()

    # Collect ragged sequences
    X_list, A_list, Y_list = [], [], []
    for pid, gX in dfX_kept.groupby(id_col):
        gX = gX.sort_values(visit_col)
        X_arr = gX[X_cols].to_numpy(dtype="float32")
        A_arr = gX[A_col].astype("int8").to_numpy()
        Y_arr = aligned.loc[gX.index, "Y_t"].astype("float32").to_numpy()
        if len(X_arr) == 0: 
            continue
        X_list.append(X_arr); A_list.append(A_arr); Y_list.append(Y_arr)

    return np.array(X_list, dtype=object), np.array(A_list, dtype=object), np.array(Y_list, dtype=object), X_cols

In [92]:
import pandas as pd
import numpy as np
# f="/Recover2507/project-files/RECOVERAdult_BiostatsDerived_202412_symptoms_deID.csv"
# a=pd.read_csv(f)
import os
path = "/sbgenomics/project-files/"
files = os.listdir(path)

# 0 RECOVERAdult_BDC_202412_answerdata_deID.tsv
# 1 RECOVERAdult_BDC_202412_biospecimens_deID.tsv
# 2 RECOVERAdult_BDC_202412_concepts_deID.tsv
# 3 RECOVERAdult_BDC_202412_demographics_deID.tsv
# 4 RECOVERAdult_BDC_202412_fitbit_deID.tsv
# 5 RECOVERAdult_BDC_202412_visits_deID.tsv
# 6 RECOVERAdult_BiostatsDerived_202412_core_proc_deID.csv
# 7 RECOVERAdult_BiostatsDerived_202412_symptoms_deID.csv
# 8 RECOVERAdult_BiostatsDerived_202412_visits_deID.csv
# 9 _1_RECOVERAdult_BDC_202412_answerdata_deID.tsv
# 10 _1_RECOVERAdult_BDC_202412_fitbit_deID.tsv
# 11 _1_RECOVERAdult_BiostatsDerived_202412_symptoms_deID.csv

# import time 
# 0 15173 ANSWER_NUMERIC_VAL 'youth (1, Correct) [Non-infected-27m post-index]' % Gastric Retention After 1 Hour [Infected-09m post-index]
# Index(['PARTICIPANT_ID', 'VISIT_ID', 'VISIT_TYPE', 'DATA_ENTRY_DATE',
#        'FORM_NAME', 'INSTANCE_NUM', 'FIELD_NAME', 'DATA_FIELD_NAME',
#        'FIELD_TYPE', 'ANSWER_LABEL', 'ANSWER_NUMERIC_VAL', 'ANSWER_TEXT_VAL',
#        'CONECPT_CD', 'CONCEPT_NAME'],
#       dtype='object')
# 1 13523 PBMC_CELL_COUNT  SPECIMEN_TYPE PLASMA_TREATMENT_TYPE
# 'PARTICIPANT_ID', 'KIT_ID', 'COLLECTION_DATE', 'SPECIMEN_TYPE',
#        'SPECIMEN_CONCEPT_CD', 'MONTHS_SINCE_INDEX_DATE', 'SPECIMEN_VOLUME',
#        'SPECIMEN_VOLUME_UNITS', 'SPECIMEN_THAW_COUNT',
#        'COLLECTION_TO_FREEZE_TIME_HOURS', 'PLASMA_TREATMENT_TYPE',
#        'PBMC_CELL_COUNT', 'PBMC_CELL_VIABILITY'],
# 2 concept id VISIT_ID	VISIT_TYPE	DATA_ENTRY_DATE	FORM_NAME	INSTANCE_NUM	FIELD_NAME	
# ['CONCEPT_CODE', 'FIELD_NAME', 'DATA_FIELD_NAME', 'FORM_NAME',
#        'CONCEPT_NAME', 'CONCEPT_PATH_NICE', 'CONCEPT_PATH'],
# #3 SEX_AT_BIRTH 15179, 20; ENROLL_ZIP_CODE DECEASED (15071 alive, 108 die) 'Female', 'Intersex', 'Male', 'Unknown'
# 'PARTICIPANT_ID', 'ENROLL_PROTOCOL', 'ENROLL_SITE_ID',
#        'ENROLL_HUB_SITE_ID', 'ENROLL_SITE_PATH', 'ENROLL_DATE',
#        'ENROLL_CATEGORY', 'ENROLL_INDEX_DATE', 'CROSSOVER_FLAG',
#        'CROSSOVER_INDEX_DATE', 'ONSTUDY_INFECTION', 'ONSTUDY_INFECTION_CNT',
#        'SEX_AT_BIRTH', 'DOB', 'AGE_AT_ENROLLMENT', 'ENROLL_ZIP_CODE',
#        'WITHDRAWN', 'WITHDRAW_DATE', 'DECEASED', 'DECEASED_DATE'],
# # 10,552 F, 4,086 M ;   
# # 5  VISIT_START_DATE  INFECTION_STATUS 
# ['VISIT_ID', 'PARTICIPANT_ID', 'VISIT_SITE_ID', 'VISIT_TYPE',
#        'VISIT_START_DATE', 'INFECTION_STATUS', 'MONTHS_POSTINDEX'],
#       dtype='object')
# # 6  15159 acute_yn 5558 1, 9600 0, infect_yn 2513 0, 12646 1
# record_id,acute_yn,infect_yn,infect_yn_anti_f,index_dt,enroll_dt,race___1,race___2,race___3,race___4,race___5,race___6,race___7,race___15,race____88,race_unique_an,biosex,dob,age_enroll,age_enrl_cat,preg_cohort_yn,cc_anxdep_base,cc_asthma_base,cc_autoimm_base,cc_bipolar_base,cc_cancer_base,cc_cfs_base,cc_clung_base,cc_cns_base,cc_copd_base,cc_cvd_base,cc_dementia_base,cc_diabetes_base,cc_fibromyalgia_base,cc_imm_base,cc_liver_base,cc_move_base,cc_nmusc_base,cc_o2home_base,cc_obesity_base,cc_othermh_base,cc_polyov_base,cc_pots_base,cc_renal_base,cc_seiz_base,cc_sickle_base,cc_stroke_base,cc_cvdspec___1_base,cc_cvdspec___2_base,cc_cvdspec___3_base,cc_cvdspec___4_base,cc_cvdspec___5_base,cc_cvdspec___6_base,cc_cvdspec___7_base,cc_cvdspec___8_base,cc_cvdspec___98_base,cc_cvdspec____88_base,cc_autoimmspec___1_base,cc_autoimmspec___2_base,cc_autoimmspec___3_base,cc_autoimmspec___23_base,cc_autoimmspec___4_base,cc_autoimmspec___5_base,cc_autoimmspec___6_base,cc_autoimmspec___7_base,cc_autoimmspec___8_base,cc_autoimmspec___9_base,cc_autoimmspec___10_base,cc_autoimmspec___11_base,cc_autoimmspec___12_base,cc_autoimmspec___13_base,cc_autoimmspec___14_base,cc_autoimmspec___15_base,cc_autoimmspec___16_base,cc_autoimmspec___17_base,cc_autoimmspec___18_base,cc_autoimmspec___19_base,cc_autoimmspec___20_base,cc_autoimmspec___21_base,cc_autoimmspec___22_base,cc_autoimmspec___98_base,cc_autoimmspec____88_base,cc_cancerspec___1_base,cc_cancerspec___2_base,cc_cancerspec___3_base,cc_cancerspec___4_base,cc_cancerspec___5_base,cc_cancerspec___6_base,cc_cancerspec___7_base,cc_cancerspec___8_base,cc_cancerspec___9_base,cc_cancerspec___10_base,cc_cancerspec___11_base,cc_cancerspec___12_base,cc_cancerspec___13_base,cc_cancerspec___14_base,cc_cancerspec___15_base,cc_cancerspec___16_base,cc_cancerspec___17_base,cc_cancerspec___18_base,cc_cancerspec___19_base,cc_cancerspec___20_base,cc_cancerspec___21_base,cc_cancerspec____88_base,cc_transplant_type___1_base,cc_transplant_type___2_base,cc_transplant_type___3_base,cc_transplant_type___4_base,cc_transplant_type___5_base,cc_transplant_type____88_base,cc_stroke_type___1_base,cc_stroke_type___2_base,cc_stroke_type___3_base,cc_stroke_type___4_base,cc_stroke_type___6_base,cc_stroke_type___98_base,cc_stroke_type____88_base,cc_cns_type___1_base,cc_cns_type___2_base,cc_cns_type___3_base,cc_cns_type___4_base,cc_cns_type___5_base,cc_cns_type___6_base,cc_cns_type____88_base,cc_potsspec___1_base,cc_potsspec___2_base,cc_potsspec___3_base,cc_potsspec___4_base,cc_potsspec___10_base,cc_potsspec___98_base,cc_potsspec____88_base,cc_nmusc_type___1_base,cc_nmusc_type___2_base,cc_nmusc_type___3_base,cc_nmusc_type___4_base,cc_nmusc_type___5_base,cc_nmusc_type___6_base,cc_nmusc_type____88_base,cc_move_type___1_base,cc_move_type___2_base,cc_move_type___3_base,cc_move_type___4_base,cc_move_type___5_base,cc_move_type___6_base,cc_move_type___7_base,cc_move_type___8_base,cc_move_type____88_base,cc_diabetesspec_base,cc_dialyn_base,cc_transplant_base,referral_type,spop___2,spop___3,spop___4,spop___99,education,fvacc_index,vacc_numb,vacc_vacctype_1,vacc_vaccothspec_1,vacc_vaccdt_1,vacc_vaccdt_2,vacc_vaccdt_3,enrl_reinfyn,enrl_reinfdt,rx_totalinf,acute_reinf_ovr,rx_carelevel___0,rx_carelevel___1,rx_carelevel___2,rx_carelevel___3,rx_carelevel___4,rx_carelevel___98,rx_carelevel____88,Spike,Nucleocapsid,OVER_89_FLAG,
# # 8 pasc_score_2023 pasc_jama2023 servious ? 
# 'PARTICIPANT_ID', 'ENROLL_PROTOCOL', 'ENROLL_SITE_ID',
#        'ENROLL_HUB_SITE_ID', 'ENROLL_SITE_PATH', 'ENROLL_DATE',
#        'ENROLL_CATEGORY', 'ENROLL_INDEX_DATE', 'CROSSOVER_FLAG',
#        'CROSSOVER_INDEX_DATE', 'ONSTUDY_INFECTION', 'ONSTUDY_INFECTION_CNT',
#        'SEX_AT_BIRTH', 'DOB', 'AGE_AT_ENROLLMENT', 'ENROLL_ZIP_CODE',
#        'WITHDRAWN', 'WITHDRAW_DATE', 'DECEASED', 'DECEASED_DATE'],
#7 15159 , 11 NQOL_CF_Tscore depression ca_stand5dpb  phq8_total  mmrc_dyspnea (breath system) compass_score, gad7_total
#11 record_id,redcap_event_name,ps_colldt,compass_score,gad7_total,phq8_total,hit6_total,mmrc_dyspnea,NQOL_CF_Tscore,phq9_total,promis_sleepdist_sf8a_Tscore,snore,mi_neuro_sum,NQOL_UEF_raw,promis_pf_sf4a_raw,vfq_25_score,saq_sumscore,pain_head___before_index,pain_chest___before_index,pain_abdomen___before_index,pain_pelvis___before_index,pain_joint___before_index,pain_muscle___before_index,pain_back___before_index,pain_skin___before_index,pain_feet___before_index,pain_mouth___before_index,pain_throat___before_index,nerve_tremor___before_index,nerve_abmove___before_index,nerve_numb___before_index,nerve_nomove___before_index,nerve_seizure___before_index,ps_fatigue___before_index,ps_malaise___before_index,ps_soreness___before_index,ps_weak___before_index,ps_fever___before_index,ps_temp___before_index,ps_cold___before_index,ps_sense___before_index,ps_smellsick___before_index,ps_sinus___before_index,ps_headache___before_index,ps_pain___before_index,ps_sob___before_index,ps_wheeze___before_index,ps_cough___before_index,ps_heart___before_index,ps_swelllegs___before_index,ps_gastro___before_index,ps_bladder___before_index,ps_nerve___before_index,ps_mood___before_index,ps_think___before_index,ps_sleep___before_index,ps_goofy___before_index,ps_color___before_index,ps_rash___before_index,ps_itching___before_index,ps_anaphylaxis___before_index,ps_dryeyes___before_index,ps_drymouth___before_index,ps_thirst___before_index,ps_vision___before_index,ps_hearing___before_index,ps_bald___before_index,ps_teeth___before_index,ps_menstrual___before_index,ps_menopause___before_index,ps_fertility___before_index,ps_sex___before_index,pain_other___before_index,nerve_other___before_index,ps_headachec___before_index,pain_head___around_index,pain_chest___around_index,pain_abdomen___around_index,pain_pelvis___around_index,pain_joint___around_index,pain_muscle___around_index,pain_back___around_index,pain_skin___around_index,pain_feet___around_index,pain_mouth___around_index,pain_throat___around_index,nerve_tremor___around_index,nerve_abmove___around_index,nerve_numb___around_index,nerve_nomove___around_index,nerve_seizure___around_index,ps_fatigue___around_index,ps_malaise___around_index,ps_soreness___around_index,ps_weak___around_index,ps_fever___around_index,ps_temp___around_index,ps_cold___around_index,ps_sense___around_index,ps_smellsick___around_index,ps_sinus___around_index,ps_headache___around_index,ps_pain___around_index,ps_sob___around_index,ps_wheeze___around_index,ps_cough___around_index,ps_heart___around_index,ps_swelllegs___around_index,ps_gastro___around_index,ps_bladder___around_index,ps_nerve___around_index,ps_mood___around_index,ps_think___around_index,ps_sleep___around_index,ps_goofy___around_index,ps_color___around_index,ps_rash___around_index,ps_itching___around_index,ps_anaphylaxis___around_index,ps_dryeyes___around_index,ps_drymouth___around_index,ps_thirst___around_index,ps_vision___around_index,ps_hearing___around_index,ps_bald___around_index,ps_teeth___around_index,ps_menstrual___around_index,ps_menopause___around_index,ps_fertility___around_index,ps_sex___around_index,pain_other___around_index,nerve_other___around_index,ps_headachec___around_index,pain_head___now,pain_chest___now,pain_abdomen___now,pain_pelvis___now,pain_joint___now,pain_muscle___now,pain_back___now,pain_skin___now,pain_feet___now,pain_mouth___now,pain_throat___now,nerve_tremor___now,nerve_abmove___now,nerve_numb___now,nerve_nomove___now,nerve_seizure___now,ps_fatigue___now,ps_malaise___now,ps_soreness___now,ps_weak___now,ps_fever___now,ps_temp___now,ps_cold___now,ps_sense___now,ps_smellsick___now,ps_sinus___now,ps_headache___now,ps_pain___now,ps_sob___now,ps_wheeze___now,ps_cough___now,ps_heart___now,ps_swelllegs___now,ps_gastro___now,ps_bladder___now,ps_nerve___now,ps_mood___now,ps_think___now,ps_sleep___now,ps_goofy___now,ps_color___now,ps_rash___now,ps_itching___now,ps_anaphylaxis___now,ps_dryeyes___now,ps_drymouth___now,ps_thirst___now,ps_vision___now,ps_hearing___now,ps_bald___now,ps_teeth___now,ps_menstrual___now,ps_menopause___now,ps_fertility___now,ps_sex___now,pain_other___now,nerve_other___now,ps_headachec___now,pain_head___now_not_b4_f,pain_chest___now_not_b4_f,pain_abdomen___now_not_b4_f,pain_pelvis___now_not_b4_f,pain_joint___now_not_b4_f,pain_muscle___now_not_b4_f,pain_back___now_not_b4_f,pain_skin___now_not_b4_f,pain_feet___now_not_b4_f,pain_mouth___now_not_b4_f,pain_throat___now_not_b4_f,nerve_tremor___now_not_b4_f,nerve_abmove___now_not_b4_f,nerve_numb___now_not_b4_f,nerve_nomove___now_not_b4_f,nerve_seizure___now_not_b4_f,ps_fatigue___now_not_b4_f,ps_malaise___now_not_b4_f,ps_soreness___now_not_b4_f,ps_weak___now_not_b4_f,ps_fever___now_not_b4_f,ps_temp___now_not_b4_f,ps_cold___now_not_b4_f,ps_sense___now_not_b4_f,ps_smellsick___now_not_b4_f,ps_sinus___now_not_b4_f,ps_headache___now_not_b4_f,ps_pain___now_not_b4_f,ps_sob___now_not_b4_f,ps_wheeze___now_not_b4_f,ps_cough___now_not_b4_f,ps_heart___now_not_b4_f,ps_swelllegs___now_not_b4_f,ps_gastro___now_not_b4_f,ps_bladder___now_not_b4_f,ps_nerve___now_not_b4_f,ps_mood___now_not_b4_f,ps_think___now_not_b4_f,ps_sleep___now_not_b4_f,ps_goofy___now_not_b4_f,ps_color___now_not_b4_f,ps_rash___now_not_b4_f,ps_itching___now_not_b4_f,ps_anaphylaxis___now_not_b4_f,ps_dryeyes___now_not_b4_f,ps_drymouth___now_not_b4_f,ps_thirst___now_not_b4_f,ps_vision___now_not_b4_f,ps_hearing___now_not_b4_f,ps_bald___now_not_b4_f,ps_teeth___now_not_b4_f,ps_menstrual___now_not_b4_f,ps_menopause___now_not_b4_f,ps_fertility___now_not_b4_f,ps_sex___now_not_b4_f,pain_other___now_not_b4_f,nerve_other___now_not_b4_f,ps_headachec___now_not_b4_f,pain_head___now_not_b4,pain_chest___now_not_b4,pain_abdomen___now_not_b4,pain_pelvis___now_not_b4,pain_joint___now_not_b4,pain_muscle___now_not_b4,pain_back___now_not_b4,pain_skin___now_not_b4,pain_feet___now_not_b4,pain_mouth___now_not_b4,pain_throat___now_not_b4,nerve_tremor___now_not_b4,nerve_abmove___now_not_b4,nerve_numb___now_not_b4,nerve_nomove___now_not_b4,nerve_seizure___now_not_b4,ps_fatigue___now_not_b4,ps_malaise___now_not_b4,ps_soreness___now_not_b4,ps_weak___now_not_b4,ps_fever___now_not_b4,ps_temp___now_not_b4,ps_cold___now_not_b4,ps_sense___now_not_b4,ps_smellsick___now_not_b4,ps_sinus___now_not_b4,ps_headache___now_not_b4,ps_pain___now_not_b4,ps_sob___now_not_b4,ps_wheeze___now_not_b4,ps_cough___now_not_b4,ps_heart___now_not_b4,ps_swelllegs___now_not_b4,ps_gastro___now_not_b4,ps_bladder___now_not_b4,ps_nerve___now_not_b4,ps_mood___now_not_b4,ps_think___now_not_b4,ps_sleep___now_not_b4,ps_goofy___now_not_b4,ps_color___now_not_b4,ps_rash___now_not_b4,ps_itching___now_not_b4,ps_anaphylaxis___now_not_b4,ps_dryeyes___now_not_b4,ps_drymouth___now_not_b4,ps_thirst___now_not_b4,ps_vision___now_not_b4,ps_hearing___now_not_b4,ps_bald___now_not_b4,ps_teeth___now_not_b4,ps_menstrual___now_not_b4,ps_menopause___now_not_b4,ps_fertility___now_not_b4,ps_sex___now_not_b4,pain_other___now_not_b4,nerve_other___now_not_b4,ps_headachec___now_not_b4,pain_head___any_yes,pain_chest___any_yes,pain_abdomen___any_yes,pain_pelvis___any_yes,pain_joint___any_yes,pain_muscle___any_yes,pain_back___any_yes,pain_skin___any_yes,pain_feet___any_yes,pain_mouth___any_yes,pain_throat___any_yes,nerve_tremor___any_yes,nerve_abmove___any_yes,nerve_numb___any_yes,nerve_nomove___any_yes,nerve_seizure___any_yes,ps_fatigue___any_yes,ps_malaise___any_yes,ps_soreness___any_yes,ps_weak___any_yes,ps_fever___any_yes,ps_temp___any_yes,ps_cold___any_yes,ps_sense___any_yes,ps_smellsick___any_yes,ps_sinus___any_yes,ps_headache___any_yes,ps_pain___any_yes,ps_sob___any_yes,ps_wheeze___any_yes,ps_cough___any_yes,ps_heart___any_yes,ps_swelllegs___any_yes,ps_gastro___any_yes,ps_bladder___any_yes,ps_nerve___any_yes,ps_mood___any_yes,ps_think___any_yes,ps_sleep___any_yes,ps_goofy___any_yes,ps_color___any_yes,ps_rash___any_yes,ps_itching___any_yes,ps_anaphylaxis___any_yes,ps_dryeyes___any_yes,ps_drymouth___any_yes,ps_thirst___any_yes,ps_vision___any_yes,ps_hearing___any_yes,ps_bald___any_yes,ps_teeth___any_yes,ps_menstrual___any_yes,ps_menopause___any_yes,ps_fertility___any_yes,ps_sex___any_yes,pain_other___any_yes,nerve_other___any_yes,ps_headachec___any_yes,pain_head___any_yes_notb4,pain_chest___any_yes_notb4,pain_abdomen___any_yes_notb4,pain_pelvis___any_yes_notb4,pain_joint___any_yes_notb4,pain_muscle___any_yes_notb4,pain_back___any_yes_notb4,pain_skin___any_yes_notb4,pain_feet___any_yes_notb4,pain_mouth___any_yes_notb4,pain_throat___any_yes_notb4,nerve_tremor___any_yes_notb4,nerve_abmove___any_yes_notb4,nerve_numb___any_yes_notb4,nerve_nomove___any_yes_notb4,nerve_seizure___any_yes_notb4,ps_fatigue___any_yes_notb4,ps_malaise___any_yes_notb4,ps_soreness___any_yes_notb4,ps_weak___any_yes_notb4,ps_fever___any_yes_notb4,ps_temp___any_yes_notb4,ps_cold___any_yes_notb4,ps_sense___any_yes_notb4,ps_smellsick___any_yes_notb4,ps_sinus___any_yes_notb4,ps_headache___any_yes_notb4,ps_pain___any_yes_notb4,ps_sob___any_yes_notb4,ps_wheeze___any_yes_notb4,ps_cough___any_yes_notb4,ps_heart___any_yes_notb4,ps_swelllegs___any_yes_notb4,ps_gastro___any_yes_notb4,ps_bladder___any_yes_notb4,ps_nerve___any_yes_notb4,ps_mood___any_yes_notb4,ps_think___any_yes_notb4,ps_sleep___any_yes_notb4,ps_goofy___any_yes_notb4,ps_color___any_yes_notb4,ps_rash___any_yes_notb4,ps_itching___any_yes_notb4,ps_anaphylaxis___any_yes_notb4,ps_dryeyes___any_yes_notb4,ps_drymouth___any_yes_notb4,ps_thirst___any_yes_notb4,ps_vision___any_yes_notb4,ps_hearing___any_yes_notb4,ps_bald___any_yes_notb4,ps_teeth___any_yes_notb4,ps_menstrual___any_yes_notb4,ps_menopause___any_yes_notb4,ps_fertility___any_yes_notb4,ps_sex___any_yes_notb4,pain_other___any_yes_notb4,nerve_other___any_yes_notb4,ps_headachec___any_yes_notb4,pain_head___funl,pain_chest___funl,pain_abdomen___funl,pain_pelvis___funl,pain_joint___funl,pain_muscle___funl,pain_back___funl,pain_skin___funl,pain_feet___funl,pain_mouth___funl,pain_throat___funl,nerve_tremor___funl,nerve_abmove___funl,nerve_numb___funl,nerve_nomove___funl,nerve_seizure___funl,ps_fatigue___funl,ps_malaise___funl,ps_soreness___funl,ps_weak___funl,ps_fever___funl,ps_temp___funl,ps_cold___funl,ps_sense___funl,ps_smellsick___funl,ps_sinus___funl,ps_headache___funl,ps_pain___funl,ps_sob___funl,ps_wheeze___funl,ps_cough___funl,ps_heart___funl,ps_swelllegs___funl,ps_gastro___funl,ps_bladder___funl,ps_nerve___funl,ps_mood___funl,ps_think___funl,ps_sleep___funl,ps_goofy___funl,ps_color___funl,ps_rash___funl,ps_itching___funl,ps_anaphylaxis___funl,ps_dryeyes___funl,ps_drymouth___funl,ps_thirst___funl,ps_vision___funl,ps_hearing___funl,ps_bald___funl,ps_teeth___funl,ps_menstrual___funl,ps_menopause___funl,ps_fertility___funl,ps_sex___funl,pain_other___funl,nerve_other___funl,ps_headachec___funl,pain_head___aft30d,pain_chest___aft30d,pain_abdomen___aft30d,pain_pelvis___aft30d,pain_joint___aft30d,pain_muscle___aft30d,pain_back___aft30d,pain_skin___aft30d,pain_feet___aft30d,pain_mouth___aft30d,pain_throat___aft30d,nerve_tremor___aft30d,nerve_abmove___aft30d,nerve_numb___aft30d,nerve_nomove___aft30d,nerve_seizure___aft30d,ps_fatigue___aft30d,ps_malaise___aft30d,ps_soreness___aft30d,ps_weak___aft30d,ps_fever___aft30d,ps_temp___aft30d,ps_cold___aft30d,ps_sense___aft30d,ps_smellsick___aft30d,ps_sinus___aft30d,ps_headache___aft30d,ps_pain___aft30d,ps_sob___aft30d,ps_wheeze___aft30d,ps_cough___aft30d,ps_heart___aft30d,ps_swelllegs___aft30d,ps_gastro___aft30d,ps_bladder___aft30d,ps_nerve___aft30d,ps_mood___aft30d,ps_think___aft30d,ps_sleep___aft30d,ps_goofy___aft30d,ps_color___aft30d,ps_rash___aft30d,ps_itching___aft30d,ps_anaphylaxis___aft30d,ps_dryeyes___aft30d,ps_drymouth___aft30d,ps_thirst___aft30d,ps_vision___aft30d,ps_hearing___aft30d,ps_bald___aft30d,ps_teeth___aft30d,ps_menstrual___aft30d,ps_menopause___aft30d,ps_fertility___aft30d,ps_sex___aft30d,pain_other___aft30d,nerve_other___aft30d,ps_headachec___aft30d,ps_anxiety___now,ps_depress___now,ps_sleepapnea___now,ps_sleepdist___now,ps_anxiety___before_index,ps_depress___before_index,ps_sleepapnea___before_index,ps_sleepdist___before_index,cp_sum,musc_sum,neuro_sum,high_hr_sus,pots_oh,pots_ot,pots,promis_global01,promis_global02,promis_global03,promis_global04,promis_global05,promis_global09r,promis_global06,promis_global10,promis_global08,promis_global07,ps_compass31_calc,compass31_faintfreq,compass31_faintsev,compass31_fainttraj,compass31_colorloc___1,compass31_colorloc___2,compass31_colortraj,compass31_sweatyn,compass31_dryeyesyn,compass31_drymouthtraj,compass31_fullrate,compass31_bloated,compass31_vomit,compass31_cramp,compass31_diarryn,compass31_diarrfreq,compass31_diarrsev,compass31_diarrtraj,compass31_constyn,compass31_constsev,compass31_consttraj,compass31_controlbladder,compass31_urinepass,compass31_emptybladder,compass31_lightyn,compass31_lightsev,compass31_focusyn,compass31_focussev,compass31_vistraj,gad_1,gad_2,gad_3,gad_4,gad_5,gad_6,gad_7,phq_1,phq_2,phq_3,phq_4,phq_5,phq_6,phq_7,phq_8,phq_9,hit6_severerecode,hit6_activitiesrecode,hit6_liedownrecode,hit6_tootiredrecode,hit6_concentraterecode,hit6_irritatedrecode,nqcog_nqcog64r1,nqcog_nqcog75r1,nqcog_nqcog77r1,nqcog_nqcog80r1,nqcog_nqcog22r1,nqcog_nqcog24r1,nqcog_nqcog25r1,nqcog_nqcog40r1,promis_sleep109,promis_sleep116,promis_sleep20,promis_sleep44,promis_sleep108,promis_sleep72,promis_sleep67,promis_sleep115,neuroqol_pfa40,neuroqol_pfa50,neuroqol_pfb21,neuroqol_pfa43,neuroqol_pfa35,neuroqol_pfa55,neuroqol_pfb26,neuroqol_nquex44,mi_neuro1recode,mi_neuro2recode,mi_neuro3recode,mi_neuro4recode,mi_neuro5recode,mi_neuro6recode,mi_neuro7recode,mi_neuro8recode,mi_neuro9recode,mi_neuro10recode,mi_neuro11recode,mi_neuro12recode,mi_neuro13recode,mi_neuro14recode,mi_neuro15recode,promis_pfa11,promis_pfa21,promis_pfa23,promis_pfa53,vfq_2recode,vfq_4recode,vfq_19recode,vfq_5recode,vfq_6recode,vfq_7recode,vfq_8recode,vfq_9recode,vfq_14recode,vfq_11recode,vfq_13recode,vfq_3recode,vfq_21recode,vfq_22recode,vfq_25recode,vfq_17recode,vfq_18recode,vfq_20recode,vfq_23recode,vfq_24recode,vfq_15c,vfq_16,vfq_16a,vfq_12recode,vfq_10recode,visit_month,visit_month_curr,instance,instance_curr,index_dt_curr,infect_yn_curr,ca_hrsupine1,ca_sbpsupine1,ca_dbpsupine1,ca_stand1sbp,ca_stand1dpb,ca_stand3sbp,ca_stand3dpb,ca_stand5sbp,ca_stand5dpb,ca_stand10sbp,ca_stand10dbp,ca_stand1hr,ca_stand3hr,ca_stand5hr,ca_stand10hr,
# start_time = time.time()
# df0 = pd.read_csv(path+files[0], sep='\t', encoding='latin1')
# print(df0.columns)
# df01 = pd.read_csv(path+files[1], sep='\t', encoding='latin1')
# print(df01.columns)
# df02 = pd.read_csv(path+files[2], sep='\t', encoding='latin1')
# print(df02.columns)
# df3 = pd.read_csv(path + files[8], sep=',', encoding='latin1')
# dfinfe = pd.read_csv(path+files[5], sep='\t', encoding='latin1')
# print(df05.columns)
# df06 = pd.read_csv(path+files[6], sep=',', encoding='latin1')
# print(df06.columns)
# dfm = pd.read_csv(path+files[7], sep=',', encoding='latin1')
# df1 = pd.read_csv(path+files[10], sep='\t', encoding='latin1')
# df11 = pd.read_csv(path+files[11], sep=',', encoding='latin1')
# print(df11.columns)

# print(time.time()-start_time, df1.shape)
# df3.head()

In [119]:
dfinfe.shape #PARTICIPANT_ID  VISIT_START_DATE  INFECTION_STATUS  MONTHS_POSTINDEX 158685
import numpy as np
import pandas as pd

def make_X(dynamic_df: pd.DataFrame,
           static_df: pd.DataFrame,
           id_col="PARTICIPANT_ID",
           date_col="VISIT_START_DATE"):
    """
    Combine dynamic + static into a numeric feature matrix X per visit.

    Inputs:
      dynamic_df: columns include [PARTICIPANT_ID, VISIT_START_DATE, INFECTION_STATUS, MONTHS_POSTINDEX, ...]
      static_df:  columns include [PARTICIPANT_ID, age, race, (optional zip, ...)]

    Returns:
      X_df   : DataFrame with [id_col, date_col] + numeric X columns
      X_cols: list of feature column names in X_df
    """
    df = dynamic_df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.sort_values([id_col, date_col])

    # Ensure one row per person in static (pick first if duplicates)
    s = static_df.copy()
    s = s.sort_values([id_col]).groupby(id_col, as_index=False).first()

    # Merge static onto each visit
    df = df.merge(s, on=id_col, how="left")

    # ---- calendar/seasonality
    m = df[date_col].dt.month
    df["cal_sin_month"] = np.sin(2*np.pi*m/12.0).astype("float32")
    df["cal_cos_month"] = np.cos(2*np.pi*m/12.0).astype("float32")

    # ---- age at visit (if available)
    if "AGE_AT_ENROLLMENT" in df.columns and "MONTHS_POSTINDEX" in df.columns:
        df["age_at_visit"] = (df["AGE_AT_ENROLLMENT"].astype(float) + df["MONTHS_POSTINDEX"].astype(float)/12.0).astype("float32")

    # ---- race one-hot (no sklearn)
    if "race" in df.columns:
        race_oh = pd.get_dummies(df["race"].astype("category"), prefix="race", dtype="float32")
        df = pd.concat([df.drop(columns=["race"]), race_oh], axis=1)

    # ---- infection status one-hot
    if "INFECTION_STATUS" in df.columns:
        inf_oh = pd.get_dummies(df["INFECTION_STATUS"].astype("category"), prefix="inf", dtype="float32")
        df = pd.concat([df, inf_oh], axis=1)

    # ---- optional: zip -> 3-digit prefix one-hot (avoid huge cardinality)
    if "ENROLL_ZIP_CODE" in df.columns:
        df["zip3"] = df["ENROLL_ZIP_CODE"].astype(str).str.slice(0, 3)
        zip3_oh = pd.get_dummies(df["zip3"].astype("category"), prefix="zip3", dtype="float32")
        df = pd.concat([df.drop(columns=["ENROLL_ZIP_CODE", "zip3"]), zip3_oh], axis=1)

    # Collect numeric feature columns (exclude id/date and remaining non-numeric)
    exclude = {id_col, date_col}
    non_numeric = set(df.select_dtypes(include=["object"]).columns)
    X_cols = [c for c in df.columns if c not in exclude and c not in non_numeric]

    # Light missing handling: numeric mean-impute
    for c in X_cols:
        if df[c].dtype.kind in "biufc":
            df[c] = df[c].astype("float32")
            if df[c].isna().any():
                df[c] = df[c].fillna(df[c].mean())

    X_df = df[[id_col, date_col] + X_cols].sort_values([id_col, date_col]).reset_index(drop=True)
    return X_df, X_cols

def to_patient_sequences(X_df: pd.DataFrame,
                         X_cols,
                         id_col="PARTICIPANT_ID",
                         date_col="VISIT_START_DATE"):
    """
    Turn the flat X_df into a list of per-patient numpy arrays (ragged sequences).
    """
    X_list = []
    for _, g in X_df.groupby(id_col):
        g = g.sort_values(date_col)
        X_list.append(g[X_cols].to_numpy(dtype="float32"))
    return np.array(X_list, dtype=object)


In [117]:
# dfassess = pd.read_csv(path+files[7], sep=',', encoding='latin1')
df_cleaned.columns#.head()
# df_cleaned.rename({'record_id':"PARTICIPANT_ID"},axis='columns',inplace=True)
# df_cleaned.to_csv('x_statistics.csv')

Index(['PARTICIPANT_ID', 'acute_yn', 'infect_yn', 'race___1', 'race___2',
       'race___3', 'race___4', 'race___5', 'race___6', 'race___7', 'race___15',
       'race____88', 'race_unique_an', 'dob', 'age_enroll', 'age_enrl_cat',
       'preg_cohort_yn', 'referral_type', 'spop___2', 'spop___3', 'spop___4',
       'spop___99', 'acute_reinf_ovr', 'rx_carelevel___0', 'rx_carelevel___1',
       'rx_carelevel___2', 'rx_carelevel___3', 'rx_carelevel___4',
       'rx_carelevel___98', 'rx_carelevel____88', 'ENROLL_DATE',
       'SEX_AT_BIRTH', 'AGE_AT_ENROLLMENT', 'ENROLL_ZIP_CODE', 'DECEASED'],
      dtype='object')

In [128]:
import numpy as np
import pandas as pd
import re

def suffix_to_int(col):
    m = re.search(r'(\d+)$', col)   # grab trailing digits after any underscores
    return int(m.group(1)) if m else None

def collapse_race_to_code(df, race_cols, multiracial_code=777, unknown_code=999, priority=None):
   # 0/1 clean-up
    tmp = df[race_cols].apply(pd.to_numeric, errors='coerce').fillna(0).clip(0,1).astype('int8')
    counts = tmp.sum(axis=1)

    # map col -> numeric code from its suffix
    col_to_code = {c: suffix_to_int(c) for c in race_cols}

    # default choice = column with max value (ties broken by first occurrence)
    choice_col = tmp.idxmax(axis=1)

    if priority:
        # If you want a specific tie-break order, override choice where multiple selected
        multi = counts > 1
        if multi.any():
            # pick the first column in 'priority' that is 1
            prio_arr = np.full(len(df), None, dtype=object)
            for c in priority:
                take = multi & (tmp[c] == 1) & pd.isna(prio_arr)
                prio_arr[take.values] = c
            # fall back to previous choice if somehow none matched
            choice_col = np.where(multi, prio_arr, choice_col)

    # map column names to numeric codes
    choice_code = pd.Series(choice_col, index=df.index).map(col_to_code)

    # assemble final race_code with special cases
    race_code = np.where(counts == 0, unknown_code,     # none selected
                np.where(counts > 1,  multiracial_code, # multiple selected
                         choice_code))                  # exactly one

    df = df.copy()
    
    df['race'] = pd.Series(race_code, index=df.index).astype('int32')
    return df
race_cols = ['race___1','race___2','race___3','race___4','race___5','race___6','race___7','race___15','race____88']

# Strategy A: mark multi as 777, none as 999
df_out = collapse_race_to_code(df_cleaned, race_cols, multiracial_code=777, unknown_code=999)


In [135]:
X_df.shape#X_df.columns
# (158685, 659)


(158685, 659)

In [131]:
# dfassess.head() record_id, ps_colldt, NQOL_CF_Tscore
# dfm = dfassess[['record_id', 'ps_colldt', 'NQOL_CF_Tscore']]
# dfm.rename({'record_id':'PARTICIPANT_ID', 'ps_colldt':'date'})
X_df, X_cols = make_X(dfinfe, df_out_drop)

# Optional: save a flat feature table for inspection
X_df.to_csv("X_visits.csv", index=False)

# Optional: convert to per-patient sequences (for the cKNN-LSH pipeline later)
X_obj = to_patient_sequences(X_df, X_cols)

In [130]:
# X_df.head()
# df_out.shape#()
# df_out['race_code']
print(df_out.shape)
df_out_drop = df_out.drop(race_cols, axis=1)
df_out_drop.shape

(15159, 36)


(15159, 27)

In [2]:
df_0 = pd.read_csv(path+files[0], sep='\t', encoding='latin1')
print(df_0.columns)

  df_0 = pd.read_csv(path+files[0], sep='\t', encoding='latin1')


Index(['PARTICIPANT_ID', 'VISIT_ID', 'VISIT_TYPE', 'DATA_ENTRY_DATE',
       'FORM_NAME', 'INSTANCE_NUM', 'FIELD_NAME', 'DATA_FIELD_NAME',
       'FIELD_TYPE', 'ANSWER_LABEL', 'ANSWER_NUMERIC_VAL', 'ANSWER_TEXT_VAL',
       'CONECPT_CD', 'CONCEPT_NAME'],
      dtype='object')


In [5]:
# forms = np.unique(df_0['FORM_NAME'])
# print(len(forms)) 
"pasc_symptoms" in forms
"vaccine_status" in forms
print("covid_treatment" in forms)

True


In [7]:
df_0pasc = df_0[df_0['FORM_NAME']=="pasc_symptoms"]
print(df_0pasc.shape)
df_0vacc = df_0[df_0['FORM_NAME']=="vaccine_status"]
print(df_0vacc.shape)
df_0covidtreat = df_0[df_0['FORM_NAME']=="covid_treatment"]
print(df_0covidtreat.shape)

(10919389, 14)
(342179, 14)
(194085, 14)


In [47]:
# df_0covidtreat[]

In [48]:
df_0vacc.head() #DATA_ENTRY_DATE 	DATA_FIELD_NAME
# np.unique(df_0vacc['DATA_FIELD_NAME'])
# df_0vacc[df_0vacc['DATA_FIELD_NAME']=='vacc_vaccdt_6'].head() #ANSWER_TEXT_VAL	 (26756, 14), 16,006, 13,239, 6760,2008,95
# df_0vacc[df_0vacc['DATA_FIELD_NAME']=='vacc_vacctype_1___1'].head() # ANSWER_LABEL (10132, 14), 16,006, 13,239,6760,2008,95
# df_0vacc[df_0vacc['DATA_FIELD_NAME']=='vacc_vaccyn_fu___1'].head() # ANSWER_LABEL No DATA_ENTRY_DATE


Unnamed: 0,PARTICIPANT_ID,VISIT_ID,VISIT_TYPE,DATA_ENTRY_DATE,FORM_NAME,INSTANCE_NUM,FIELD_NAME,DATA_FIELD_NAME,FIELD_TYPE,ANSWER_LABEL,ANSWER_NUMERIC_VAL,ANSWER_TEXT_VAL,CONECPT_CD,CONCEPT_NAME
27501692,RA12016,baseline_arm_1,enrollment,2022-01-01,vaccine_status,1,vacc_coord,vacc_coord___1,choice,Coordinator data entry,,,RC-RA1:vacc_coord___1|Inf|v0,Check this box if the coordinator is entering ...
27501693,RA112169,baseline_arm_1,enrollment,2022-01-01,vaccine_status,1,vacc_coord,vacc_coord___1,choice,Coordinator data entry,,,RC-RA1:vacc_coord___1|Inf|v0,Check this box if the coordinator is entering ...
27501694,RA1264,baseline_arm_1,enrollment,2022-01-01,vaccine_status,1,vacc_coord,vacc_coord___1,choice,Coordinator data entry,,,RC-RA1:vacc_coord___1|Inf|v0,Check this box if the coordinator is entering ...
27501695,RA17991,baseline_arm_1,enrollment,2022-01-01,vaccine_status,1,vacc_coord,vacc_coord___1,choice,Coordinator data entry,,,RC-RA1:vacc_coord___1|Inf|v0,Check this box if the coordinator is entering ...
27501696,RA18073,baseline_arm_1,enrollment,2022-01-01,vaccine_status,1,vacc_coord,vacc_coord___1,choice,Coordinator data entry,,,RC-RA1:vacc_coord___1|Inf|v0,Check this box if the coordinator is entering ...


In [66]:
import pandas as pd
import numpy as np
from typing import Optional, Sequence

VAX_NAMES = ["vacc_vaccdt_1","vacc_vaccdt_2","vacc_vaccdt_3","vacc_vaccdt_4"]

def _to_datetime(s: pd.Series) -> pd.Series:
    return pd.to_datetime(s, errors="coerce", utc=False).dt.tz_localize(None)

def extract_vaccine_dates(events_df: pd.DataFrame,
                          patient_col: str = "PARTICIPANT_ID",
                          field_col: str = "DATA_FIELD_NAME",
                          value_col: str = "ANSWER_TEXT_VAL",
                          keep_first: bool = True) -> pd.DataFrame:
    """
    From a long table of events, grab the earliest date per (patient, vccK).
    Returns wide table with columns: patient_id, vcc1_date, vcc2_date, vcc3_date, vcc4_date
    """
    df = events_df[[patient_col, field_col, value_col]].copy()
    df = df[df[field_col].isin(VAX_NAMES)].copy()
    df["vax_date"] = _to_datetime(df[value_col])
    df = df[~df["vax_date"].isna()].copy()

    agg_fn = "min" if keep_first else "max"
    wide = (
        df.groupby([patient_col, field_col])["vax_date"]
          .agg(agg_fn)
          .reset_index()
          .pivot(index=patient_col, columns=field_col, values="vax_date")
          .rename(columns={k: f"{k}_date" for k in VAX_NAMES if k in df[field_col].unique()})
          .reset_index()
    )
    for k in VAX_NAMES:
        col = f"{k}_date"
        if col not in wide.columns:
            wide[col] = pd.NaT
    return wide[[patient_col] + [f"{k}_date" for k in VAX_NAMES]]

def _same_month(a: pd.Series, b: pd.Series) -> pd.Series:
    a = pd.to_datetime(a, errors="coerce")
    b = pd.to_datetime(b, errors="coerce")
    return (a.dt.year == b.dt.year) & (a.dt.month == b.dt.month)

def add_vaccine_actions_to_panel(panel_df: pd.DataFrame,
                                 vax_dates: pd.DataFrame,
                                 patient_col: str = "PARTICIPANT_ID",
                                 date_col: str = "date",
                                 composite_any_name: str = "A_vax_any",
                                 composite_booster_name: str = "A_booster") -> pd.DataFrame:
    """
    Joins per-patient vaccine dates to a monthly panel and creates:
      - A_vcc1..A_vcc4: 1 if dose date falls in that month, else 0
      - A_vax_any: 1 if *any* of vcc1..vcc4 occurs that month
      - A_booster: 1 if vcc3 or vcc4 occurs that month
      - months_since_last_vaccine: numeric confounder
    """
    panel = panel_df.copy()
    panel[date_col] = _to_datetime(panel[date_col])

    merged = panel.merge(vax_dates, on=patient_col, how="left")

    for k in VAX_NAMES:
        cdate = f"{k}_date"
        acol = f"A_{k}"
        if cdate not in merged.columns:
            merged[acol] = 0
            continue
        merged[acol] = _same_month(merged[cdate], merged[date_col]).astype("int8")

    merged[composite_any_name] = (merged[[f"A_{k}" for k in VAX_NAMES]].sum(axis=1) > 0).astype("int8")
    merged[composite_booster_name] = (merged[["A_vcc3", "A_vcc4"]].sum(axis=1) > 0).astype("int8")

    vax_cols = [f"{k}_date" for k in VAX_NAMES if f"{k}_date" in merged.columns]
    def months_since(row):
        ref = row[date_col]
        best = pd.NaT
        for c in vax_cols:
            d = row[c]
            if pd.isna(d) or pd.isna(ref) or d > ref:
                continue
            if pd.isna(best) or d > best:
                best = d
        if pd.isna(best) or pd.isna(ref):
            return np.float32(999.0)
        return np.float32((ref - best).days / 30.0)

    merged["months_since_last_vaccine"] = merged.apply(months_since, axis=1)

    return merged

import pandas as pd
# from add_vaccine_actions import extract_vaccine_dates, add_vaccine_actions_to_panel

# 1) Long-format events table with vaccine records
#    Must contain: patient_id, DATA_FIELD_NAME (vcc1..vcc4), ANSWER_TEXT_VAL (date string)
# events = pd.read_csv("events_long.csv")

# 2) Monthly panel (one row per patient_id × month) with a 'date' column
# panel = pd.read_csv("panel_monthly.csv", parse_dates=["date"])

# 3) Get per-patient vaccine dates from long table
vax_dates = extract_vaccine_dates(
    events_df=df_0vacc,
    field_col="DATA_FIELD_NAME",
    value_col="ANSWER_TEXT_VAL",  # parsed to datetime
)

# 4) Add A_vcc1..A_vcc4, A_vax_any, A_booster, months_since_last_vaccine to your panel
# panel_aug = add_vaccine_actions_to_panel(
#     panel_df=panel,
#     vax_dates=vax_dates,
#     patient_col="DATA_FIELD_NAME",
#     date_col="date",
# )

# panel_aug.to_csv("panel_with_vaccine_actions.csv", index=False)

In [79]:
vax_dates.shape#(13074, 5)
vax_dates.head()
vax_dates.to_csv("vax_dates.csv",index=False)

In [71]:
import numpy as np

df_x = pd.read_csv(path+files[11], sep=',', encoding='latin1')
# df_x.head()
print(df_x.shape) # record_id, ps_colldt is the da


# te (126805, 704)
# for x in df_x.columns:
#     print(x, end=',')
# df_x.head() #(126805, 704)

  df_x = pd.read_csv(path+files[11], sep=',', encoding='latin1')


(126805, 704)


In [None]:
df_vaccine = df_0[df_0['FORM_NAME'=='vaccine_status']]
df_vaccine.head()

In [99]:
df_baseage = pd.read_csv(path+files[3], sep='\t', encoding='latin1')
print(df_baseage.columns)

Index(['PARTICIPANT_ID', 'ENROLL_PROTOCOL', 'ENROLL_SITE_ID',
       'ENROLL_HUB_SITE_ID', 'ENROLL_SITE_PATH', 'ENROLL_DATE',
       'ENROLL_CATEGORY', 'ENROLL_INDEX_DATE', 'CROSSOVER_FLAG',
       'CROSSOVER_INDEX_DATE', 'ONSTUDY_INFECTION', 'ONSTUDY_INFECTION_CNT',
       'SEX_AT_BIRTH', 'DOB', 'AGE_AT_ENROLLMENT', 'ENROLL_ZIP_CODE',
       'WITHDRAWN', 'WITHDRAW_DATE', 'DECEASED', 'DECEASED_DATE'],
      dtype='object')


In [133]:
# df_baseage.rename({"PARTICIPANT_ID":'record_id'}, axis='columns',inplace=True)
X_visit.shape

NameError: name 'X_visit' is not defined

In [100]:
# df_baseage.head() # 
# base_colums = ['PARTICIPANT_ID', "SEX_AT_BIRTH", "AGE_AT_ENROLLMENT","ENROLL_ZIP_CODE"]
# df_base = df_baseage[base_colums]#enroll_protocal adult
# df_base.head()
# print(np.unique(df_base['ENROLL_PROTOCOL']))
df_baseage.shape

(15179, 20)

In [95]:
# !pip install torch
# import torch
df_base = pd.read_csv(path+files[6], sep=',', encoding='latin1')
df_base.shape #(15159, 177)

  df_base = pd.read_csv(path+files[6], sep=',', encoding='latin1')


(15159, 177)

In [132]:
# merged_df.head()
# df = merged_df
# df.loc[df['SEX_AT_BIRTH'] == 'Female', 'SEX_AT_BIRTH'] = 0
# df.loc[df['SEX_AT_BIRTH'] != 1, 'SEX_AT_BIRTH'] = 1
# df.loc[df['DECEASED'] == 'N', 'DECEASED'] = 0
# df.loc[df['DECEASED'] != 'N', 'DECEASED'] = 1

# df_dropped_multiple = df.drop(['infect_yn_anti_f', 'enroll_dt','index_dt'], axis=1)
# print(df_dropped_multiple.shape)
# threshold_non_nan = len(df) - 100 

# df_cleaned = df_dropped_multiple.dropna(axis=1, thresh=threshold_non_nan)
# print(df_cleaned.shape)
# df_cleaned.columns
# df_cleaned.head()
df_out_drop.to_csv("x_statics.csv",index=False)
# ['record_id', 'acute_yn', 'infect_yn', 'race___1', 'race___2',
#        'race___3', 'race___4', 'race___5', 'race___6', 'race___7', 'race___15',
#        'race____88', 'race_unique_an', 'dob', 'age_enroll', 'age_enrl_cat',
#        'preg_cohort_yn', 'referral_type', 'spop___2', 'spop___3', 'spop___4',
#        'spop___99', 'acute_reinf_ovr', 'rx_carelevel___0', 'rx_carelevel___1',
#        'rx_carelevel___2', 'rx_carelevel___3', 'rx_carelevel___4',
#        'rx_carelevel___98', 'rx_carelevel____88', 'ENROLL_DATE',
#        'SEX_AT_BIRTH', 'AGE_AT_ENROLLMENT', 'ENROLL_ZIP_CODE', 'DECEASED'],
#       dtype='object')

In [114]:
# len(np.unique(df_base['record_id']))
# merged_df = pd.merge(df_base, df_baseage[['record_id', 'ENROLL_DATE','SEX_AT_BIRTH','AGE_AT_ENROLLMENT','ENROLL_ZIP_CODE','DECEASED']], on='record_id', how='left')
# merged_df.to_csv("x_statics.csv",index=False)
# merged_df.head()
# #ENROLL_ZIP_CODE,infect_yn_anti_f, SEX_AT_BIRTH,AGE_AT_ENROLLMENT
# merged_df.head()
# df.loc[df['TargetColumn'] == 'Yes', 'TargetColumn'] = 1
# df.loc[df['TargetColumn'] != 1, 'TargetColumn'] = 0

In [96]:
df_base.head() #index_dt, infect_yn, record_id, acute_yn, 

Unnamed: 0,record_id,acute_yn,infect_yn,infect_yn_anti_f,index_dt,enroll_dt,race___1,race___2,race___3,race___4,...,rx_carelevel___0,rx_carelevel___1,rx_carelevel___2,rx_carelevel___3,rx_carelevel___4,rx_carelevel___98,rx_carelevel____88,Spike,Nucleocapsid,OVER_89_FLAG
0,RA11305,0.0,1,Infected,2020-01-11,2021-01-01,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,
1,RA16763,0.0,1,Infected,2019-04-08,2021-01-01,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,,
2,RA111690,0.0,1,Infected,2020-11-05,2022-01-01,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,
3,RA111095,0.0,1,Infected,2021-04-21,2022-01-01,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,
4,RA13123,0.0,1,Infected,2020-04-05,2022-01-01,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,


In [12]:
for x in df_base.columns:
    print(x, end=',')

record_id,acute_yn,infect_yn,infect_yn_anti_f,index_dt,enroll_dt,race___1,race___2,race___3,race___4,race___5,race___6,race___7,race___15,race____88,race_unique_an,biosex,dob,age_enroll,age_enrl_cat,preg_cohort_yn,cc_anxdep_base,cc_asthma_base,cc_autoimm_base,cc_bipolar_base,cc_cancer_base,cc_cfs_base,cc_clung_base,cc_cns_base,cc_copd_base,cc_cvd_base,cc_dementia_base,cc_diabetes_base,cc_fibromyalgia_base,cc_imm_base,cc_liver_base,cc_move_base,cc_nmusc_base,cc_o2home_base,cc_obesity_base,cc_othermh_base,cc_polyov_base,cc_pots_base,cc_renal_base,cc_seiz_base,cc_sickle_base,cc_stroke_base,cc_cvdspec___1_base,cc_cvdspec___2_base,cc_cvdspec___3_base,cc_cvdspec___4_base,cc_cvdspec___5_base,cc_cvdspec___6_base,cc_cvdspec___7_base,cc_cvdspec___8_base,cc_cvdspec___98_base,cc_cvdspec____88_base,cc_autoimmspec___1_base,cc_autoimmspec___2_base,cc_autoimmspec___3_base,cc_autoimmspec___23_base,cc_autoimmspec___4_base,cc_autoimmspec___5_base,cc_autoimmspec___6_base,cc_autoimmspec___7_base,cc_autoimm

In [72]:
df_y = pd.read_csv(path+files[8], sep=',', encoding='latin1')
df_y.head() #record_id, visit_dt,index_dt_curr, pasc_score_2024#, pasc_cc_2023, pasc_cc_2024
dfycolums = ['record_id','visit_dt','pasc_score_2024']
df_yse = df_y[dfycolums]

In [75]:
df_yse.shape #(126804, 3)

df_yse.to_csv('y.csv',index=False)

In [14]:
for x in df_y.columns:
    print(x, end = ",")

record_id,redcap_event_name,visit_dt,visit_month_curr,visit_month,instance_curr,instance,index_dt_curr,index_dt,infect_yn_curr,newinf_yn,newinf_dt,n_newinf,days_visit_indexcurr,days_visit_index,visit_missed,pasc_score_tf_2023,pasc_score_tf_2024,pasc_score_2023,pasc_score_2024,pasc_cc_2023,pasc_cc_2024,pasc_pg2023,pasc_pg2024,pasc_jama2023,pasc_jama2024,

In [84]:
df_yse.head()

Unnamed: 0,record_id,visit_dt,pasc_score_2024
0,RA11305,2021-01-01,16.0
1,RA11305,2021-04-03,14.0
2,RA11305,2021-06-28,8.0
3,RA11305,2021-09-28,13.0
4,RA11305,2021-12-23,10.0


In [89]:
# df_yse.rename({'record_id':'PARTICIPANT_ID', 'visit_dt':'date'},axis='columns',  inplace=True)
# df_yse.head()
df_yse.to_csv('y_pasc_score2024.csv',index=False)

In [137]:
dfm.shape #(126805, 3)
df_yse.shape #(126804, 3)

(126804, 3)

In [90]:
dfm.rename({'record_id':'PARTICIPANT_ID', 'ps_colldt':'date'},axis='columns',  inplace=True)
dfm.to_csv('y_nqol.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfm.rename({'record_id':'PARTICIPANT_ID', 'ps_colldt':'date'},axis='columns',  inplace=True)
