In [22]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2
import numpy as np
import scipy.spatial
import pandas as pd
import sklearn.decomposition
import matplotlib.pyplot as plt
# import keras
from sklearn import preprocessing
from sklearn.decomposition import PCA
# %matplotlib inline
# Dimension reduction and clustering libraries
# import umap
# import hdbscan
import sys
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.model_selection import LeaveOneOut,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import pairwise_distances,mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
sys.path.insert(0, '../utils/') 
from readProfiles import *
from pred_models import *
from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

# from utils import networksEvol, tsne, readProfiles
# import umap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Output directory for the files written by this notebook; created if absent.
RESULTS_DIR = Path("../results/Jiwoo/Genepanel")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Directory holding the ranked gene lists (genes_ranked_<dataset>.npy).
GENE_DIR = Path("./gene_sets")

# Gene-panel sizes to sweep; "all" means the complete ranked list.
TOP_N_LIST = [5, 10, 15, 20, 25, 30, 50, 75, 100, 150, 200, 300, 400, "all"]

# First argument handed to read_paired_treatment_level_profiles.
procProf_dir='../'
#results_dir='../results/'

# Same concept as the paper
profileType = "normalized_variable_selected"
filter_perts = "highRepUnion"
repCorrFilePath = "../results/RepCor/RepCorrDF.xlsx"
# Passed as-is to read_paired_treatment_level_profiles.
filter_repCorr_params = [filter_perts, repCorrFilePath]
pertColName = "PERT"
moa_col = "Metadata_MoA"
# An MoA is kept only when its compound count is strictly greater than this.
# NOTE(review): the previous comment read '"more than 4 compounds" -> size > 4',
# which does not match the value 9 -- confirm which threshold is intended.
nSamplesMOA = 9  # threshold used as: size > nSamplesMOA

In [None]:
# =========================
# 1) helpers
# =========================
def load_ranked_genes(dataset: str):
    """Load the ranked gene list stored for ``dataset``.

    The file read is ``genes_ranked_<dataset>.npy`` inside ``GENE_DIR``
    (e.g. ``genes_ranked_LINCS.npy`` or ``genes_ranked_CDRP-bio.npy``).
    The array is returned exactly as ``np.load`` produces it.

    NOTE(review): ``allow_pickle=True`` lets ``np.load`` unpickle arbitrary
    objects, so this must only be pointed at trusted files.
    """
    ranked_path = GENE_DIR.joinpath(f"genes_ranked_{dataset}.npy")
    return np.load(ranked_path, allow_pickle=True)

def harmonize_moa_and_compounds(merg: pd.DataFrame, dataset: str) -> pd.DataFrame:
    """Return a copy of ``merg`` with a unified MoA column and a compound id column.

    * ``LINCS``: ``moa_col`` is ``Metadata_moa``; rows where that is missing
      fall back to the lower-cased ``moa`` column.
    * ``CDRP-bio``: ``moa_col`` is the lower-cased ``Metadata_moa``.

    In both cases ``Compounds`` is the first 13 characters of ``pertColName``.

    Missing annotations stay missing (NaN). Casting with ``astype(str)`` alone
    would turn them into the literal string ``"nan"``, which would then be
    counted as a real MoA class by any later ``groupby`` / ``isin`` filtering.

    Parameters
    ----------
    merg : pd.DataFrame
        Must contain ``Metadata_moa`` and ``pertColName`` (plus ``moa`` for LINCS).
    dataset : str
        ``"LINCS"`` or ``"CDRP-bio"``.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``"LINCS"`` nor ``"CDRP-bio"``.
    """
    if dataset not in ("LINCS", "CDRP-bio"):
        raise ValueError(f"dataset must be 'LINCS' or 'CDRP-bio'. got: {dataset}")

    def _lower_keep_missing(labels: pd.Series) -> pd.Series:
        # Lower-case the labels, then blank out positions that were missing
        # so they do not survive as the string "nan"/"none".
        return labels.astype(str).str.lower().where(labels.notna())

    merg = merg.copy()
    if dataset == "LINCS":
        no_primary = merg["Metadata_moa"].isnull()
        merg[moa_col] = merg["Metadata_moa"]
        merg.loc[no_primary, moa_col] = _lower_keep_missing(merg.loc[no_primary, "moa"])
    else:
        merg[moa_col] = _lower_keep_missing(merg["Metadata_moa"])

    merg["Compounds"] = merg[pertColName].astype(str).str[0:13]
    return merg

def scale_modality(df: pd.DataFrame, feat_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df`` with ``feat_cols`` standardized, then min-max scaled.

    Each feature column goes through ``StandardScaler`` followed by
    ``MinMaxScaler`` with range (0, 1), following the paper's flow. Both
    scalers are fit on every row of ``df``; columns outside ``feat_cols``
    are left untouched, and the input frame itself is not modified.
    """
    scaled = df.copy()
    features = scaled[feat_cols].values.astype("float64")
    standardized = preprocessing.StandardScaler().fit_transform(features)
    unit_range = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(standardized)
    scaled.loc[:, feat_cols] = unit_range
    return scaled

def select_moas_by_compound_count(merg: pd.DataFrame) -> tuple[pd.DataFrame, preprocessing.LabelEncoder]:
    """Keep only well-populated, single-label MoAs and attach integer labels.

    Following the paper:
    - draw one row per ``Compounds`` value and count compounds per MoA,
    - keep MoAs whose count is strictly greater than ``nSamplesMOA``,
    - discard multi-label MoAs (string labels containing ``|``),
    - encode the remaining MoAs with a ``LabelEncoder`` into ``Metadata_moa_num``.

    Returns the filtered frame (index reset) and the fitted encoder.
    """
    one_row_per_compound = merg.groupby(["Compounds"]).sample(1, random_state=0)
    moa_sizes = one_row_per_compound.groupby([moa_col]).size().reset_index(name="size")
    moa_sizes = moa_sizes.sort_values("size", ascending=False)

    frequent = moa_sizes.loc[moa_sizes["size"] > nSamplesMOA, moa_col].tolist()

    # Split the frequent MoAs into multi-label ones (dropped) and the rest (kept).
    multi, selected = [], []
    for moa in frequent:
        if isinstance(moa, str) and "|" in moa:
            multi.append(moa)
        else:
            selected.append(moa)

    keep_rows = merg[moa_col].isin(selected)
    df_all = merg[keep_rows].reset_index(drop=True).copy()

    le = preprocessing.LabelEncoder()
    le.fit(selected)
    df_all["Metadata_moa_num"] = le.transform(df_all[moa_col].tolist())

    print(f"[INFO] MoA kept: {len(selected)} (removed multilabel: {len(multi)})")
    print(f"[INFO] filtered rows: {df_all.shape[0]} | compounds: {df_all['Compounds'].nunique()} | MoAs: {df_all['Metadata_moa_num'].nunique()}")
    return df_all, le

def make_mlp_grid():
    """Build an unfitted ``GridSearchCV`` around an ``MLPClassifier``.

    The grid is kept close to the paper's code: three hidden-layer widths,
    two activations, two ``alpha`` values and two learning-rate schedules.
    The base network uses ``random_state=5`` and a generous ``max_iter=600``;
    the search runs 3-fold CV on 4 jobs.
    """
    return GridSearchCV(
        MLPClassifier(random_state=5, max_iter=600),
        dict(
            hidden_layer_sizes=[(100,), (200,), (400,)],
            activation=["tanh", "relu"],
            alpha=[0.0001, 0.05],
            learning_rate=["constant", "adaptive"],
        ),
        n_jobs=4,
        cv=3,
    )

def proba_align(prob_src, classes_src, classes_ref):
    """Reorder the columns of ``prob_src`` to follow ``classes_ref``.

    ``prob_src[:, i]`` holds the probability of ``classes_src[i]``. The result
    has one column per entry of ``classes_ref``; a reference class that does
    not occur in ``classes_src`` gets an all-zero column.
    """
    column_of = {label: pos for pos, label in enumerate(classes_src)}
    aligned = np.zeros((prob_src.shape[0], len(classes_ref)), dtype=float)

    # Collect matching (destination, source) column pairs, then copy in one go.
    dst_cols, src_cols = [], []
    for dst, label in enumerate(classes_ref):
        src = column_of.get(label)
        if src is not None:
            dst_cols.append(dst)
            src_cols.append(src)

    if dst_cols:
        aligned[:, dst_cols] = prob_src[:, src_cols]
    return aligned

In [26]:
# =========================
# 2) main runner
# =========================
def _fit_predict_fold(X, y_tr, tr, te):
    """Oversample the training rows of ``X``, grid-search an MLP, predict the test rows.

    Returns ``(predicted labels, class probabilities, class order of the probabilities)``.
    """
    # Oversampling as in the paper; the fixed seed makes every call reproducible.
    ros = RandomOverSampler(sampling_strategy="not majority", random_state=5)
    X_res, y_res = ros.fit_resample(X[tr], y_tr)
    model = make_mlp_grid()
    model.fit(X_res, y_res)
    return model.predict(X[te]), model.predict_proba(X[te]), model.best_estimator_.classes_


def run_dataset(dataset: str):
    """Run the TOP_N gene-panel sweep of MoA classification for one dataset.

    For every entry of ``TOP_N_LIST`` and every cross-validation fold, four
    predictors are scored with F1: CP only, GE only (top-N ranked genes),
    early fusion (concatenated features) and late fusion (mean of the CP and
    GE class probabilities).

    The CP model does not depend on ``TOP_N``. Because the fold splits and all
    seeds are fixed, it is trained once per fold and reused for every
    ``TOP_N`` instead of being re-fitted each time.

    Parameters
    ----------
    dataset : str
        ``"LINCS"`` (5 folds, weighted F1) or any other supported name such as
        ``"CDRP-bio"`` (10 folds, macro F1).

    Returns
    -------
    (summary_df, pred_df)
        ``summary_df`` has one row per (top_n, fold, modality) with its F1;
        ``pred_df`` has the per-sample predictions of every modality.

    Raises
    ------
    ValueError
        If no ``TOP_N`` setting matched any gene column, so nothing was evaluated.

    Side effects
    ------------
    Writes ``summary_<dataset>_MLP_TOPN_sweep.csv`` and
    ``pred_moa_<dataset>_MLP_TOPN_sweep.xlsx`` into ``RESULTS_DIR``.
    """
    print(f"\n========== RUN {dataset} ==========")

    # Fold count follows the paper.
    n_splits = 5 if dataset == "LINCS" else 10
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=1)

    # In the paper's figures LINCS uses weighted F1 while the CDRP-bio code
    # uses macro in places; match that per dataset.
    avg = "weighted" if dataset == "LINCS" else "macro"

    # Load profiles with the paper's reader.
    # NOTE(review): presumably returns (merged frame, CP feature names,
    # L1000 feature names); the meaning of the trailing 1 is not visible here.
    merg, cp_features, l1k_features_full = read_paired_treatment_level_profiles(
        procProf_dir, dataset, profileType, filter_repCorr_params, 1
    )
    merg = harmonize_moa_and_compounds(merg, dataset)

    # Per-modality tables, scaled independently.
    # NOTE(review): the scalers are fit on all rows before the CV split, so
    # test-fold rows influence the scaling; kept to follow the paper's flow.
    cp_cols = list(cp_features)
    ge_cols = list(l1k_features_full)
    meta_cols = [pertColName, "Compounds", moa_col]
    cp_s = scale_modality(merg[meta_cols + cp_cols].copy(), cp_cols)
    ge_s = scale_modality(merg[meta_cols + ge_cols].copy(), ge_cols)

    merged_scaled = pd.concat([cp_s, ge_s], axis=1)
    merged_scaled = merged_scaled.loc[:, ~merged_scaled.columns.duplicated()].copy()
    merged_scaled["Compounds"] = merged_scaled[pertColName].astype(str).str[0:13]

    # Keep only sufficiently populated, single-label MoAs.
    df_all, _ = select_moas_by_compound_count(merged_scaled)
    y_all = df_all["Metadata_moa_num"].values
    groups = df_all["Compounds"].values
    pert_ids = df_all[pertColName].values

    # CP features are independent of TOP_N, so extract them once.
    X_cp = df_all[cp_cols].values.astype("float64")

    # The splitter only uses y and groups (X just supplies the sample count)
    # and has a fixed seed, so the folds are identical for every TOP_N.
    folds = list(sgkf.split(X_cp, y_all, groups=groups))
    cp_by_fold = {}  # fold -> (pred, prob, classes) of the CP-only model

    genes_ranked = load_ranked_genes(dataset)

    all_rows_summary = []  # per-fold F1 records
    all_preds_dump = []    # per-fold prediction tables (for the Excel dump)

    for TOP_N in TOP_N_LIST:
        if TOP_N == "all":
            wanted = genes_ranked.tolist()
            topn_tag = "all"
        else:
            wanted = genes_ranked[:int(TOP_N)].tolist()
            topn_tag = str(TOP_N)

        # Use only genes that are actual columns (avoids KeyError).
        available = [g for g in wanted if g in df_all.columns]
        missing = sorted(set(wanted) - set(available))
        if len(available) == 0:
            print(f"[WARN] TOP_N={TOP_N}: no genes matched columns. skip.")
            continue
        if len(missing) > 0:
            print(f"[TOP_N={TOP_N}] using {len(available)} genes (missing {len(missing)}) ex_missing={missing[:3]}")

        X_ge = df_all[available].values.astype("float64")
        X_early = np.concatenate([X_cp, X_ge], axis=1)

        for fold, (tr, te) in enumerate(folds, start=1):
            y_tr, y_te = y_all[tr], y_all[te]

            # CP: train once per fold, then reuse across the TOP_N sweep.
            if fold not in cp_by_fold:
                cp_by_fold[fold] = _fit_predict_fold(X_cp, y_tr, tr, te)
            pred_cp, prob_cp, cls_cp = cp_by_fold[fold]

            # GE and early fusion depend on the selected gene panel.
            pred_ge, prob_ge, cls_ge = _fit_predict_fold(X_ge, y_tr, tr, te)
            pred_ef, _prob_ef, _cls_ef = _fit_predict_fold(X_early, y_tr, tr, te)

            # Late fusion: average the probabilities after aligning GE's
            # class order to CP's.
            prob_ge_aligned = proba_align(prob_ge, cls_ge, cls_cp)
            prob_lf = (prob_cp + prob_ge_aligned) / 2.0
            pred_lf = cls_cp[np.argmax(prob_lf, axis=1)]

            predictions = {
                "CP": pred_cp,
                "GE": pred_ge,
                "Early Fusion": pred_ef,
                "Late Fusion": pred_lf,
            }
            for modality, pred in predictions.items():
                all_rows_summary.append(
                    dict(dataset=dataset, top_n=topn_tag, fold=fold, modality=modality,
                         f1=f1_score(y_te, pred, average=avg))
                )

            # Prediction dump, so F1 can be recomputed / plotted from Excel later.
            # df_all has a fresh 0..n-1 index, so positional indexing is exact.
            dump = pd.DataFrame({
                "dataset": dataset,
                "top_n": topn_tag,
                "fold": fold,
                "PERT": pert_ids[te],
                "Compounds": groups[te],
                "Metadata_moa_num": y_te,
                **predictions,
            })
            all_preds_dump.append(dump)

    if not all_preds_dump:
        # pd.concat([]) would fail with an uninformative message.
        raise ValueError(
            f"run_dataset({dataset!r}): no TOP_N setting matched any gene column; nothing to save."
        )

    # ===== save =====
    summary_df = pd.DataFrame(all_rows_summary)
    pred_df = pd.concat(all_preds_dump, ignore_index=True)

    summary_path = RESULTS_DIR / f"summary_{dataset}_MLP_TOPN_sweep.csv"
    pred_path = RESULTS_DIR / f"pred_moa_{dataset}_MLP_TOPN_sweep.xlsx"

    summary_df.to_csv(summary_path, index=False)
    pred_df.to_excel(pred_path, index=False)

    print(f"[SAVED] {summary_path}")
    print(f"[SAVED] {pred_path}")
    return summary_df, pred_df


In [None]:
# =========================
# 3) run both datasets
# =========================
# procProf_dir is taken from the value already set in this notebook (as in the paper's code), e.g.
# procProf_dir = "/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/"
# -> it must point at the workspace of your own environment.

# Each call runs the full TOP_N sweep and returns (fold-level F1 table, prediction table).
sum_lincs, pred_lincs = run_dataset("LINCS")
sum_cdrp,  pred_cdrp  = run_dataset("CDRP-bio")




  cp_data_repLevel = pd.read_csv(


LINCS: Replicate Level Shapes (nSamples x nFeatures): cp:  52223 , 119 ,  l1k:  27837 , 978
l1k n of rep:  3.0
cp n of rep:  5.0
CP: from  9394  to  4647
l1k: from  8369  to  2338
CP and l1k high rep union:  5845
Treatment Level Shapes (nSamples x nFeatures+metadata): (5243, 122) (4431, 980) Merged Profiles Shape: (3828, 1101)
[INFO] MoA kept: 15 (removed multilabel: 0)
[INFO] filtered rows: 806 | compounds: 258 | MoAs: 15




In [None]:
# Persist the sweep results: fold-level scores, an aggregated summary, and an
# Excel workbook holding both.
out_dir = RESULTS_DIR  # same location as the per-dataset files written by run_dataset
out_dir.mkdir(parents=True, exist_ok=True)

# `res` and `summary` were not defined by any cell of this notebook, so this
# cell failed on a fresh kernel. Build them from the run_dataset outputs.
# NOTE(review): the summary layout (mean / std / fold count of F1 per
# dataset, top_n and modality) is an assumption -- confirm it matches the
# table that was originally intended.
res = pd.concat([sum_lincs, sum_cdrp], ignore_index=True)
summary = (
    res.groupby(["dataset", "top_n", "modality"], sort=False)["f1"]
       .agg(f1_mean="mean", f1_std="std", n_folds="count")
       .reset_index()
)

# 1) raw fold-level results
res_path_csv = out_dir / "moa_topN_fold_results.csv"
res.to_csv(res_path_csv, index=False)

# 2) summary table
summary_path_csv = out_dir / "moa_topN_summary.csv"
summary.to_csv(summary_path_csv, index=False)

# 3) Excel (raw + summary in separate sheets)
xlsx_path = out_dir / "moa_topN_results.xlsx"
with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
    res.to_excel(writer, sheet_name="fold_results", index=False)
    summary.to_excel(writer, sheet_name="summary", index=False)

print("[SAVED]", res_path_csv)
print("[SAVED]", summary_path_csv)
print("[SAVED]", xlsx_path)