In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier,Perceptron
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import GridSearchCV,cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier  
from sklearn.compose import ColumnTransformer

RANDOM_STATE = 42


import warnings
warnings.filterwarnings("ignore")

In [2]:
def rank_transform_df(df):
    """Replace each feature column by its within-column rank, scaled into (0, 1].

    Every column except 'class' is ranked independently down the rows
    (smallest value -> rank 1) and divided by the largest rank in that
    column. Ranks come from a double argsort, so tied values receive
    distinct ranks rather than a shared one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'class' column; all other columns are treated as features.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; the feature columns (original order) hold the
        scaled ranks and 'class' is appended as the last column, unchanged.
    """
    feature_columns = df.columns.drop('class')
    raw_values = df.drop(columns='class').values

    # argsort gives the sorting permutation; argsort of that permutation gives
    # each row's 0-based position in the sorted order. +1 makes it 1-based.
    sort_order = np.argsort(raw_values, axis=0)
    ranks = np.argsort(sort_order, axis=0) + 1

    # Scale by the per-column maximum rank so every column tops out at 1.0.
    scaled = ranks / np.max(ranks, axis=0, keepdims=True)

    out = pd.DataFrame(scaled, index=df.index, columns=feature_columns)
    out['class'] = df['class'].values
    return out

In [3]:
import json

# Per-disease gene panels, read from JSON and looked up by disease code below.
with open('results/shap_top10.json', 'r') as fh:
    shap_top10 = json.load(fh)

# Both expression tables are tab-separated; the disease label column is
# renamed to 'class', the name the rest of the notebook works with.
label_rename = {'disease_code_level2': 'class'}
df_train = pd.read_csv('data/df_train.txt', sep='\t').rename(columns=label_rename)
df_test = pd.read_csv('data/df_test.txt', sep='\t').rename(columns=label_rename)


In [4]:

def _binary_disease_frames(expr_df, gene_panels, control_label='CTRL'):
    """Build one disease-vs-control frame per gene panel.

    Parameters
    ----------
    expr_df : pandas.DataFrame
        Table with 'ID', 'class' and the gene columns named in ``gene_panels``.
    gene_panels : dict
        Maps disease label -> list of gene column names. An entry keyed by
        ``control_label`` is skipped.
    control_label : str
        Value of 'class' that marks control rows.

    Returns
    -------
    dict
        Maps disease label -> DataFrame indexed by 'ID', holding 'class'
        (1 = disease, 0 = control) followed by that disease's gene columns.
        Keys follow the iteration order of ``gene_panels``.
    """
    frames = {}
    # Iterate the dict itself rather than `keys() - {...}`: a set has no stable
    # iteration order between interpreter runs, which made the order of the
    # resulting dicts (and of every printed result) non-reproducible.
    for disease, genes in gene_panels.items():
        if disease == control_label:
            continue
        keep_rows = expr_df['class'].isin([disease, control_label])
        # Explicit copy so the label rewrite below never writes into a slice of expr_df.
        subset = expr_df.loc[keep_rows, ['ID', 'class'] + genes].copy()
        subset['class'] = (subset['class'] == disease).astype('int64')
        frames[disease] = subset.set_index('ID')
    return frames


df_disease_train_shap = _binary_disease_frames(df_train, shap_top10)
df_disease_test_shap = _binary_disease_frames(df_test, shap_top10)

# Train and test rows stacked per disease (train rows first).
sza_disease_shap = {
    disease: pd.concat(
        [df_disease_train_shap[disease], df_disease_test_shap[disease]], axis=0
    )
    for disease in df_disease_train_shap
}

# test on the internal cohort

In [5]:
# smote
from imblearn.over_sampling import SMOTE


def _smote_train_frame(frame, disease):
    """Apply SMOTE to one training frame and relabel its rows.

    ``frame`` must contain a 'class' column; all other columns are passed to
    SMOTE as features. Every row of the result (not only newly generated
    ones) gets an index of the form '<disease>_SMOTE_<position>'.
    """
    features = frame.drop(columns=['class'])
    labels = frame['class']

    features_res, labels_res = SMOTE(random_state=42).fit_resample(features, labels)

    resampled = features_res.copy()
    resampled['class'] = labels_res
    resampled.index = [f"{disease}_SMOTE_{i}" for i in range(len(resampled))]
    return resampled


df_disease_train_shap_smote = {
    disease: _smote_train_frame(frame, disease)
    for disease, frame in df_disease_train_shap.items()
}

In [6]:
# Rank-transform the resampled training frames and the test frames.
# Each frame is ranked on its own rows only.
df_disease_train_shap_smote_ranked = {
    key: rank_transform_df(df)
    for key, df in df_disease_train_shap_smote.items()
}

df_disease_test_shap_ranked = {
    key: rank_transform_df(df)
    for key, df in df_disease_test_shap.items()
}

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,balanced_accuracy_score
import pandas as pd
import os


def evaluate_models_for_each_disease(train_data_dict, test_data_dict):
    """Fit four classifiers per disease and score them on the matching test frame.

    Parameters
    ----------
    train_data_dict : dict
        Maps disease name -> DataFrame with a 'class' column plus feature columns.
    test_data_dict : dict
        Same layout; looked up with every key of ``train_data_dict``.

    Returns
    -------
    dict
        Maps disease name -> DataFrame with one row per model and the columns
        Model, Accuracy, Balanced Accuracy, F1 Score (weighted), AUC, Disease
        and Features Used.
    """
    def _plain(estimator):
        # Estimator used directly on the features, no preprocessing step.
        return Pipeline([("model", estimator)])

    def _standardised(estimator):
        # Estimator preceded by feature standardisation.
        return Pipeline([("scaler", StandardScaler()), ("model", estimator)])

    all_results = {}

    for disease_name, train_df in train_data_dict.items():
        test_df = test_data_dict[disease_name]

        x_train = train_df.drop(columns='class').values
        y_train = train_df['class'].values
        x_test = test_df.drop(columns='class').values
        y_test = test_df['class'].values

        # (rows - sum of labels) / sum of labels; assumes 0/1 labels, where
        # this is the negative-to-positive ratio.
        label_sum = sum(y_train)
        pos_weight = (len(y_train) - label_sum) / label_sum

        models = {
            "XGBoost": _plain(XGBClassifier(
                objective='binary:logistic',
                eval_metric='auc',
                use_label_encoder=False,
                scale_pos_weight=pos_weight,
                random_state=42
            )),
            "Random Forest": _plain(RandomForestClassifier(
                class_weight='balanced',
                random_state=42
            )),
            "Logistic Regression": _standardised(LogisticRegression(
                class_weight='balanced',
                random_state=42
            )),
            "SVM": _standardised(SVC(
                kernel='rbf',
                class_weight='balanced',
                probability=True,
                random_state=42
            )),
        }

        rows = []
        for model_name, pipeline in models.items():
            pipeline.fit(x_train, y_train)

            y_pred = pipeline.predict(x_test)
            # Second probability column is the score used for AUC.
            y_proba = pipeline.predict_proba(x_test)[:, 1]

            rows.append({
                "Model": model_name,
                "Accuracy": accuracy_score(y_test, y_pred),
                "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
                "F1 Score": f1_score(y_test, y_pred, average="weighted"),
                "AUC": roc_auc_score(y_test, y_proba),
                "Disease": disease_name,
                "Features Used": x_train.shape[1]
            })

        all_results[disease_name] = pd.DataFrame(rows)

    return all_results


# Train on the SMOTE-resampled, rank-transformed training frames and score on
# the rank-transformed internal test frames.
all_model_results = evaluate_models_for_each_disease(
    train_data_dict=df_disease_train_shap_smote_ranked,
    test_data_dict=df_disease_test_shap_ranked,
)

print(all_model_results)


{'SCZ':                  Model  Accuracy  Balanced Accuracy  F1 Score       AUC  \
0              XGBoost  0.705128           0.833333  0.758349  0.983897   
1        Random Forest  0.833333           0.905797  0.859613  0.981481   
2  Logistic Regression  0.769231           0.869565  0.809615  0.998390   
3                  SVM  0.807692           0.891304  0.839673  0.974235   

  Disease  Features Used  
0     SCZ             10  
1     SCZ             10  
2     SCZ             10  
3     SCZ             10  , 'LUC':                  Model  Accuracy  Balanced Accuracy  F1 Score       AUC  \
0              XGBoost  0.771084           0.862319  0.799080  0.987578   
1        Random Forest  0.819277           0.891304  0.839779  0.982402   
2  Logistic Regression  0.807229           0.884058  0.829637  0.993789   
3                  SVM  0.843373           0.905797  0.860060  0.980331   

  Disease  Features Used  
0     LUC             10  
1     LUC             10  
2     LUC       

# test on independent cohorts

In [10]:
# smote
# Apply SMOTE to each combined (train + test) disease frame. Every row of the
# result is re-indexed as '<disease>_<position>'.

sza_disease_shap_smote = {}

for disease, combined in sza_disease_shap.items():
    features_res, labels_res = SMOTE(random_state=42).fit_resample(
        combined.drop(columns='class'),
        combined['class'],
    )

    # 'class' is a Python keyword, hence the dict-unpacking form of assign.
    balanced = features_res.assign(**{'class': labels_res})
    balanced.index = [f"{disease}_{i}" for i in range(len(balanced))]

    sza_disease_shap_smote[disease] = balanced

In [11]:

# Rank-transformed version of every resampled combined frame.
sza_disease_shap_smote_ranked = {}
for disease, balanced_df in sza_disease_shap_smote.items():
    sza_disease_shap_smote_ranked[disease] = rank_transform_df(balanced_df)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,balanced_accuracy_score
import pandas as pd
import os

def run_models(disease_cohort, disease_name,cohort_number):
    """Train four classifiers on the resampled internal data and score an external cohort.

    Training rows come from the module-level ``sza_disease_shap_smote[disease_name]``,
    restricted to the feature columns that also exist in ``disease_cohort`` and then
    rank-transformed here. ``disease_cohort`` is used as given: it is not
    rank-transformed inside this function.

    Parameters
    ----------
    disease_cohort : pandas.DataFrame
        Test frame with a 'class' column plus feature columns.
    disease_name : str
        Key into ``sza_disease_shap_smote``; also used in the output path and
        the printed header.
    cohort_number : str
        Cohort identifier, used only to build the output directory name.

    Returns
    -------
    pandas.DataFrame
        One row per model with Model, Accuracy, Balanced Accuracy, F1 Score
        (weighted), AUC and Features Used.

    Side effects
    ------------
    Writes ``results/public_data_auc/<disease>_cohort_<cohort>/<model>_auc.tsv``
    (true labels and predicted scores) for every model and prints the rounded
    result table.
    """
    # 1. data preparation
    reference_df = sza_disease_shap_smote[disease_name]
    # Features usable for this cohort: present in the cohort and in the training frame.
    available_genes = [
        col for col in disease_cohort.columns
        if col != 'class' and col in reference_df.columns
    ]
    selected_columns = available_genes + ['class']

    sza_df_rank = rank_transform_df(reference_df[selected_columns])
    test_df = disease_cohort[selected_columns]

    x_train = sza_df_rank.drop(columns='class').values
    y_train = sza_df_rank['class'].values
    x_test = test_df.drop(columns='class').values
    y_test = test_df['class'].values

    # 2. define model pipeline
    def _plain(estimator):
        # Estimator used directly on the features, no preprocessing step.
        return Pipeline([("model", estimator)])

    def _standardised(estimator):
        # Estimator preceded by feature standardisation.
        return Pipeline([("scaler", StandardScaler()), ("model", estimator)])

    # (rows - sum of labels) / sum of labels; assumes 0/1 labels.
    label_sum = sum(y_train)
    pos_weight = (len(y_train) - label_sum) / label_sum

    models = {
        "XGBoost": _plain(XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            use_label_encoder=False,
            scale_pos_weight=pos_weight,
            random_state=42
        )),
        "Random Forest": _plain(RandomForestClassifier(
            class_weight='balanced',
            random_state=42
        )),
        "Logistic Regression": _standardised(LogisticRegression(
            class_weight='balanced',
            random_state=42
        )),
        "SVM": _standardised(SVC(
            kernel='rbf',
            class_weight='balanced',
            probability=True,
            random_state=42
        )),
    }

    # 3. train model and calculate metrics
    output_dir = f"results/public_data_auc/{disease_name}_cohort_{cohort_number}"
    results = []
    for model_name, pipeline in models.items():
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        # Second probability column is the score used for AUC.
        y_proba = pipeline.predict_proba(x_test)[:, 1]

        row = {
            "Model": model_name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred, average='weighted'),
            "AUC": roc_auc_score(y_test, y_proba),
            "Features Used": len(available_genes)
        }

        # export AUC data (labels + scores) so curves can be rebuilt later
        os.makedirs(output_dir, exist_ok=True)
        pd.DataFrame({
            'y_true': y_test,
            'y_proba': y_proba
        }).to_csv(f"{output_dir}/{model_name}_auc.tsv", sep="\t", index=False)

        results.append(row)

    # 4. output results and return DataFrame
    result_df = pd.DataFrame(results)
    print(f"\n===== Results for {disease_name} =====")
    print(result_df.round(4))

    return result_df

# example (cohort_number is required; it names the output directory)
# results = run_models(disease_cohort, "Your_Disease_Name", "Your_Cohort_ID")

# with control cohort

In [16]:
# SEP GSE154918
# Load the cohort, keep ID, label and the SEP gene panel, and index by sample ID.
sep_gse154918 = (
    pd.read_csv('data/spesis_GSE154918_control_91.tsv', sep='\t')
    .loc[:, ['ID', 'class'] + shap_top10['SEP']]
    .set_index('ID')
)
# Binarise the label: 1 for 'SEP', 0 for every other value.
sep_gse154918['class'] = sep_gse154918['class'].eq('SEP').astype('int64')
print(sep_gse154918['class'].value_counts().to_dict())

# Rank-transform within the cohort, then evaluate the models on it.
sep_gse154918_rank = rank_transform_df(sep_gse154918)
sep_gse154918_res = run_models(sep_gse154918_rank, 'SEP', 'GSE154918')


{1: 51, 0: 40}

===== Results for SEP =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.9451             0.9510    0.9453  0.9907   
1        Random Forest    0.9341             0.9412    0.9343  0.9892   
2  Logistic Regression    0.9121             0.9216    0.9123  0.9941   
3                  SVM    0.9011             0.9118    0.9013  0.9936   

   Features Used  
0             10  
1             10  
2             10  
3             10  


In [17]:
## SEP GSE63311
sep_gse63311 = pd.read_csv('data/sepsis_GSE63311_control_83.tsv', sep='\t')

# Split the SEP panel into genes this cohort has and genes it lacks.
valid_genes = [gene for gene in shap_top10['SEP'] if gene in sep_gse63311.columns]
missing_genes = [gene for gene in shap_top10['SEP'] if gene not in valid_genes]

# Keep ID, label and the available panel genes; index by sample ID.
sep_gse63311 = sep_gse63311[['ID', 'class'] + valid_genes].set_index('ID')
print("Missing genes:", missing_genes)

# Binarise the label: 1 for 'SEP', 0 for every other value.
sep_gse63311['class'] = sep_gse63311['class'].eq('SEP').astype('int64')
print(sep_gse63311['class'].value_counts().to_dict())

sep_gse63311_rank = rank_transform_df(sep_gse63311)
sep_gse63311_res = run_models(sep_gse63311_rank, 'SEP', 'GSE63311')


Missing genes: ['KLHL3']
{1: 72, 0: 11}

===== Results for SEP =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.6265             0.7462    0.6856  0.7715   
1        Random Forest    0.6265             0.7847    0.6845  0.8409   
2  Logistic Regression    0.5422             0.6976    0.6079  0.8119   
3                  SVM    0.5422             0.6976    0.6079  0.7879   

   Features Used  
0              9  
1              9  
2              9  
3              9  


In [18]:
## SEP GSE176260
sep_gse176260 = pd.read_csv('data/sepsis_GSE176260_261_control_295.tsv', sep='\t')

# Split the SEP panel into genes this cohort has and genes it lacks.
valid_genes = [gene for gene in shap_top10['SEP'] if gene in sep_gse176260.columns]
missing_genes = [gene for gene in shap_top10['SEP'] if gene not in valid_genes]

# Keep ID, label and the available panel genes; index by sample ID.
sep_gse176260 = sep_gse176260[['ID', 'class'] + valid_genes].set_index('ID')
print("Missing genes:", missing_genes)

# Binarise the label: 1 for 'SEP', 0 for every other value.
sep_gse176260['class'] = sep_gse176260['class'].eq('SEP').astype('int64')
print(sep_gse176260['class'].value_counts().to_dict())

sep_gse176260_rank = rank_transform_df(sep_gse176260)
sep_gse176260_res = run_models(sep_gse176260_rank, 'SEP', 'GSE176260')


Missing genes: ['KLHL3', 'ZNF442']
{1: 265, 0: 30}

===== Results for SEP =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.5525             0.6623    0.6362  0.7680   
1        Random Forest    0.5424             0.6270    0.6282  0.7531   
2  Logistic Regression    0.4881             0.7003    0.5705  0.8660   
3                  SVM    0.5322             0.7396    0.6129  0.8047   

   Features Used  
0              8  
1              8  
2              8  
3              8  


In [19]:
# BD GSE124326
bd_gse124326 = pd.read_csv('data/bd_GSE124326_control_480.tsv', sep='\t')

## select shap10
# Split the BD panel into genes this cohort has and genes it lacks.
valid_genes = [gene for gene in shap_top10['BD'] if gene in bd_gse124326.columns]
missing_genes = [gene for gene in shap_top10['BD'] if gene not in valid_genes]

# Keep ID, label and the available panel genes; index by sample ID.
bd_gse124326 = bd_gse124326[['ID', 'class'] + valid_genes].set_index('ID')
print("Missing genes:", missing_genes)
# Label counts are printed BEFORE binarisation, so the raw label strings are shown.
print(bd_gse124326['class'].value_counts().to_dict())

# Cases are labelled 'BP' in this file (typo introduced when exporting the data in R).
bd_gse124326['class'] = bd_gse124326['class'].eq('BP').astype('int64')

bd_gse124326_rank = rank_transform_df(bd_gse124326)
bd_gse124326_res = run_models(bd_gse124326_rank, 'BD', 'GSE124326')
print(bd_gse124326_res)

Missing genes: ['MT-ATP6']
{'CTRL': 240, 'BP': 240}

===== Results for BD =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.6750             0.6750    0.6740  0.7235   
1        Random Forest    0.6312             0.6313    0.6244  0.7046   
2  Logistic Regression    0.6979             0.6979    0.6979  0.7629   
3                  SVM    0.6833             0.6833    0.6806  0.7409   

   Features Used  
0              9  
1              9  
2              9  
3              9  
                 Model  Accuracy  Balanced Accuracy  F1 Score       AUC  \
0              XGBoost  0.675000           0.675000  0.674044  0.723472   
1        Random Forest  0.631250           0.631250  0.624362  0.704583   
2  Logistic Regression  0.697917           0.697917  0.697852  0.762865   
3                  SVM  0.683333           0.683333  0.680650  0.740920   

   Features Used  
0              9  
1              9  
2              9  
3      

In [25]:
## RA GSE120178
ra_gse120178 = pd.read_csv('data/ra_GSE120178_control_147.tsv', sep='\t')

# Split the RA panel into genes this cohort has and genes it lacks.
valid_genes = [gene for gene in shap_top10['RA'] if gene in ra_gse120178.columns]
missing_genes = [gene for gene in shap_top10['RA'] if gene not in valid_genes]

# Keep ID, label and the available panel genes; index by sample ID.
ra_gse120178 = ra_gse120178[['ID', 'class'] + valid_genes].set_index('ID')
print("Missing genes:", missing_genes)

# Binarise the label: 1 for 'RA', 0 for every other value.
ra_gse120178['class'] = ra_gse120178['class'].eq('RA').astype('int64')
print(ra_gse120178['class'].value_counts().to_dict())

ra_gse120178_rank = rank_transform_df(ra_gse120178)
ra_gse120178_res = run_models(ra_gse120178_rank, 'RA', 'GSE120178')


Missing genes: []
{1: 127, 0: 20}

===== Results for RA =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.4490             0.6179    0.5132  0.6866   
1        Random Forest    0.4082             0.6154    0.4642  0.6876   
2  Logistic Regression    0.5918             0.7217    0.6532  0.7701   
3                  SVM    0.4082             0.6575    0.4567  0.7724   

   Features Used  
0             10  
1             10  
2             10  
3             10  


# without control cohort



## add control


In [27]:
## DM GSE112594
# NOTE(review): cases and controls are read from different files, so any
# dataset-level difference between the two files is confounded with the
# label — confirm this is acceptable for the reported metrics.
control_100 = pd.read_csv('data/healthy_GSE134080_100.tsv', sep='\t')
dm_gse112594 = pd.read_csv('data/diabetes_GSE112594_195.tsv', sep='\t')

# Check panel coverage against the RAW frames. Doing this after the case frame
# has been cut down to valid_genes would misreport a gene that is absent only
# from the control file as missing in the case cohort.
missing_in_case = [gene for gene in shap_top10['DM'] if gene not in dm_gse112594.columns]
missing_in_control = [gene for gene in shap_top10['DM'] if gene not in control_100.columns]
print("Missing in case:", missing_in_case)
print("Missing in control:", missing_in_control)

# Only genes present in both files can be used.
valid_genes = [gene for gene in shap_top10['DM']
               if gene in dm_gse112594.columns and gene in control_100.columns]

dm_gse112594 = dm_gse112594[['ID', 'class'] + valid_genes]

## add control
control_100 = control_100[['ID', 'class'] + valid_genes]
# List comparison stays a plain AssertionError even if the column counts differ.
assert list(dm_gse112594.columns) == list(control_100.columns)
dm_gse112594_add_control = pd.concat([dm_gse112594, control_100], axis=0).set_index('ID')

# Binarise the label: 1 for 'DM', 0 for every other value (the control rows).
dm_gse112594_add_control['class'] = dm_gse112594_add_control['class'].eq('DM').astype('int64')
print(dm_gse112594_add_control['class'].value_counts().to_dict())


dm_gse112594_add_control_rank = rank_transform_df(dm_gse112594_add_control)
dm_gse112594_add_control_res = run_models(dm_gse112594_add_control_rank, 'DM', 'GSE112594')


Missing in case: []
Missing in control: []
{1: 195, 0: 100}

===== Results for DM =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.6678             0.7244    0.6729  0.8071   
1        Random Forest    0.5898             0.6532    0.5915  0.7728   
2  Logistic Regression    0.6881             0.7300    0.6953  0.8400   
3                  SVM    0.6068             0.6538    0.6133  0.7514   

   Features Used  
0             10  
1             10  
2             10  
3             10  


In [28]:
## DM GSE124284
# NOTE(review): cases and controls are read from different files, so any
# dataset-level difference between the two files is confounded with the
# label — confirm this is acceptable for the reported metrics.
control_100 = pd.read_csv('data/healthy_GSE134080_100.tsv', sep='\t')
dm_gse124284 = pd.read_csv('data/diabetes_GSE124284_286.tsv', sep='\t')

# Check panel coverage against the RAW frames. Doing this after the case frame
# has been cut down to valid_genes would misreport a gene that is absent only
# from the control file as missing in the case cohort.
missing_in_case = [gene for gene in shap_top10['DM'] if gene not in dm_gse124284.columns]
missing_in_control = [gene for gene in shap_top10['DM'] if gene not in control_100.columns]
print("Missing in case:", missing_in_case)
print("Missing in control:", missing_in_control)

# Only genes present in both files can be used.
valid_genes = [gene for gene in shap_top10['DM']
               if gene in dm_gse124284.columns and gene in control_100.columns]

dm_gse124284 = dm_gse124284[['ID', 'class'] + valid_genes]

## add control
control_100 = control_100[['ID', 'class'] + valid_genes]
# Compare THIS cohort's columns with the control columns. The check previously
# looked at dm_gse112594, a frame left over from the preceding cell.
assert list(dm_gse124284.columns) == list(control_100.columns)
dm_gse124284_add_control = pd.concat([dm_gse124284, control_100], axis=0).set_index('ID')

# Binarise the label: 1 for 'DM', 0 for every other value (the control rows).
dm_gse124284_add_control['class'] = dm_gse124284_add_control['class'].eq('DM').astype('int64')
print(dm_gse124284_add_control['class'].value_counts().to_dict())


dm_gse124284_add_control_rank = rank_transform_df(dm_gse124284_add_control)
# Store under this cohort's own name. The result was previously assigned to
# dm_gse112594_add_control_res, silently overwriting the GSE112594 results.
dm_gse124284_add_control_res = run_models(dm_gse124284_add_control_rank, 'DM', 'GSE124284')

Missing in case: []
Missing in control: []
{1: 286, 0: 100}

===== Results for DM =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.6036             0.7000    0.6227  0.7315   
1        Random Forest    0.5285             0.6265    0.5465  0.6417   
2  Logistic Regression    0.6010             0.6657    0.6238  0.6584   
3                  SVM    0.5725             0.6465    0.5950  0.6735   

   Features Used  
0             10  
1             10  
2             10  
3             10  


In [29]:
## DM GSE124400
# NOTE(review): cases and controls are read from different files, so any
# dataset-level difference between the two files is confounded with the
# label — confirm this is acceptable for the reported metrics.
control_100 = pd.read_csv('data/healthy_GSE134080_100.tsv', sep='\t')
dm_gse124400 = pd.read_csv('data/diabetes_GSE124400_493.tsv', sep='\t')

# Check panel coverage against the RAW frames. Doing this after the case frame
# has been cut down to valid_genes would misreport a gene that is absent only
# from the control file as missing in the case cohort.
missing_in_case = [gene for gene in shap_top10['DM'] if gene not in dm_gse124400.columns]
missing_in_control = [gene for gene in shap_top10['DM'] if gene not in control_100.columns]
print("Missing in case:", missing_in_case)
print("Missing in control:", missing_in_control)

# Only genes present in both files can be used.
valid_genes = [gene for gene in shap_top10['DM']
               if gene in dm_gse124400.columns and gene in control_100.columns]

dm_gse124400 = dm_gse124400[['ID', 'class'] + valid_genes]

## add control
control_100 = control_100[['ID', 'class'] + valid_genes]
# List comparison stays a plain AssertionError even if the column counts differ.
assert list(dm_gse124400.columns) == list(control_100.columns)
dm_gse124400_add_control = pd.concat([dm_gse124400, control_100], axis=0).set_index('ID')

# Binarise the label: 1 for 'DM', 0 for every other value (the control rows).
dm_gse124400_add_control['class'] = dm_gse124400_add_control['class'].eq('DM').astype('int64')
print(dm_gse124400_add_control['class'].value_counts().to_dict())


dm_gse124400_add_control_rank = rank_transform_df(dm_gse124400_add_control)
dm_gse124400_add_control_res = run_models(dm_gse124400_add_control_rank, 'DM', 'GSE124400')

Missing in case: []
Missing in control: []
{1: 493, 0: 100}

===== Results for DM =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.5228             0.6931    0.5691  0.7919   
1        Random Forest    0.5126             0.6870    0.5584  0.7182   
2  Logistic Regression    0.5599             0.6915    0.6093  0.7327   
3                  SVM    0.5177             0.6781    0.5653  0.7438   

   Features Used  
0             10  
1             10  
2             10  
3             10  


In [30]:
## PD GSE124676
# NOTE(review): cases and controls are read from different files, so any
# dataset-level difference between the two files is confounded with the
# label — confirm this is acceptable for the reported metrics.
control_100 = pd.read_csv('data/healthy_GSE134080_100.tsv', sep='\t')
pd_gse124676 = pd.read_csv('data/processed/pd_gse124676_tpm_42.tsv', sep='\t')

# The gene panel for this cohort is stored under the key 'PS', while the
# cohort file labels its cases 'PD' (see the binarisation below).
# Check panel coverage against the RAW frames. Doing this after the case frame
# has been cut down to valid_genes would misreport a gene that is absent only
# from the control file as missing in the case cohort.
missing_in_case = [gene for gene in shap_top10['PS'] if gene not in pd_gse124676.columns]
missing_in_control = [gene for gene in shap_top10['PS'] if gene not in control_100.columns]
print("Missing in case:", missing_in_case)
print("Missing in control:", missing_in_control)

# Only genes present in both files can be used.
valid_genes = [gene for gene in shap_top10['PS']
               if gene in pd_gse124676.columns and gene in control_100.columns]

pd_gse124676 = pd_gse124676[['ID', 'class'] + valid_genes]

## add control
control_100 = control_100[['ID', 'class'] + valid_genes]
# List comparison stays a plain AssertionError even if the column counts differ.
assert list(pd_gse124676.columns) == list(control_100.columns)
pd_gse124676_add_control = pd.concat([pd_gse124676, control_100], axis=0).set_index('ID')

# Binarise the label: 1 for 'PD', 0 for every other value (the control rows).
# 'PD' vs the 'PS' panel key is a known naming mismatch (original note: typos).
pd_gse124676_add_control['class'] = pd_gse124676_add_control['class'].eq('PD').astype('int64')
print(pd_gse124676_add_control['class'].value_counts().to_dict())


pd_gse124676_add_control_rank = rank_transform_df(pd_gse124676_add_control)
pd_gse124676_add_control_res = run_models(pd_gse124676_add_control_rank, 'PS', 'GSE124676')

Missing in case: ['RTL8C', 'FCMR']
Missing in control: []
{0: 100, 1: 42}

===== Results for PS =====
                 Model  Accuracy  Balanced Accuracy  F1 Score     AUC  \
0              XGBoost    0.7535             0.7905    0.7642  0.8305   
1        Random Forest    0.7676             0.8281    0.7776  0.8117   
2  Logistic Regression    0.8169             0.8700    0.8249  0.9871   
3                  SVM    0.7817             0.8450    0.7911  0.8717   

   Features Used  
0              8  
1              8  
2              8  
3              8  


In [31]:
## SEP GSE110487
# NOTE(review): cases and controls are read from different files, so any
# dataset-level difference between the two files is confounded with the
# label — confirm this is acceptable for the reported metrics.
control_100 = pd.read_csv('data/healthy_GSE134080_100.tsv', sep='\t')
sep_gse110487 = pd.read_csv('data/sep_GSE110487_62.tsv', sep='\t')

# Check panel coverage against the RAW frames. Doing this after the case frame
# has been cut down to valid_genes would misreport a gene that is absent only
# from the control file as missing in the case cohort.
missing_in_case = [gene for gene in shap_top10['SEP'] if gene not in sep_gse110487.columns]
missing_in_control = [gene for gene in shap_top10['SEP'] if gene not in control_100.columns]
print("Missing in case:", missing_in_case)
print("Missing in control:", missing_in_control)

# Only genes present in both files can be used.
valid_genes = [gene for gene in shap_top10['SEP']
               if gene in sep_gse110487.columns and gene in control_100.columns]

sep_gse110487 = sep_gse110487[['ID', 'class'] + valid_genes]

## add control
control_100 = control_100[['ID', 'class'] + valid_genes]
# List comparison stays a plain AssertionError even if the column counts differ.
assert list(sep_gse110487.columns) == list(control_100.columns)
sep_gse110487_add_control = pd.concat([sep_gse110487, control_100], axis=0).set_index('ID')

# Binarise the label: 1 for 'SEP', 0 for every other value (the control rows).
sep_gse110487_add_control['class'] = sep_gse110487_add_control['class'].eq('SEP').astype('int64')
print(sep_gse110487_add_control['class'].value_counts().to_dict())


sep_gse110487_add_control_rank = rank_transform_df(sep_gse110487_add_control)
sep_gse110487_add_control_res = run_models(sep_gse110487_add_control_rank, 'SEP', 'GSE110487')

Missing in case: []
Missing in control: []
{0: 100, 1: 62}

===== Results for SEP =====
                 Model  Accuracy  Balanced Accuracy  F1 Score  AUC  \
0              XGBoost    0.8889              0.910    0.8904  1.0   
1        Random Forest    0.9259              0.940    0.9268  1.0   
2  Logistic Regression    0.9815              0.985    0.9816  1.0   
3                  SVM    0.9691              0.975    0.9693  1.0   

   Features Used  
0             10  
1             10  
2             10  
3             10  
