In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd

from utils.statistics import *

In [2]:
# Table of (Trait, Condition) combinations; iterated row by row in the two-variable analysis.
pairs = pd.read_csv(filepath_or_buffer="trait_condition_pairs.csv")

In [3]:
# Every entry of the "Trait" column of all_traits.csv, passed through normalize_trait.
all_traits = [
    normalize_trait(raw_name)
    for raw_name in pd.read_csv("all_traits.csv")["Trait"].tolist()
]

In [4]:
# The Related_Genes column is stored as the text of a Python literal, so each
# cell is parsed back into a Python object with ast.literal_eval.
rel = pd.read_csv("trait_related_genes.csv")
rel['Related_Genes'] = rel['Related_Genes'].map(ast.literal_eval)

# The mapping from trait to genes (a later row wins if a trait appears twice).
t2g = dict(zip(rel['Trait'], rel['Related_Genes']))

In [5]:
# Single-trait analysis: for every trait in all_traits, load its cohort, pick a
# model family, tune it, refit on the normalized data and save the genes that
# interpret_result reports together with the cross-validation performance.
gene_info_path = './trait_related_genes.csv'
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_root = '/home/techt/Desktop/a4s/gold_subset'
output_root = './output_corrected'

# No conditioning variable in this pass; the helpers below receive None.
condition = None

# Candidate values handed to tune_hyperparameters. They are the same for every
# trait, so the list is built once instead of on each iteration.
param_values = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

# How often each model family was selected. The counters are incremented at
# selection time, so they also include traits whose later steps fail.
choose_lasso = 0
choose_lmm = 0
for trait in all_traits:
    print(f"Trait {trait} only")
    output_dir = os.path.join(output_root, trait)
    os.makedirs(output_dir, exist_ok=True)
    try:
        trait_data, _, _ = select_and_load_cohort(data_root, trait, is_two_step=False)
        # Age and Gender are removed when present so that only the remaining
        # columns are used as predictors.
        trait_data = trait_data.drop(columns=['Age', 'Gender'], errors="ignore")

        Y = trait_data[trait].values
        X = trait_data.drop(columns=[trait]).values

        # LMM when detect_batch_effect flags the feature matrix, Lasso otherwise.
        has_batch_effect = detect_batch_effect(X)
        if has_batch_effect:
            model_constructor = LMM
            choose_lmm += 1
        else:
            model_constructor = Lasso
            choose_lasso += 1

        best_config, best_performance = tune_hyperparameters(model_constructor, param_values, X, Y, trait_data.columns, trait, gene_info_path, condition)
        model = ResidualizationRegressor(model_constructor, best_config)
        normalized_X, _ = normalize_data(X)
        model.fit(normalized_X, Y)

        var_names = trait_data.columns.tolist()
        significant_genes = interpret_result(model, var_names, trait, condition)
        save_result(significant_genes, best_performance, output_dir)

    except Exception as e:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) can still stop the loop, and report the cause
        # instead of silently discarding it.
        print(f"Error processing trait {trait}: {type(e).__name__}: {e}")
        continue

Trait Breast_Cancer only
The cross-validation performance: {'prediction': {'accuracy': 51.522000000000006, 'precision': 91.362, 'recall': 51.348, 'f1': 65.712}, 'selection': {'precision': 16.842, 'precision_at_50': 22.4, 'recall': 4.188000000000001, 'f1': 6.708000000000001, 'jaccard': 3.47, 'jaccard2': 2.066}}
The cross-validation performance: {'prediction': {'accuracy': 51.852, 'precision': 91.88, 'recall': 51.46, 'f1': 65.94800000000001}, 'selection': {'precision': 16.110000000000003, 'precision_at_50': 20.8, 'recall': 3.3519999999999994, 'f1': 5.55, 'jaccard': 2.854, 'jaccard2': 1.7}}
The cross-validation performance: {'prediction': {'accuracy': 49.54600000000001, 'precision': 89.96799999999999, 'recall': 49.882000000000005, 'f1': 64.08800000000001}, 'selection': {'precision': 14.738, 'precision_at_50': 20.8, 'recall': 1.434, 'f1': 2.612, 'jaccard': 1.3239999999999998, 'jaccard2': 0.5700000000000001}}
The cross-validation performance: {'prediction': {'accuracy': 90.69800000000001, '

In [6]:
# Display how many single-trait runs selected LMM vs. Lasso (counters set by the loop above).
choose_lmm, choose_lasso

(6, 39)

In [None]:
# Two-variable analysis: for every (Trait, Condition) row in `pairs`, build a
# data set that contains the trait, the condition and the remaining predictors,
# then select, tune and fit a model and save the reported genes.
gene_info_path = './trait_related_genes.csv'
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_root = '/home/techt/Desktop/a4s/gold_subset'
output_root = './output_corrected'

# Candidate values handed to tune_hyperparameters. They are the same for every
# pair, so the list is built once instead of on each iteration.
param_values = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

# How often each model family was selected in this pass (incremented at
# selection time, so pairs that fail afterwards are still counted).
choose_lasso2 = 0
choose_lmm2 = 0
for i, (_, row) in enumerate(pairs.iterrows()):
    # Reset per row so the error message below can never report names left
    # over from a previous iteration or an earlier cell.
    trait = condition = None
    try:
        trait, condition = row['Trait'], row['Condition']
        output_dir = os.path.join(output_root, trait)
        os.makedirs(output_dir, exist_ok=True)

        if condition in ['Age', 'Gender']:
            # Age/Gender are taken directly from the trait cohort (single step);
            # the other one of the two covariates is dropped when present.
            trait_data, _, _ = select_and_load_cohort(data_root, trait, condition, is_two_step=False)
            redundant_col = 'Age' if condition == 'Gender' else 'Gender'
            if redundant_col in trait_data.columns:
                trait_data = trait_data.drop(columns=[redundant_col])
        else:
            # Two-step: fit a model for the condition on the condition cohort
            # using the shared gene regressors, then apply it to the trait
            # cohort to obtain a predicted condition value per sample.
            trait_data, condition_data, regressors = select_and_load_cohort(data_root, trait, condition, is_two_step=True, gene_info_path=gene_info_path)
            # An empty regressor collection is treated like None: fitting on
            # zero features would only fail later with an obscure error.
            if regressors is None or len(regressors) == 0:
                print(f'No gene regressors for trait {trait} and condition {condition}')
                continue

            print("Common gene regressors for condition and trait", regressors)
            X_condition = condition_data[regressors].values
            Y_condition = condition_data[condition].values

            # Exactly two distinct values -> classification, otherwise regression.
            condition_type = 'binary' if len(np.unique(Y_condition)) == 2 else 'continuous'

            # Use the L1-penalized variant when there are more features than samples.
            if condition_type == 'binary':
                if X_condition.shape[1] > X_condition.shape[0]:
                    model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
                else:
                    model = LogisticRegression()
            else:
                if X_condition.shape[1] > X_condition.shape[0]:
                    model = Lasso()
                else:
                    model = LinearRegression()

            normalized_X_condition, _ = normalize_data(X_condition)
            model.fit(normalized_X_condition, Y_condition)

            regressors_in_trait = trait_data[regressors].values
            normalized_regressors_in_trait, _ = normalize_data(regressors_in_trait)
            if condition_type == 'binary':
                # Second column of predict_proba is used as the condition value.
                predicted_condition = model.predict_proba(normalized_regressors_in_trait)[:, 1]
            else:
                predicted_condition = model.predict(normalized_regressors_in_trait)

            # The regressors were used to predict the condition, so they are
            # removed from the predictors of the trait model.
            trait_data[condition] = predicted_condition
            trait_data = trait_data.drop(columns=regressors)
            trait_data = trait_data.drop(columns=['Age', 'Gender'], errors='ignore')

        Y = trait_data[trait].values
        Z = trait_data[condition].values
        X = trait_data.drop(columns=[trait, condition]).values

        # LMM when detect_batch_effect flags the feature matrix, Lasso otherwise.
        has_batch_effect = detect_batch_effect(X)
        if has_batch_effect:
            model_constructor = LMM
            choose_lmm2 += 1
        else:
            model_constructor = Lasso
            choose_lasso2 += 1

        best_config, best_performance = tune_hyperparameters(model_constructor, param_values, X, Y, trait_data.columns, trait, gene_info_path, condition, Z)

        model = ResidualizationRegressor(model_constructor, best_config)
        normalized_X, _ = normalize_data(X)
        normalized_Z, _ = normalize_data(Z)
        model.fit(normalized_X, Y, normalized_Z)

        var_names = trait_data.columns.tolist()
        significant_genes = interpret_result(model, var_names, trait, condition)
        save_result(significant_genes, best_performance, output_dir, condition)
    except Exception as e:
        print(f"Error processing row {i}, for the trait '{trait}' and the condition '{condition}'\n: {e}")
        continue


The cross-validation performance: {'prediction': {'accuracy': 53.092, 'precision': 92.946, 'recall': 51.86800000000001, 'f1': 66.352}, 'selection': {'precision': 4.474, 'precision_at_50': 8.4, 'recall': 3.3980000000000006, 'f1': 3.8620000000000005, 'jaccard': 1.972, 'jaccard2': 1.7280000000000002}}
The cross-validation performance: {'prediction': {'accuracy': 51.818000000000005, 'precision': 92.146, 'recall': 50.751999999999995, 'f1': 65.326}, 'selection': {'precision': 4.273999999999999, 'precision_at_50': 7.2, 'recall': 2.68, 'f1': 3.2960000000000003, 'jaccard': 1.6740000000000002, 'jaccard2': 1.35}}
The cross-validation performance: {'prediction': {'accuracy': 50.910000000000004, 'precision': 89.056, 'recall': 51.81999999999999, 'f1': 65.44}, 'selection': {'precision': 3.7479999999999998, 'precision_at_50': 6.8, 'recall': 1.202, 'f1': 1.818, 'jaccard': 0.9179999999999999, 'jaccard2': 0.692}}
The cross-validation performance: {'prediction': {'accuracy': 89.998, 'precision': 89.998, '

In [None]:
# Display the Lasso selection counts: single-trait pass, then trait-condition pass.
choose_lasso, choose_lasso2

In [None]:
# Display the LMM selection counts: single-trait pass, then trait-condition pass.
choose_lmm, choose_lmm2

In [None]:
# NOTE(review): ad-hoc sum of hard-coded numbers; nothing in this notebook shows
# where the operands come from. Presumably model-selection counts from some run -
# confirm, and prefer summing the counter variables instead of literals.
29 + 84 + 16 + 19

In [None]:
import numpy as np

In [None]:
# 13 log-spaced candidate values from 10**-5 up to and including 10**1,
# converted to a plain Python list.
param_values = np.logspace(start=-5, stop=1, num=13, base=10).tolist()

In [None]:
# Display the list currently bound to param_values.
param_values