# Predictive models for changes in clinical variables related to T2D

Includes regression and classification models

## 1. Load data

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the combined data table. Client ids are read as strings so that ids
# with leading zeros (e.g. '01001621') are not converted to integers.
combined_table = pd.read_csv(
    'results_2023_10_18/combined_data_table.csv',
    dtype={'public_client_id': str},
)

  combined_table = pd.read_csv('results_2023_10_18/combined_data_table.csv', dtype={'public_client_id': str})


In [3]:
# Index rows by (client id, days in program); the two columns are kept in the
# frame as regular columns as well.
index_columns = combined_table[['public_client_id', 'days_in_program']]
combined_table.index = pd.MultiIndex.from_frame(index_columns)

In [4]:
# Peek at the 1-year HbA1C class labels (NaN where no label is available).
combined_table['d_1y_HbA1C_class']

public_client_id  days_in_program
01001621          11.0               NaN
01002183          13.0               1.0
01002471          41.0               0.0
01003555          15.0               0.0
01003758          55.0               0.0
                                    ... 
01995109          48.0               0.0
01995874          28.0               1.0
01997508          15.0               1.0
01997909          10.0               NaN
01998999          6.0                0.0
Name: d_1y_HbA1C_class, Length: 1131, dtype: float64

In [5]:
def _read_column_names(path):
    """Read a tab-delimited text file of column names and return it as a Python list."""
    return np.loadtxt(path, dtype=str, delimiter='\t').tolist()


# Column-name lists for each feature set, produced by an earlier processing step.
selected_columns_full = _read_column_names('results_2023_10_18/selected_columns_full.txt')
chem_subset_cols = _read_column_names('results_2023_10_18/chem_subset_cols.txt')
selected_chem_bp_cols = _read_column_names('results_2023_10_18/selected_chem_bp_cols.txt')
selected_prot_cols = _read_column_names('results_2023_10_18/selected_prot_cols.txt')
selected_met_cols = _read_column_names('results_2023_10_18/selected_met_cols.txt')

In [6]:
# Short name of each target clinical variable -> its column name in combined_table.
chems_to_column = {
    'HbA1C': 'GLYCOHEMOGLOBIN A1C',
    'GFR': 'GFR, MDRD',
    'GLUCOSE': 'GLUCOSE',
    'INSULIN': 'INSULIN',
    'HOMA-IR': 'HOMA-IR',
}
# Targets in a fixed order (dict insertion order).
chems_selected = list(chems_to_column)

### Create X, y arrays from data matrices

In [7]:
def _as_array(columns):
    """Return the given column (1-D) or list of columns (2-D) of combined_table as a numpy array."""
    return combined_table[columns].to_numpy()


def _targets_by_chem(prefix, suffix=''):
    """Map each selected chem to the array stored in column `prefix + chem + suffix`."""
    return {chem: _as_array(prefix + chem + suffix) for chem in chems_selected}


# Feature matrices, one per feature set.
X_all = _as_array(selected_columns_full)
X_selected = _as_array(chem_subset_cols)
X_chems = _as_array(selected_chem_bp_cols)
X_prots = _as_array(selected_prot_cols)
X_mets = _as_array(selected_met_cols)
# (polygenic risk score features are not used)
#X_prs = combined_table[prs_cols].to_numpy()

# Regression targets: current value, next value and change, per horizon.
y_current = {chem: _as_array(chems_to_column[chem]) for chem in chems_selected}
y_next = _targets_by_chem('next_')
y_next_1y = _targets_by_chem('next_1y_')
y_next_2y = _targets_by_chem('next_2y_')
y_delta = _targets_by_chem('d_')
y_delta_1y = _targets_by_chem('d_1y_')
y_delta_2y = _targets_by_chem('d_2y_')
# TODO: also try to predict age/other clinical variables?
y_misc = {name: _as_array(name) for name in ('age', 'bmi')}
# Classification targets derived from the change columns.
y_delta_class = _targets_by_chem('d_', '_class')
y_delta_class_1y = _targets_by_chem('d_1y_', '_class')

In [8]:
# TODO: demographic-only model
# Demographic covariates, also appended column-wise to the proteomic and
# metabolomic feature matrices.
demographic_features = ['age', 'is_m', 'bmi']
X_basic = combined_table[demographic_features].to_numpy()
X_prots_demographics = np.concatenate((X_prots, X_basic), axis=1)
X_mets_demographics = np.concatenate((X_mets, X_basic), axis=1)

In [9]:
# Client id of every row (first level of the (client id, days) index); used
# as the grouping variable for cross-validation.
client_ids_subset = np.array([client_id for client_id, _days in combined_table.index])

In [10]:
# Report (n_samples, n_features) for each feature matrix.
for description, matrix in (
    ('All:', X_all),
    ('selected clinical:', X_selected),
    ('full clinical:', X_chems),
    ('prots:', X_prots),
    ('mets:', X_mets),
):
    print(description, matrix.shape)

All: (1131, 1042)
selected clinical: (1131, 14)
full clinical: (1131, 70)
prots: (1131, 262)
mets: (1131, 710)


In [12]:
# Print the sum of the class labels for each target and horizon.
# NaN labels are skipped for BOTH horizons: a plain .sum() would print nan
# as soon as one label is missing.
for c, data in y_delta_class.items():
    print(c, '6 months:', np.nansum(data))

for c, data in y_delta_class_1y.items():
    print(c, '1 year:', np.nansum(data))

HbA1C 6 months: 185.0
GFR 6 months: 299.0
GLUCOSE 6 months: 250.0
INSULIN 6 months: 453.0
HOMA-IR 6 months: 455.0
HbA1C 1 year: 87.0
GFR 1 year: 204.0
GLUCOSE 1 year: 176.0
INSULIN 1 year: 247.0
HOMA-IR 1 year: 253.0


## 2. Running machine learning predictions

In [11]:
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

from sklearn.model_selection import GroupKFold

cv = GroupKFold(10)

# Standardised copies of every feature matrix. The single scaler is re-fit on
# each matrix in turn, so afterwards it holds the statistics of X_mets.
scaler = StandardScaler()
(X_all_scaled,
 X_selected_scaled,
 X_chems_scaled,
 X_prots_scaled,
 X_mets_scaled) = (scaler.fit_transform(matrix)
                   for matrix in (X_all, X_selected, X_chems, X_prots, X_mets))


en = ElasticNet()

# Three parallel lists describing the regression analyses: analysis name,
# target array, and the "current" value used as an extra covariate/baseline.
analyses, dependent_vars, current_vars = [], [], []


def _add_regression_analysis(name, target, current):
    """Append one entry to the three parallel regression-analysis lists."""
    analyses.append(name)
    dependent_vars.append(target)
    current_vars.append(current)


# change until the next measurement
for k in y_current:
    _add_regression_analysis('d_' + k.lower(), y_delta[k], y_current[k])

# value at the next measurement
for k in y_current:
    _add_regression_analysis('next_' + k.lower(), y_next[k], y_current[k])

# TODO: add current
# current value; the analysis keeps the un-lowered chem name and gets an
# all-zero "current" covariate
for k in y_current:
    _add_regression_analysis(k, y_current[k], np.zeros(y_current[k].shape))

# 1-year and 2-year horizons, interleaved per chem
for k in y_current:
    for horizon_prefix, horizon_targets in (('next_1y_', y_next_1y),
                                            ('d_1y_', y_delta_1y),
                                            ('next_2y_', y_next_2y),
                                            ('d_2y_', y_delta_2y)):
        _add_regression_analysis(horizon_prefix + k.lower(), horizon_targets[k], y_current[k])

# vars with no "current" variable
for k in y_misc:
    _add_regression_analysis(k.lower(), y_misc[k], np.zeros(y_misc[k].shape))

# Same three parallel lists for the classification analyses.
class_analyses, class_dependent_vars, class_current_vars = [], [], []
for k in y_current:
    for class_prefix, class_targets in (('d_class_', y_delta_class),
                                        ('d_class_1y_', y_delta_class_1y)):
        class_analyses.append(class_prefix + k.lower())
        class_dependent_vars.append(class_targets[k])
        class_current_vars.append(y_current[k])

### Analysis functions

#### Randomized hyperparameter search for random forest regressors (unused here)

In [12]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter candidates sampled by rf_grid_search.
param_distributions = {
    "max_features": [1, 2, 3, 5, None],
    "max_leaf_nodes": [10, 100, 500, None],
    "min_samples_leaf": [1, 2, 5, 10, 20, 50],
    "max_depth": [15, None],
}

def rf_grid_search(model, X_train, y_train, scoring='r2', random_state=None):
    """
    Run a randomized hyperparameter search over `param_distributions`.

    model - scikit-learn estimator to tune
    X_train - training feature matrix
    y_train - training targets
    scoring - scikit-learn scoring name used to rank the candidates
    random_state - seed passed to RandomizedSearchCV; the default (None) keeps
        the previous, unseeded behaviour. Pass an int for reproducible searches.

    Prints the best parameters and score, and returns the fitted
    RandomizedSearchCV object (the tuned model is `.best_estimator_`).

    NOTE(review): no `cv` or `groups` argument is passed, so the inner
    cross-validation uses scikit-learn's default splitter and does not keep
    rows of the same client together.
    """
    # see: https://inria.github.io/scikit-learn-mooc/python_scripts/ensemble_hyperparameters.html
    search_cv = RandomizedSearchCV(model, param_distributions=param_distributions,
                                   scoring=scoring, random_state=random_state)
    search_cv.fit(X_train, y_train)
    print(search_cv.best_params_, search_cv.best_score_)
    return search_cv

#### Continuous regression analysis function

In [13]:
# Display the short-name -> column-name mapping for reference.
chems_to_column

{'HbA1C': 'GLYCOHEMOGLOBIN A1C',
 'GFR': 'GFR, MDRD',
 'GLUCOSE': 'GLUCOSE',
 'INSULIN': 'INSULIN',
 'HOMA-IR': 'HOMA-IR'}

In [14]:
# For some targets, additional columns are removed (zeroed) just for the comparison.
related_chems = {
    'GFR': ['CREATININE ENZ, SER', 'BUN/CREAT RATIO'],
    'HOMA-IR': ['INSULIN'],
    'age': ['is_m'],
}

# Maps an analysis name to the clinical column that duplicates its "current"
# value. When that column is among the feature labels, the analysis functions
# set "current" to 0, because it is already present in the data.
# Matching is a case-sensitive substring test, applied in the order below; a
# later match overwrites an earlier one.
_analysis_substring_to_chem = (
    ('gfr', 'GFR'),
    ('hba1c', 'HbA1C'),
    ('glucose', 'GLUCOSE'),
    ('insulin', 'INSULIN'),
    ('homa', 'HOMA-IR'),
)

duplicate_current_vals = {}
for analysis in analyses + class_analyses:
    for substring, chem in _analysis_substring_to_chem:
        if substring in analysis:
            duplicate_current_vals[analysis] = chems_to_column[chem]


In [16]:
def run_analyses(X, analyses, dependent_vars, current_vars, 
                 client_ids_subset, model, return_params=False, labels=None,
                 use_random_search=False,
                 save_models=False,
                 save_model_prefix=''):
    """
    Run grouped 10-fold cross-validated regression for a list of analyses.

    For every analysis the "current" value is appended to X as an extra last
    column, rows whose target is NaN are dropped, and folds are built with
    GroupKFold so that all rows of one client end up in the same fold.

    X - input data, 2-D array (n_samples, n_features)
    analyses - list of names of the analyses
    dependent_vars - list of target arrays, one per analysis
    current_vars - list of "current value" arrays, one per analysis
    client_ids_subset - array of strings indicating client ids (CV groups)
    model - a scikit-learn model

    return_params - if True, run the CV loop manually (scaling inside each
        fold) and also return the per-fold model parameters.
    labels - list or array of labels for each of the covariates (columns of X).
        If None, generic names 'feature_0', 'feature_1', ... are used.
    use_random_search - whether or not to use random parameter search
    save_models - if True, pickle each newly fitted fold model.
    save_model_prefix - path prefix of the per-fold pickle files. An existing
        pickle is loaded instead of fitting, regardless of `save_models`.

    Returns `all_cv_scores` (one sequence of per-fold r^2 values per analysis)
    when return_params is False. Otherwise returns a tuple
    `(all_cv_scores, df, predicted_values, predicted_r2_values)` where `df` has
    one row per (analysis, fold) holding the weights, optional intercept /
    alpha / searched hyperparameters, the target name and the fold r^2.
    """
    # Local import: the notebook only imports `os` in a later cell.
    import os

    # Normalise labels to a list: `in` needs an iterable and `.index` is not
    # available on numpy arrays.
    if labels is None:
        labels = ['feature_{0}'.format(i) for i in range(X.shape[1])]
    else:
        labels = list(labels)

    all_cv_scores = []
    cv = GroupKFold(10)
    weights_results = []
    predicted_values = []
    predicted_r2_values = []
    scaler = StandardScaler()
    # Estimator used in the most recent fold. The caller's `model` is never
    # rebound, so an estimator loaded from a pickle cannot replace it for
    # later folds or analyses.
    fold_model = model
    for analysis, var, current in zip(analyses, dependent_vars, current_vars):
        current = current.reshape((X.shape[0], 1))
        # "current" duplicates a column that is already among the features.
        if analysis in duplicate_current_vals and duplicate_current_vals[analysis] in labels:
            print('analysis contains duplicate value', duplicate_current_vals[analysis])
            current = np.zeros(current.shape)
        X_new = np.concatenate([X, current], 1)
        # Keep only rows with an observed target. Boolean indexing copies, so
        # the zeroing below never touches the caller's X.
        included_subset = (~np.isnan(var))
        X_new = X_new[included_subset]
        client_ids_subset_t = client_ids_subset[included_subset]
        var = var[included_subset]
        # this is a hack to remove the current variable from the list of variables.
        if analysis in labels:
            X_new[:, labels.index(analysis)] = 0
        if analysis in chems_to_column and chems_to_column[analysis] in labels:
            X_new[:, labels.index(chems_to_column[analysis])] = 0
        for c in related_chems.get(analysis, []):
            if c in labels:
                X_new[:, labels.index(c)] = 0
        predicted_values_analysis = np.zeros(var.shape)
        if return_params:
            cv_score = []
            for cv_round, (train_index, test_index) in enumerate(
                    cv.split(X_new, var, groups=client_ids_subset_t)):
                # Scaling is fitted on the training fold only.
                X_train = scaler.fit_transform(X_new[train_index])
                X_test = scaler.transform(X_new[test_index])
                filename = save_model_prefix + '_' + analysis + '_fold' + str(cv_round) + '.pkl'
                model_exists = os.path.exists(filename)
                if model_exists:
                    # NOTE: pickle.load executes arbitrary code; only load
                    # model files written by this notebook.
                    with open(filename, 'rb') as f:
                        fold_model = pickle.load(f)
                elif use_random_search:
                    cv_model = rf_grid_search(model, X_train, var[train_index])
                    fold_model = cv_model.best_estimator_
                else:
                    model.fit(X_train, var[train_index])
                    fold_model = model
                # One row per fold: weights, optional intercept / alpha /
                # searched hyperparameters, analysis name, fold r^2.
                if hasattr(fold_model, 'coef_'):
                    weights = list(fold_model.coef_)
                elif hasattr(fold_model, 'feature_importances_'):
                    weights = list(fold_model.feature_importances_)
                else:
                    weights = [0]*(X_new.shape[1])
                if hasattr(fold_model, 'intercept_'):
                    weights.append(fold_model.intercept_)
                if hasattr(fold_model, 'alpha'):
                    weights.append(fold_model.alpha)
                elif hasattr(fold_model, 'alpha_'):
                    weights.append(fold_model.alpha_)
                if use_random_search:
                    for k in sorted(param_distributions.keys()):
                        weights.append(fold_model.__dict__[k])
                weights.append(analysis)
                predictions = fold_model.predict(X_test)
                predicted_values_analysis[test_index] = predictions
                score = r2_score(var[test_index], predictions)
                weights.append(score)
                cv_score.append(score)
                weights_results.append(weights)
                if save_models and not model_exists:
                    with open(filename, 'wb') as f:
                        pickle.dump(fold_model, f)
            all_cv_scores.append(cv_score)
            # r^2 over the pooled out-of-fold predictions.
            predicted_r2_values.append(r2_score(var, predicted_values_analysis))
            predicted_values.append(predicted_values_analysis)
            print(analysis, cv_score, np.mean(cv_score))
            print('total r^2: ', predicted_r2_values[-1])
        else:
            # NOTE(review): unlike the branch above, this path does not
            # standardise the features before fitting.
            cv_score = cross_val_score(model, X_new, var, groups=client_ids_subset_t, cv=cv, scoring='r2')
            all_cv_scores.append(cv_score)
            print(analysis, cv_score, np.mean(cv_score))
    if return_params:
        # Column layout must mirror the order in which `weights` was built.
        columns = list(labels) + ['current']
        if hasattr(fold_model, 'intercept_'):
            columns.append('intercept')
        if hasattr(fold_model, 'alpha') or hasattr(fold_model, 'alpha_'):
            columns.append('alpha')
        if use_random_search:
            for k in sorted(param_distributions.keys()):
                columns.append(k)
        df = pd.DataFrame(weights_results, columns=columns + ['target', 'r2_score'])
        return all_cv_scores, df, predicted_values, predicted_r2_values
    else:
        return all_cv_scores

#### Classification analysis function

In [17]:
from sklearn.metrics import balanced_accuracy_score, f1_score

def run_classification_analyses(X, analyses, dependent_vars, current_vars, 
                                client_ids_subset, model, return_params=False, labels=None,
                                use_random_search=False,
                                save_models=False,
                                save_model_prefix=''):
    """
    Run grouped 10-fold cross-validated classification for a list of analyses.

    For every analysis the "current" value is appended to X as an extra last
    column, rows whose class label is NaN are dropped, and folds are built
    with GroupKFold so that all rows of one client end up in the same fold.

    X - input data, 2-D array (n_samples, n_features)
    analyses - list of names of the analyses
    dependent_vars - list of class-label arrays, one per analysis
    current_vars - list of "current value" arrays, one per analysis
    client_ids_subset - array of strings indicating client ids (CV groups)
    model - a scikit-learn model

    return_params - if True, run the CV loop manually (scaling inside each
        fold) and also return the per-fold model parameters.
    labels - list or array of labels for each of the covariates (columns of X).
        If None, generic names 'feature_0', 'feature_1', ... are used.
    use_random_search - whether or not to use random parameter search
    save_models - if True, pickle each newly fitted fold model.
    save_model_prefix - path prefix of the per-fold pickle files. An existing
        pickle is loaded instead of fitting, regardless of `save_models`.

    Returns `all_ba_scores` (per-fold balanced accuracies, one sequence per
    analysis) when return_params is False. Otherwise returns a tuple
    `(all_ba_scores, all_f1_scores, df, predicted_values, predicted_ba_scores)`
    where `df` has one row per (analysis, fold).
    """
    # Local import: the notebook only imports `os` in a later cell.
    import os

    # Normalise labels to a list so that `in` works when none are given.
    if labels is None:
        labels = ['feature_{0}'.format(i) for i in range(X.shape[1])]
    else:
        labels = list(labels)

    all_ba_scores = []
    all_f1_scores = []
    cv = GroupKFold(10)
    weights_results = []
    predicted_values = []
    predicted_ba_scores = []
    predicted_f1_scores = []
    scaler = StandardScaler()
    # Estimator used in the most recent fold; the caller's `model` is never
    # rebound (see the pickle branch below).
    fold_model = model
    for analysis, var, current in zip(analyses, dependent_vars, current_vars):
        current = current.reshape((X.shape[0], 1))
        # "current" duplicates a column that is already among the features.
        if analysis in duplicate_current_vals and duplicate_current_vals[analysis] in labels:
            print('analysis contains duplicate value', duplicate_current_vals[analysis])
            current = np.zeros(current.shape)
        X_new = np.concatenate([X, current], 1)
        # Keep only rows with an observed class label.
        included_subset = (~np.isnan(var))
        X_new = X_new[included_subset]
        client_ids_subset_t = client_ids_subset[included_subset]
        var = var[included_subset]
        predicted_values_analysis = np.zeros(var.shape)
        if return_params:
            cv_score = []
            f1_scores = []
            for cv_round, (train_index, test_index) in enumerate(
                    cv.split(X_new, var, groups=client_ids_subset_t)):
                # Scaling is fitted on the training fold only.
                X_train = scaler.fit_transform(X_new[train_index])
                X_test = scaler.transform(X_new[test_index])
                filename = save_model_prefix + '_' + analysis + '_fold' + str(cv_round) + '.pkl'
                model_exists = os.path.exists(filename)
                if model_exists:
                    # NOTE: pickle.load executes arbitrary code; only load
                    # model files written by this notebook.
                    with open(filename, 'rb') as f:
                        fold_model = pickle.load(f)
                elif use_random_search:
                    # Rank candidates with a classification metric (the same
                    # one used for cross_val_score below), not the 'r2'
                    # default of rf_grid_search.
                    cv_model = rf_grid_search(model, X_train, var[train_index],
                                              scoring='balanced_accuracy')
                    fold_model = cv_model.best_estimator_
                else:
                    model.fit(X_train, var[train_index])
                    fold_model = model
                # One row per fold: weights, optional intercept / C, analysis
                # name, balanced accuracy, F1.
                if hasattr(fold_model, 'coef_'):
                    weights = list(fold_model.coef_[0,:])
                elif hasattr(fold_model, 'feature_importances_'):
                    weights = list(fold_model.feature_importances_)
                else:
                    weights = [0]*(X_new.shape[1])
                if hasattr(fold_model, 'intercept_'):
                    weights.append(fold_model.intercept_[0])
                if hasattr(fold_model, 'C'):
                    weights.append(fold_model.C)
                elif hasattr(fold_model, 'C_'):
                    weights.append(fold_model.C_[0])
                weights.append(analysis)
                predictions = fold_model.predict(X_test)
                predicted_values_analysis[test_index] = predictions
                # TODO: change scoring method
                score = balanced_accuracy_score(var[test_index], predictions)
                f1 = f1_score(var[test_index], predictions)
                f1_scores.append(f1)
                weights.append(score)
                weights.append(f1)
                cv_score.append(score)
                weights_results.append(weights)
                if save_models and not model_exists:
                    with open(filename, 'wb') as f:
                        pickle.dump(fold_model, f)
            all_ba_scores.append(cv_score)
            all_f1_scores.append(f1_scores)
            # Metrics over the pooled out-of-fold predictions.
            predicted_ba_scores.append(balanced_accuracy_score(var, predicted_values_analysis))
            predicted_f1_scores.append(f1_score(var, predicted_values_analysis))
            predicted_values.append(predicted_values_analysis)
            print(analysis, cv_score, np.mean(cv_score))
            print('total balanced accuracy score: ', predicted_ba_scores[-1])
            print('total F1 score: ', predicted_f1_scores[-1])
        else:
            # NOTE(review): unlike the branch above, this path does not
            # standardise the features before fitting.
            cv_score = cross_val_score(model, X_new, var, groups=client_ids_subset_t, cv=cv, scoring='balanced_accuracy')
            all_ba_scores.append(cv_score)
            print(analysis, cv_score, np.mean(cv_score))
    if return_params:
        # Column layout must mirror the order in which `weights` was built.
        columns = list(labels)
        if current_vars is not None:
            columns.append('current')
        if hasattr(fold_model, 'intercept_'):
            columns.append('intercept')
        if hasattr(fold_model, 'C') or hasattr(fold_model, 'C_'):
            columns.append('C')
        df = pd.DataFrame(weights_results, columns=columns + ['target', 'balanced_accuracy_score', 'f1_score'])
        return all_ba_scores, all_f1_scores, df, predicted_values, predicted_ba_scores
    else:
        return all_ba_scores

### Regression Baseline

In [18]:
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

# Two baselines per analysis, evaluated with grouped 10-fold CV:
#   * naive: predict the current value (or 0 for the change targets 'd_...')
#   * linear regression on the current value alone
baseline_cv_scores = []
baseline_lr_cv_scores = []
for analysis, var, baseline in zip(analyses, dependent_vars, current_vars):
    kf = GroupKFold(10)
    included_subset = ~np.isnan(var)
    # The regression feature is always the current value ...
    X = baseline.reshape((len(baseline), 1))[included_subset]
    # ... while the naive prediction for a change target is "no change".
    if 'd_' in analysis:
        baseline = np.zeros(baseline.shape)
    baseline = baseline[included_subset]
    client_ids_subset_t = client_ids_subset[included_subset]
    var = var[included_subset]
    scores, lr_scores = [], []
    for train_idx, test_idx in kf.split(X, var, groups=client_ids_subset_t):
        observed = var[test_idx]
        lr.fit(X[train_idx], var[train_idx])
        scores.append(r2_score(observed, baseline[test_idx]))
        lr_scores.append(r2_score(observed, lr.predict(X[test_idx])))
    print(analysis, scores, np.mean(scores), np.mean(lr_scores))
    baseline_cv_scores.append(scores)
    baseline_lr_cv_scores.append(lr_scores)

d_hba1c [-0.05445994727608383, -0.008291923016248592, -0.062461465926812565, -0.05943257785749245, -0.07063992359121318, -0.0640839284585728, -0.008337132807920256, -0.02805086384734401, -0.00771379493024682, -0.02650168877900172] -0.038997324649093626 0.205063842812146
d_gfr [-0.008003258938506397, -0.008942276903969537, -5.814048558194784e-05, -0.0555466511088174, -0.014459631040406151, -0.00885745115794867, -0.13308034545930392, -0.01986480104574495, -2.5999049590330614e-05, -0.0003323715121705817] -0.024917092670203988 0.10987133400396662
d_glucose [-9.258528991207271e-05, -0.002184116732982355, -0.026471986159432692, -0.01145041199771235, -0.0015797269829183058, -0.023904448579273208, -9.542212361934865e-05, -0.0558965181078761, -0.001807452125111686, -0.005237981861864416] -0.012872064996070253 0.1515667418801425
d_insulin [-0.024715603125468277, -0.05551536241182542, -0.05398258572225423, -0.04159558827715992, -0.011342549192969509, -0.0197321401509587, -0.02666352169326469, -0.

#### Saving baseline results

In [19]:
import json

# Store the per-fold baseline scores as {method: {analysis: [fold scores]}}.
baseline_results = {'baseline': baseline_cv_scores,
                    'baseline_linear_regression': baseline_lr_cv_scores}
output_cv = {}
for method, per_analysis_scores in baseline_results.items():
    output_cv[method] = {name: list(fold_scores)
                         for name, fold_scores in zip(analyses, per_analysis_scores)}
with open('results_2023_10_18/baseline_regression.json', 'w') as f:
    json.dump(output_cv, f, indent=1)

### Classification Baseline

In [20]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Classification baseline: logistic regression on the current value alone,
# evaluated with grouped 10-fold CV (balanced accuracy and F1 per fold).
# NOTE: `lr` and `baseline_lr_cv_scores` reuse names from the regression
# baseline cell; here `baseline_lr_cv_scores` holds the per-fold F1 scores.
baseline_lr_cv_scores = []
baseline_lr_cv_ba_scores = []
for analysis, var, baseline in zip(class_analyses, class_dependent_vars, class_current_vars):
    kf = GroupKFold(10)
    # Keep only rows with an observed class label.
    included_subset = (~np.isnan(var))
    X = baseline.reshape((len(baseline), 1))[included_subset]
    client_ids_subset_t = client_ids_subset[included_subset]
    var = var[included_subset]
    scores = []
    lr_scores = []
    for train_indices, test_indices in kf.split(X, var, groups=client_ids_subset_t):
        lr.fit(X[train_indices], var[train_indices])
        var_test = lr.predict(X[test_indices])
        scores.append(balanced_accuracy_score(var[test_indices], var_test))
        lr_scores.append(f1_score(var[test_indices], var_test))
    print(analysis, scores, lr_scores, np.mean(scores), np.mean(lr_scores))
    baseline_lr_cv_scores.append(lr_scores)
    baseline_lr_cv_ba_scores.append(scores)

d_class_hba1c [0.5210526315789473, 0.5, 0.5260953608247423, 0.5282312925170068, 0.5405525846702317, 0.5333333333333333, 0.5, 0.5225146198830409, 0.5, 0.5573453608247423] [0.09523809523809525, 0.0, 0.1111111111111111, 0.11764705882352941, 0.15384615384615385, 0.125, 0.0, 0.09999999999999999, 0.0, 0.21052631578947367] 0.5229125183632044 0.09133687348083633
d_class_1y_hba1c [0.5, 0.5, 0.49122807017543857, 0.5625, 0.5833333333333334, 0.5, 0.5, 0.5, 0.5, 0.5] [0.0, 0.0, 0.0, 0.2222222222222222, 0.2857142857142857, 0.0, 0.0, 0.0, 0.0, 0.0] 0.5137061403508772 0.050793650793650794
d_class_gfr [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.525, 0.5, 0.5, 0.5] [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09523809523809523, 0.0, 0.0, 0.0] 0.5025000000000001 0.009523809523809523
d_class_1y_gfr [0.5216450216450216, 0.4737171464330413, 0.5011074197120708, 0.5693877551020409, 0.5272727272727273, 0.5434782608695652, 0.5471794871794872, 0.5263157894736842, 0.5005537098560354, 0.5476190476190476] [0.15384615384615385, 0.14814814

In [21]:
import json

# Write the per-fold F1 and balanced-accuracy scores of the logistic-regression
# baseline, each as {'baseline_logistic_regression': {analysis: [fold scores]}}.
for output_path, per_analysis_scores in (
        ('results_2023_10_18/baseline_classification_f1.json', baseline_lr_cv_scores),
        ('results_2023_10_18/baseline_classification_balanced_acc.json', baseline_lr_cv_ba_scores)):
    output_cv = {
        'baseline_logistic_regression': {
            name: list(fold_scores)
            for name, fold_scores in zip(class_analyses, per_analysis_scores)
        }
    }
    with open(output_path, 'w') as f:
        json.dump(output_cv, f, indent=1)

### Run Regression models for all feature sets

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV, Ridge, RidgeCV

import json
import os

In [23]:
demo_columns = ['age', 'is_m', 'bmi']

# One row per feature set: (name used in output files, feature matrix, column labels).
_feature_sets = [
    ('selected_clinical', X_selected, chem_subset_cols),
    ('clinical', X_chems, selected_chem_bp_cols),
    ('prots', X_prots, selected_prot_cols),
    ('mets_imputed', X_mets, selected_met_cols),
    ('all', X_all, selected_columns_full),
    ('demo', X_basic, demo_columns),
]
all_feature_names, all_X_vals, all_labels = (list(column) for column in zip(*_feature_sets))

# One row per linear regression model: (name, class, constructor keyword arguments).
_regression_models = [
    ('ElasticNet', ElasticNet, {}),
    ('LassoCV', LassoCV, {'selection': 'random', 'precompute': True, 'verbose': False, 'tol': 0.1}),
    ('ElasticNetCV', ElasticNetCV, {'selection': 'random', 'precompute': True, 'verbose': False, 'tol': 0.1}),
    ('Ridge', Ridge, {}),
    ('RidgeCV', RidgeCV, {}),
]
model_names, all_models, all_model_params = (list(column) for column in zip(*_regression_models))

In [24]:
# run_analyses pickles every fold model under this directory; create it up
# front so a fresh checkout does not fail on the first pickle.dump.
os.makedirs('models_2023_10_18', exist_ok=True)

# Fit every linear regression model on every feature set and save the CV
# scores and weights. A feature set is skipped when its result files exist.
for X, labels, feature_name in zip(all_X_vals, all_labels, all_feature_names):
    model_cv_scores = {}
    model_weights = {}
    print('Feature set:', feature_name, '######')
    if os.path.exists('results_2023_10_18/{0}_lassocv_weights.csv'.format(feature_name)) and\
       os.path.exists('results_2023_10_18/{0}_regression.json'.format(feature_name)):
        print('results already exist')
        continue
    for Model, model_name, model_params in zip(all_models, model_names, all_model_params):
        print(model_name)
        m = Model(**model_params)
        cv_scores, weights, predictions, r2 = run_analyses(
            X,
            analyses,
            dependent_vars,
            current_vars,
            client_ids_subset,
            m,
            return_params=True,
            labels=labels,
            use_random_search=False,
            save_models=True,
            save_model_prefix='models_2023_10_18/' + feature_name + '_' + model_name)
        model_cv_scores[model_name] = cv_scores
        model_weights[model_name] = weights
        print()
    # {model: {analysis: [per-fold r^2]}}
    output_cv = {k: {a: list(x) for a, x in zip(analyses, v)} for k, v in model_cv_scores.items()}
    with open('results_2023_10_18/{0}_regression.json'.format(feature_name), 'w') as f:
        json.dump(output_cv, f, indent=1)
    model_weights['LassoCV'].to_csv('results_2023_10_18/{0}_lassocv_weights.csv'.format(feature_name), index=None)
    with open('results_2023_10_18/{0}_regression_model_weights.pkl'.format(feature_name), 'wb') as f:
        pickle.dump(model_weights, f)
    print('results saved\n')

Feature set: selected_clinical ######
results already exist
Feature set: clinical ######
results already exist
Feature set: prots ######
results already exist
Feature set: mets_imputed ######
results already exist
Feature set: all ######
results already exist
Feature set: demo ######
results already exist


### Run Classification models for all feature sets

In [25]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import json
import time

In [26]:
demo_columns = ['age', 'is_m', 'bmi']

# One row per feature set: (name used in output files, feature matrix, column labels).
_feature_sets = [
    ('selected_clinical', X_selected, chem_subset_cols),
    ('clinical', X_chems, selected_chem_bp_cols),
    ('prots', X_prots, selected_prot_cols),
    ('mets_imputed', X_mets, selected_met_cols),
    ('all', X_all, selected_columns_full),
    ('demo', X_basic, demo_columns),
]
all_feature_names, all_X_vals, all_labels = (list(column) for column in zip(*_feature_sets))

# One row per classifier: (name, class, constructor keyword arguments).
_classification_models = [
    ('LogReg', LogisticRegression, {'max_iter': 1000}),
    ('LogRegLasso', LogisticRegression, {'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 1000}),
    ('LogRegLassoCV', LogisticRegressionCV,
     {'penalty': 'l1', 'solver': 'saga', 'scoring': 'f1', 'verbose': False}),
    ('LogRegElasticNetCV', LogisticRegressionCV,
     {'penalty': 'elasticnet', 'solver': 'saga', 'scoring': 'f1', 'verbose': False, 'l1_ratios': [0.5]}),
    ('RidgeClassifier', RidgeClassifier, {}),
    ('RidgeClassifierCV', RidgeClassifierCV, {}),
]
class_model_names, all_class_models, class_model_params = (
    list(column) for column in zip(*_classification_models))

In [27]:
import os

# run_classification_analyses pickles every fold model under this directory;
# create it up front so a fresh checkout does not fail on the first pickle.dump.
os.makedirs('models_2023_10_18', exist_ok=True)

# Fit every classifier on every feature set and save the CV scores and
# weights. A feature set is skipped when its result files exist.
for X, labels, feature_name in zip(all_X_vals, all_labels, all_feature_names):
    t0 = time.time()
    model_cv_scores = {}
    model_weights = {}
    model_f1_scores = {}
    print('Feature set:', feature_name, '#########')
    if os.path.exists('results_2023_10_18/{0}_classification_lassocv_weights.csv'.format(feature_name)) and\
       os.path.exists('results_2023_10_18/{0}_classification_lasso_weights.csv'.format(feature_name)) and \
       os.path.exists('results_2023_10_18/{0}_classification_f1.json'.format(feature_name)):
        print('results already exist')
        continue
    for Model, model_name, model_params in zip(all_class_models, class_model_names, class_model_params):
        print(model_name)
        m = Model(**model_params)
        ba_scores, f1_scores, weights, predictions, ba = run_classification_analyses(
            X,
            class_analyses,
            class_dependent_vars,
            class_current_vars,
            client_ids_subset,
            m,
            return_params=True,
            labels=labels,
            use_random_search=False,
            save_models=True,
            save_model_prefix='models_2023_10_18/' + feature_name + '_classification_' + model_name)
        model_cv_scores[model_name] = ba_scores
        model_f1_scores[model_name] = f1_scores
        model_weights[model_name] = weights
        print('\n')
    # {model: {analysis: [per-fold balanced accuracy]}}
    output_cv = {k: {a: list(x) for a, x in zip(class_analyses, v)} for k, v in model_cv_scores.items()}
    with open('results_2023_10_18/{0}_classification_balanced_acc.json'.format(feature_name), 'w') as f:
        json.dump(output_cv, f, indent=1)
    # {model: {analysis: [per-fold F1]}}
    output_f1 = {k: {a: list(x) for a, x in zip(class_analyses, v)} for k, v in model_f1_scores.items()}
    with open('results_2023_10_18/{0}_classification_f1.json'.format(feature_name), 'w') as f:
        json.dump(output_f1, f, indent=1)
    model_weights['LogRegLassoCV'].to_csv('results_2023_10_18/{0}_classification_lassocv_weights.csv'.format(feature_name), index=None)
    model_weights['LogRegLasso'].to_csv('results_2023_10_18/{0}_classification_lasso_weights.csv'.format(feature_name), index=None)
    with open('results_2023_10_18/{0}_classification_model_weights.pkl'.format(feature_name), 'wb') as f:
        pickle.dump(model_weights, f)
    print('results saved')
    print('time elapsed:', time.time() - t0)
    print('\n')

Feature set: selected_clinical #########
results already exist
Feature set: clinical #########
results already exist
Feature set: prots #########
results already exist
Feature set: mets_imputed #########
results already exist
Feature set: all #########
results already exist
Feature set: demo #########
results already exist


### Advanced models for regression analysis

In [28]:
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

# One row per regressor: (estimator class, display name, constructor kwargs).
# Keeping the three pieces side by side prevents the parallel lists below from
# drifting out of alignment when a model is added or removed.
_advanced_regressor_specs = [
    (RandomForestRegressor, 'RandomForestRegressor', {'verbose': False, 'n_jobs': -1, 'max_depth': 15}),
    (SVR, 'SVR', {'verbose': False, 'C': 1}),
    (LinearSVR, 'LinearSVR', {'verbose': False, 'C': 1, 'max_iter': 10000}),
    (KNeighborsRegressor, 'KNeighborsRegressor', {}),
]

# Parallel lists (same order) consumed by the regression training loop.
advanced_models = [spec[0] for spec in _advanced_regressor_specs]
advanced_model_names = [spec[1] for spec in _advanced_regressor_specs]
advanced_model_params = [spec[2] for spec in _advanced_regressor_specs]

In [29]:
# Run every advanced regressor on every feature set and persist the scores
# (JSON) plus the random-forest weights (CSV). A feature set is skipped when
# both of its output files are already on disk.
for X, labels, feature_name in zip(all_X_vals, all_labels, all_feature_names):
    # Per-feature-set accumulators, keyed by model name. They are reset on
    # every iteration, including iterations that end up being skipped.
    model_cv_scores = {}
    model_weights = {}
    print('Feature set:', feature_name, '######')

    # Output locations for this feature set.
    rfr_weights_path = 'results_2023_10_18/{0}_rfr_weights.csv'.format(feature_name)
    regression_scores_path = 'results_2023_10_18/{0}_advanced_models_regression.json'.format(feature_name)
    if os.path.exists(rfr_weights_path) and os.path.exists(regression_scores_path):
        print('results already exist')
        continue

    for Model, model_name, model_params in zip(advanced_models, advanced_model_names, advanced_model_params):
        print(model_name)
        m = Model(**model_params)
        # Path prefix handed to run_analyses as its last positional argument.
        # NOTE(review): the meaning of the positional True/False flags is
        # defined by run_analyses, which is not visible in this cell.
        model_path_prefix = 'models_2023_10_18/' + feature_name + '_' + model_name
        cv_scores, weights, predictions, r2 = run_analyses(
            X, analyses, dependent_vars, current_vars, client_ids_subset,
            m, True, labels, False, False, model_path_prefix,
        )
        model_cv_scores[model_name] = cv_scores
        model_weights[model_name] = weights
        print()

    # Re-key the per-model scores by analysis name and convert each entry to
    # a plain list before dumping to JSON.
    output_cv = {
        name: {analysis: list(scores) for analysis, scores in zip(analyses, per_analysis)}
        for name, per_analysis in model_cv_scores.items()
    }
    with open(regression_scores_path, 'w') as f:
        json.dump(output_cv, f, indent=1)

    # Only the random-forest weights are exported as a table.
    model_weights['RandomForestRegressor'].to_csv(rfr_weights_path, index=None)
    print('results saved\n')

Feature set: selected_clinical ######
results already exist
Feature set: clinical ######
results already exist
Feature set: prots ######
results already exist
Feature set: mets_imputed ######
results already exist
Feature set: all ######
results already exist
Feature set: demo ######
results already exist


### Advanced models for classification

In [30]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# One row per classifier: (estimator class, display name, constructor kwargs).
# Keeping the three pieces side by side prevents the parallel lists below from
# drifting out of alignment when a model is added or removed.
_advanced_classifier_specs = [
    (RandomForestClassifier, 'RandomForestClassifier', {'verbose': False, 'n_jobs': -1, 'max_depth': 15}),
    (SVC, 'SVC', {'verbose': False, 'C': 1}),
    (LinearSVC, 'LinearSVC', {'verbose': False, 'C': 1, 'max_iter': 10000}),
    (KNeighborsClassifier, 'KNeighborsClassifier', {}),
]

# Parallel lists (same order) consumed by the classification training loop.
# These rebind the names previously used for the regression models.
advanced_models = [spec[0] for spec in _advanced_classifier_specs]
advanced_model_names = [spec[1] for spec in _advanced_classifier_specs]
advanced_model_params = [spec[2] for spec in _advanced_classifier_specs]

In [31]:
# Run every advanced classifier on every feature set and persist the balanced
# accuracy and F1 scores (JSON) plus the random-forest weights (CSV). A feature
# set is skipped when its weights CSV and F1 JSON are already on disk.
for X, labels, feature_name in zip(all_X_vals, all_labels, all_feature_names):
    t0 = time.time()
    # Per-feature-set accumulators, keyed by model name. They are reset on
    # every iteration, including iterations that end up being skipped.
    model_cv_scores = {}
    model_weights = {}
    model_f1_scores = {}
    print('Feature set:', feature_name, '#########')

    # Output locations for this feature set.
    rfc_weights_path = 'results_2023_10_18/{0}_classification_rfc_weights.csv'.format(feature_name)
    balanced_acc_path = 'results_2023_10_18/{0}_advanced_models_classification_balanced_acc.json'.format(feature_name)
    f1_scores_path = 'results_2023_10_18/{0}_advanced_models_classification_f1.json'.format(feature_name)
    # The guard looks at the weights CSV and the F1 JSON only; the balanced
    # accuracy JSON is not part of the check.
    if os.path.exists(rfc_weights_path) and os.path.exists(f1_scores_path):
        print('results already exist')
        continue

    for Model, model_name, model_params in zip(advanced_models, advanced_model_names, advanced_model_params):
        print(model_name)
        m = Model(**model_params)
        # Path prefix handed to run_classification_analyses as its last
        # positional argument.
        # NOTE(review): the meaning of the positional True/False flags is
        # defined by run_classification_analyses, not visible in this cell.
        model_path_prefix = 'models_2023_10_18/' + feature_name + '_classification_' + model_name
        ba_scores, f1_scores, weights, predictions, ba = run_classification_analyses(
            X, class_analyses, class_dependent_vars, class_current_vars, client_ids_subset,
            m, True, labels, False, False, model_path_prefix,
        )
        model_cv_scores[model_name] = ba_scores
        model_f1_scores[model_name] = f1_scores
        model_weights[model_name] = weights
        print('\n')

    # Re-key the per-model scores by analysis name and convert each entry to
    # a plain list before dumping to JSON.
    output_cv = {
        name: {analysis: list(scores) for analysis, scores in zip(class_analyses, per_analysis)}
        for name, per_analysis in model_cv_scores.items()
    }
    with open(balanced_acc_path, 'w') as f:
        json.dump(output_cv, f, indent=1)

    output_f1 = {
        name: {analysis: list(scores) for analysis, scores in zip(class_analyses, per_analysis)}
        for name, per_analysis in model_f1_scores.items()
    }
    with open(f1_scores_path, 'w') as f:
        json.dump(output_f1, f, indent=1)

    # Only the random-forest weights are exported as a table.
    model_weights['RandomForestClassifier'].to_csv(rfc_weights_path, index=None)
    print('results saved')
    print('time elapsed:', time.time() - t0)
    print('\n')

Feature set: selected_clinical #########
results already exist
Feature set: clinical #########
results already exist
Feature set: prots #########
results already exist
Feature set: mets_imputed #########
results already exist
Feature set: all #########
results already exist
Feature set: demo #########
results already exist
