In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.inspection import permutation_importance
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
import joblib

In [2]:
params = pd.read_csv('Data/calculated_parameters_for_merged.txt',sep="\t", index_col=1).iloc[:,1:]
print(params.shape)
meta = pd.read_csv('Data/merged_metadata.txt', sep="\t", index_col=0)
new_cat = []
for a in list(meta['category']): ### put all diseases from one cohort into one bucket
    if a == "healthy":
        new_cat.append(1)
    else:
        new_cat.append(0)
meta['category'] = new_cat
print(meta.shape)

merged = pd.merge(params, meta, left_index=True, right_index=True)

print(merged.head())
print(merged.shape)

(4612, 6)
(4598, 2)
                GMHI_good  GMHI_bad  Frac_of_core_functions_among_all  \
MMPD37460080ST          0         4                          0.620370   
MMPD56131850ST          1         0                          0.706897   
MMPD19623701ST          1         1                          0.619403   
MMPD91893113ST          1         3                          0.559524   
MMPD45969642ST          1         1                          0.606667   

                Frac_of_core_functions_found  Species_found_together  \
MMPD37460080ST                      0.396450                0.000000   
MMPD56131850ST                      0.242604                0.414634   
MMPD19623701ST                      0.491124                0.512195   
MMPD91893113ST                      0.556213                0.243902   
MMPD45969642ST                      0.538462                0.560976   

                Contributions_per_species  category         cohort  
MMPD37460080ST                  43.4000

In [7]:
### train model

def run_random_forest_with_permutation_importance(df, target_col, cohort_col, n_estimators=100, cv_folds=5):
    # Split the dataframe into features and target
    X = df.drop(columns=[target_col, cohort_col])
    y = df[target_col]
    
    # Define the random forest model
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

    # Perform cross-validation to optimize the model using GridSearchCV (hyperparameter tuning)
    param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
    grid_search = GridSearchCV(rf, param_grid, cv=cv_folds, scoring='accuracy')
    grid_search.fit(X, y)

    # Get the best model from the grid search
    best_rf = grid_search.best_estimator_
    
    # Fit the best model to the data
    best_rf.fit(X, y)
    
    # Calculate accuracy and ROC AUC score
    y_pred = best_rf.predict(X)
    accuracy = accuracy_score(y, y_pred)
    auc_score = roc_auc_score(y, best_rf.predict_proba(X)[:, 1])
    
    print(f"Optimal Accuracy: {accuracy:.4f}")
    print(f"ROC AUC Score: {auc_score:.4f}")
    
    # Calculate the optimal accuracy threshold using ROC curve analysis
    fpr, tpr, thresholds = roc_curve(y, best_rf.predict_proba(X)[:, 1])
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    
    print(f"Optimal Accuracy Threshold: {optimal_threshold:.4f}")
    
    # Save the trained model to a file
    model_filename = "q2_pd_model.pkl"
    joblib.dump(best_rf, model_filename)
    print(f"Model saved to {model_filename}")

    return best_rf, accuracy, auc_score, optimal_threshold

# Example usage:
# df is your dataframe, 'target' is the target column, 'cohort' is the cohort column
model, accuracy, auc, threshold = run_random_forest_with_permutation_importance(merged, target_col='category', cohort_col='cohort')

Optimal Accuracy: 0.7824
ROC AUC Score: 0.9220
Optimal Accuracy Threshold: 0.3872
Model saved to random_forest_model_with_gupta.pkl


In [10]:
### train a complete model without GMHI parameters


def run_random_forest_with_permutation_importance(df, target_col, cohort_col, n_estimators=100, cv_folds=5):
    # Split the dataframe into features and target
    X = df.drop(columns=[target_col, cohort_col,'GMHI_good','GMHI_bad'])
    y = df[target_col]
    
    # Define the random forest model
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

    # Perform cross-validation to optimize the model using GridSearchCV (hyperparameter tuning)
    param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
    grid_search = GridSearchCV(rf, param_grid, cv=cv_folds, scoring='accuracy')
    grid_search.fit(X, y)

    # Get the best model from the grid search
    best_rf = grid_search.best_estimator_
    
    # Fit the best model to the data
    best_rf.fit(X, y)
    
    # Calculate accuracy and ROC AUC score
    y_pred = best_rf.predict(X)
    accuracy = accuracy_score(y, y_pred)
    auc_score = roc_auc_score(y, best_rf.predict_proba(X)[:, 1])
    
    print(f"Optimal Accuracy: {accuracy:.4f}")
    print(f"ROC AUC Score: {auc_score:.4f}")
    
    # Calculate the optimal accuracy threshold using ROC curve analysis
    fpr, tpr, thresholds = roc_curve(y, best_rf.predict_proba(X)[:, 1])
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    
    print(f"Optimal Accuracy Threshold: {optimal_threshold:.4f}")
    
    # Save the trained model to a file
    model_filename = "q2_pd_model_no_gmhi.pkl"
    joblib.dump(best_rf, model_filename)
    print(f"Model saved to {model_filename}")

    return best_rf, accuracy, auc_score, optimal_threshold

# Example usage:
# df is your dataframe, 'target' is the target column, 'cohort' is the cohort column
model, accuracy, auc, threshold = run_random_forest_with_permutation_importance(merged, target_col='category', cohort_col='cohort')

Optimal Accuracy: 0.7643
ROC AUC Score: 0.9176
Optimal Accuracy Threshold: 0.3832
Model saved to q2_pd_model_no_gmhi.pkl
