# Random Forest Classifiers / AUROC

In this notebook we will train Random Forest Classifiers on case vs. control status within each study, and use AUROC to evaluate their performance.
 ____ 

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold,cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc,roc_auc_score

import scipy.stats
import pandas as pd
import numpy as np
from utils import *

%matplotlib inline

## Collect Reads
First, load the merged table constructed earlier

In [None]:
# long-format table read from the CSV written by an earlier notebook
merged_table_path = '../data/merged_table.csv'
merged_table = pd.read_csv(merged_table_path)

## Train Random Forest Classifiers, Calculate AUROC
Iterating through each study, train an RFC with fivefold cross-validation, and calculate AUROC for each model. 

In [None]:
# Train one Random Forest per study and score it with cross-validated AUROC.
# Result: `df`, one row per study with columns 'study' and 'AUC'.

# numeric class codes for the condition labels
condition_codes = {'control': 0, 'case': 1}

# collect one record per study; the dataframe is built once after the loop
# instead of being grown with pd.concat on every iteration
auc_records = []

# iterate through studies
for study in merged_table['study'].unique():
    res_temp = merged_table[merged_table['study'] == study].copy()

    # pivot into abundance matrix (rows: sample_id, columns: genus) and CLR transform
    # each row with the helper provided by utils
    X = pd.pivot_table(res_temp, index = 'sample_id', columns = 'genus', values = 'reads').fillna(0.0)
    X = X.apply(clr_transform_individual, axis=1)

    # one label per sample, encoded as control -> 0, case -> 1
    # (values that are not in the mapping are passed through unchanged)
    y = (res_temp.drop_duplicates(subset = 'sample_id')
                 .set_index('sample_id')['condition']
                 .map(lambda label: condition_codes.get(label, label)))

    # scikit-learn pairs the rows of X and y by position, not by index label.
    # pivot_table sorts its index while drop_duplicates keeps first-appearance order,
    # so y must be put into the row order of X before fitting; otherwise labels can be
    # attached to the wrong samples (or the lengths can differ if a sample has no reads).
    # The integer cast guarantees a numeric binary target rather than an object column.
    y = y.reindex(X.index).astype(int)

    # build RFC with stratified fivefold cross-validation
    model = RandomForestClassifier(n_estimators=100, random_state=1)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    # out-of-fold predicted probability of the positive class (case = 1) for every sample
    y_pred = cross_val_predict(model, X, y, cv=cv, method='predict_proba')[:, 1]

    # score using AUROC
    auroc = roc_auc_score(y, y_pred)

    # record the result for this study
    auc_records.append({'study': study, 'AUC': auroc})

# assemble the per-study results (unique 0..n-1 index)
df = pd.DataFrame(auc_records, columns = ['study', 'AUC'])

## Format Results
Add metadata and count to the resulting dataframe

In [None]:
# add metadata: disease is taken from the first row of each study in the merged table
disease_by_study = (
    merged_table
    .drop_duplicates(subset='study')
    .set_index('study')['disease']
    .to_dict()
)
df['disease'] = df['study'].map(disease_by_study)

# URT is whatever follows the last comma of the study label (leading whitespace is kept)
df['URT'] = df['study'].str.split(',').str[-1]

# add count: number of distinct sample ids per study, in the current row order of df
samples_per_study = merged_table.groupby('study')['sample_id'].nunique()
df['count'] = samples_per_study.reindex(df['study']).fillna(0).values

# order rows by URT, then by disease
df = df.sort_values(by=['URT', 'disease'])

# mean AUROC across studies (displayed as the cell output)
df.AUC.mean()

## Specify Color Encoding
Define the disease-specific color dictionary we've been using

In [None]:
# one fixed hex color per disease label, used by the manual color/fill scales below
color_dict = dict([
    ('Asthma', '#a6cee3'),
    ('COVID-19', '#1f78b4'),
    ('Influenza', '#b2df8a'),
    ('Pneumonia', '#33a02c'),
    ('RSV', '#fb9a99'),
    ('RTI', '#e31a1c'),
    ('Resp. Allergies', '#fdbf6f'),
    ('Rhinosinusitis', '#ff7f00'),
    ('COPD', '#cab2d6'),
    ('Tonsillitis', '#6a3d9a'),
])

## Plot AUROC Results
Plot results of AUROC by study, color encoding for disease and URT sampling site

In [None]:
# cast disease to a categorical column before plotting
df['disease'] = df['disease'].astype('category')

# translucent background bands over two ranges of study positions
# NOTE(review): the 21.5 / 32.5 boundaries are hard-coded; presumably they match the
# number of studies per URT site after sorting - confirm if the study list changes
salmon_band = annotate(geom_rect, xmin=0, xmax=21.5, ymin=0.5, ymax=1.0,
                       fill='salmon', alpha=0.15)
cadetblue_band = annotate(geom_rect, xmin=21.5, xmax=32.5, ymin=0.5, ymax=1.0,
                          fill='cadetblue', alpha=0.15)

auroc = (
    ggplot(df, aes(x='study', y='AUC'))
    # one point per study plus a segment from the point towards y = 0, colored by disease
    + geom_point(aes(color='disease'), size=5)
    + geom_segment(aes(xend='study', color='disease'), yend=0, size=2)
    + salmon_band
    + cadetblue_band
    # AUROC axis restricted to 0.5 - 1.0
    + ylim(0.5, 1.0)
    + labs(x='', y='AUROC')
    # studies are laid out in the current row order of df
    + scale_x_discrete(limits=df['study'])
    + scale_color_manual(color_dict, guide=None)
    # flip so that studies run along the vertical axis
    + coord_flip()
    + theme_minimal()
    + theme(figure_size=(6, 8), text=element_text(size=15))
)
auroc

In [None]:
# translucent background bands over two ranges of study positions
# NOTE(review): the 21.5 / 32.5 boundaries and the 1200 ceiling are hard-coded;
# presumably they match the current study list and largest sample count - confirm
salmon_count_band = annotate(geom_rect, xmin=0, xmax=21.5, ymin=0, ymax=1200,
                             fill='salmon', alpha=0.15)
cadetblue_count_band = annotate(geom_rect, xmin=21.5, xmax=32.5, ymin=0, ymax=1200,
                                fill='cadetblue', alpha=0.15)

count = (
    ggplot(df, aes(x='study', y='count'))
    # one bar per study, filled by disease
    + geom_col(aes(fill='disease'), size=5)
    + salmon_count_band
    + cadetblue_count_band
    + ylim(0, 1200)
    + labs(y='Sample Count', x='')
    # studies follow the current row order of df; the study labels are left blank
    + scale_x_discrete(limits=df['study'], labels='', guide=None)
    + scale_fill_manual(color_dict)
    # flip so that studies run along the vertical axis
    + coord_flip()
    + theme_minimal()
    + theme(figure_size=(6, 8), text=element_text(size=15))
)
count

In [None]:
# write both figures to the visualizations folder as SVG
plots_to_save = [
    (auroc, '../visualizations/auroc.svg'),
    (count, '../visualizations/count.svg'),
]
for figure, destination in plots_to_save:
    ggsave(figure, destination, dpi=300, format='svg')

## Calculate Associations
Test for an association between AUROC and sample count (Pearson correlation), and for a difference in AUROC between URT sampling sites (two-sample t-test)

In [None]:
# Pearson correlation between per-study AUROC and per-study sample count
auc_values = df['AUC']
sample_counts = df['count']
scipy.stats.pearsonr(auc_values, sample_counts)

In [None]:
# independent two-sample t-test on AUROC between the ' NP' and ' OP' groups
# (the URT labels carry a leading space, so the comparison strings must too)
np_auc = df.loc[df['URT'] == ' NP', 'AUC']
op_auc = df.loc[df['URT'] == ' OP', 'AUC']
scipy.stats.ttest_ind(np_auc, op_auc)

No significant associations