This notebook contains the code to generate classifiers, plots and statistics for models M1-M6 (see the article methods section).
Source code for the pipeline used can be found in src/pipeline.py

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src import pipeline
from sklearn import ensemble, model_selection

Data import setup:

In [None]:
# Input tables: the provirus-type table feeds M1-M3, the longitudinal table feeds M4-M6.
provirus_type_data_path = "data/Table_5_df_complete_2_799.csv"
# NOTE: the variable name keeps its original spelling because later cells reference it.
longitudianl_data_path = "data/Table_6_df_features_IS_longitudinal.csv"

# Column passed to the pipeline as `index_name`.
index_col = "Description"

# Feature columns grouped by attribute category; passed to the pipeline as `X_names`.
feature_names = dict(
    cat1=[
        "is_other_cell",
        "is_CD4_T_cell",
        "is_CD8_T_cell",
        "is_B_cell",
        "is_myeloid_cell",
        "is_proinflammatory_factor",
        "Response",
        "Count",
        "RF",
    ],
    cat2=["tpm"],
    cat3=["atac_count", "contactCount"],
)

Base model and model tuning setup:

In [None]:
# Single seed shared by the estimator and the tuning cross-validator so that
# hyperparameter tuning gives the same result after Restart Kernel -> Run All.
# The value itself is arbitrary.
# NOTE(review): any additional randomness inside src/pipeline.py is not visible
# from this notebook and may need its own seed -- confirm.
RANDOM_SEED = 42

# Untuned estimator handed to every model's tune_model() call.
base_model = ensemble.RandomForestClassifier(random_state=RANDOM_SEED)

# Hyperparameter grid for the random forest (2 * 2 * 2 * 3 * 2 * 2 = 96 combinations).
rfc_param_grid = {
    "n_estimators": [100, 20],
    "criterion": ["gini", "log_loss"],
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [3, 5, 10],
    "min_samples_leaf": [1, 4],
    "class_weight": [None, "balanced"],
}

# Stratified 5-fold cross-validation repeated 10 times (50 splits per candidate),
# with a fixed seed so the fold assignment is reproducible.
tuning_cv = model_selection.RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=10,
    random_state=RANDOM_SEED,
)

1. M1 - multiclass classification of enriched signatures harboring intact- versus defective proviruses in ART-treated patients and elite controllers.

In [None]:
# M1: provirus-type table, target column 'type', no row filter.
M1 = pipeline.ClassificationPipeline(
    data_path=provirus_type_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="type",
)
M1.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on macro-averaged F1 (multiclass target),
# evaluate, then draw the KDE and box plots.
M1.select_n_features(5)
M1.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1_macro",
)
M1.evaluate_model()
M1.draw_plots(kind="kde")
M1.draw_plots(kind="box")

In [None]:
# Run pipeline.test_median_f1s on the 'Cat. 1' F1 scores against each extended attribute set.
pipeline.test_median_f1s(M1.f1s["Cat. 1"], M1.f1s["Cat. 1 & 2"])
pipeline.test_median_f1s(M1.f1s["Cat. 1"], M1.f1s["Cat. 1 & 3"])
pipeline.test_median_f1s(M1.f1s["Cat. 1"], M1.f1s["Cat. 1, 2 & 3"])

2. M2 - binary classification of enriched signatures harboring intact- versus defective proviruses in ART-treated patients.

In [None]:
# M2: provirus-type table, target column 'provirus', filtered on patient == 'ART'.
M2 = pipeline.ClassificationPipeline(
    data_path=provirus_type_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="provirus",
    filter_col="patient",
    filter_val="ART",
)
M2.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on F1 (binary target),
# evaluate, then draw the KDE and box plots.
M2.select_n_features(5)
M2.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1",
)
M2.evaluate_model()
M2.draw_plots(kind="kde")
M2.draw_plots(kind="box")

In [None]:
# Run pipeline.test_median_f1s on the 'Cat. 1' F1 scores against each extended attribute set.
pipeline.test_median_f1s(M2.f1s["Cat. 1"], M2.f1s["Cat. 1 & 2"])
pipeline.test_median_f1s(M2.f1s["Cat. 1"], M2.f1s["Cat. 1 & 3"])
pipeline.test_median_f1s(M2.f1s["Cat. 1"], M2.f1s["Cat. 1, 2 & 3"])

3. M3 - binary classification of enriched signatures harboring intact- versus defective proviruses in elite controllers.

In [None]:
# M3: provirus-type table, target column 'provirus', filtered on patient == 'EC'.
M3 = pipeline.ClassificationPipeline(
    data_path=provirus_type_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="provirus",
    filter_col="patient",
    filter_val="EC",
)
M3.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on F1 (binary target),
# evaluate, then draw the KDE and box plots.
M3.select_n_features(5)
M3.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1",
)
M3.evaluate_model()
M3.draw_plots(kind="kde")
M3.draw_plots(kind="box")

In [None]:
# Run pipeline.test_median_f1s on the 'Cat. 1' F1 scores against each extended attribute set.
pipeline.test_median_f1s(M3.f1s["Cat. 1"], M3.f1s["Cat. 1 & 2"])
pipeline.test_median_f1s(M3.f1s["Cat. 1"], M3.f1s["Cat. 1 & 3"])
pipeline.test_median_f1s(M3.f1s["Cat. 1"], M3.f1s["Cat. 1, 2 & 3"])

4. Comparison of M2 and M3

In [None]:
# Reshape each model's F1 table to long form ('variable' / 'value' columns)
# and tag every row with the patient group it came from.
f1s_art_molten = M2.f1s.melt()
f1s_art_molten['patients']='ART'
f1s_ec_molten = M3.f1s.melt()
f1s_ec_molten['patients']='EC'
# ignore_index=True gives the combined frame a unique row index; the two melted
# frames would otherwise share the same labels.
f1s_combined = pd.concat([f1s_art_molten, f1s_ec_molten], ignore_index=True)

# Apply the seaborn theme / font scale BEFORE the figure is created: the settings
# are read when axes are built, so setting them afterwards only takes effect on a
# re-run of the cell and makes the first run look different.
sns.set(font_scale=1.2)
fig, ax = plt.subplots()
# Draw on the axes created above explicitly rather than on the implicit current axes.
sns.boxplot(data=f1s_combined, x='variable', y='value',palette="Dark2", hue='patients', ax=ax)
ax.set_ylim(0, 1.2)
ax.set_ylabel("F1 score")
ax.set_xlabel("Attribute categories")
#plt.savefig("data/output/f1s_art_vs_ec.svg", dpi=600, format="svg")
print(f"Median F1 scores (ART):\n{M2.f1s.median()}")
print(f"Median F1 scores (EC):\n{M3.f1s.median()}")

In [None]:
# Run pipeline.test_median_f1s on ART (M2) versus EC (M3) F1 scores, one attribute set at a time.
pipeline.test_median_f1s(M2.f1s["Cat. 1"], M3.f1s["Cat. 1"])
pipeline.test_median_f1s(M2.f1s["Cat. 1 & 2"], M3.f1s["Cat. 1 & 2"])
pipeline.test_median_f1s(M2.f1s["Cat. 1 & 3"], M3.f1s["Cat. 1 & 3"])
pipeline.test_median_f1s(M2.f1s["Cat. 1, 2 & 3"], M3.f1s["Cat. 1, 2 & 3"])

5. M4 - multiclass classification of immunologic signatures enriched in pretreatment HIV-1-infected individuals, patients subjected to short and long periods of ART, and elite controllers.

In [None]:
# M4: longitudinal table, target column 'type', no row filter.
M4 = pipeline.ClassificationPipeline(
    data_path=longitudianl_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="type",
)
M4.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on macro-averaged F1 (multiclass target),
# evaluate, then draw the KDE and box plots.
M4.select_n_features(5)
M4.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1_macro",
)
M4.evaluate_model()
M4.draw_plots(kind="kde")
M4.draw_plots(kind="box")

6. M5 - as in M4, but excluding elite controllers.

In [None]:
# M5: longitudinal table, target column 'type', excluding rows whose 'type' is 'EC_long'.
M5 = pipeline.ClassificationPipeline(
    data_path=longitudianl_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="type",
    filter_col="type",
    filter_exclude="EC_long",
)
M5.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on macro-averaged F1 (multiclass target),
# evaluate, then draw the KDE and box plots.
M5.select_n_features(5)
M5.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1_macro",
)
M5.evaluate_model()
M5.draw_plots(kind="kde")
M5.draw_plots(kind="box")

7. M6 - as in M4, but excluding pretreatment HIV-1-infected individuals.

In [None]:
# M6: longitudinal table, target column 'type', excluding rows whose 'type' is 'ART_untreat'.
M6 = pipeline.ClassificationPipeline(
    data_path=longitudianl_data_path,
    index_name=index_col,
    X_names=feature_names,
    y_name="type",
    filter_col="type",
    filter_exclude="ART_untreat",
)
M6.calc_feat_importance()

In [None]:
# Select 5 features, tune the random forest on macro-averaged F1 (multiclass target),
# evaluate, then draw the KDE and box plots.
M6.select_n_features(5)
M6.tune_model(
    base_model=base_model,
    param_grid=rfc_param_grid,
    crossvalidator=tuning_cv,
    scoring_metric="f1_macro",
)
M6.evaluate_model()
M6.draw_plots(kind="kde")
M6.draw_plots(kind="box")