In [1]:
#sfisch6: AutoGluon version 0.4.0 is needed to use our model
from autogluon.tabular import TabularPredictor
#sfisch6: we use this module to pull our trained model and full feature matrix from huggingface
from huggingface_hub import snapshot_download, hf_hub_download
import pandas as pd

In [2]:
#sfisch6: download a snapshot of the model repository from huggingface,
#         then load the trained predictor from the model folder inside it
# NOTE(review): this loads serialized model artifacts fetched from a remote repo —
#               confirm the repo is trusted before running
model_dir = snapshot_download(repo_id="sfisch/hu.MAP3.0_AutoGluon")
model_name = "huMAP3_20230503_complexportal_subset10kNEG_notScaled_accuracy"
predictor = TabularPredictor.load(model_dir + "/" + model_name)

Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

In [3]:
#sfisch6: fetch the gzipped full feature matrix from the huggingface dataset repo
full_fpath = hf_hub_download(
    repo_id="sfisch/hu.MAP3.0",
    repo_type='dataset',
    filename='full/humap3_full_feature_matrix_20220625.csv.gz',
)

#sfisch6: only the first 500 rows are read here to keep the example small;
#         the same predictions can be run on the full feature matrix
full_featmat = pd.read_csv(
    full_fpath,
    nrows=500,
    compression="gzip",
)

In [4]:
# remove the two pair-identifier columns (acc1, acc2) before handing the matrix to the predictor;
# full_featmat itself is left unchanged
full_trim = full_featmat.drop(columns=["acc1", "acc2"])

In [5]:
# preview the first five rows of the trimmed matrix
full_trim.head(n=5)

Unnamed: 0,Ce_1111_poisson,Ce_1111_wcc,Ce_1111_apex,Ce_1111_pq_euc,Ce_6mg_1203_poisson,Ce_6mg_1203_wcc,Ce_6mg_1203_apex,Ce_6mg_1203_pq_euc,Ce_BNF_wan_60_1209_poisson,Ce_BNF_wan_60_1209_wcc,...,neg_ln_pval_bp3_293T_Z4,pair_count_bp3_293T_Z4,neg_ln_pval_bp3_HCT116_Z2,pair_count_bp3_HCT116_Z2,neg_ln_pval_bp3_HCT116_Z4,pair_count_bp3_HCT116_Z4,neg_ln_pval_bp3_293T_HCT116_Z2,pair_count_bp3_293T_HCT116_Z2,neg_ln_pval_bp3_293T_HCT116_Z4,pair_count_bp3_293T_HCT116_Z4
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,1.535341,1.0,,,,,0.498518,1.0,1.412862,1.0
3,,,,,,,,,,,...,,,1.621166,2.0,,,0.909286,3.0,,
4,,,,,,,,,,,...,1.636575,1.0,4.565785,5.0,,,7.99066,12.0,1.340686,1.0


In [6]:
# predict a class label for every row (protein pair) of the trimmed matrix and print the result
y_pred = predictor.predict(data=full_trim)
print("Predictions:  \n", y_pred)

Predictions:  
 0      1
1     -1
2     -1
3     -1
4     -1
      ..
495   -1
496   -1
497   -1
498   -1
499   -1
Name: IntAct_train, Length: 500, dtype: int64


In [7]:
# compute the per-class probabilities behind the predictions (one column per class label)
pred_df_probs = predictor.predict_proba(data=full_trim)

In [8]:
# inspect the probabilities ordered by the interaction class (column label 1), lowest first
pred_df_probs.sort_values(by=1, ascending=True)

Unnamed: 0,-1,1
46,0.999630,0.000370
7,0.999609,0.000391
337,0.999563,0.000437
14,0.999557,0.000443
350,0.999552,0.000448
...,...,...
365,0.000902,0.999098
379,0.000425,0.999575
232,0.000396,0.999604
230,0.000305,0.999695


In [9]:
# attach the interaction-class (label 1) probability to the original matrix, which still
# carries the acc1/acc2 identifiers; pandas aligns the values on the row index
full_featmat["pred_prob"] = pred_df_probs.loc[:, 1]

In [10]:
# preview the first five rows, now including the new pred_prob column
full_featmat.head(n=5)

Unnamed: 0,acc1,acc2,Ce_1111_poisson,Ce_1111_wcc,Ce_1111_apex,Ce_1111_pq_euc,Ce_6mg_1203_poisson,Ce_6mg_1203_wcc,Ce_6mg_1203_apex,Ce_6mg_1203_pq_euc,...,pair_count_bp3_293T_Z4,neg_ln_pval_bp3_HCT116_Z2,pair_count_bp3_HCT116_Z2,neg_ln_pval_bp3_HCT116_Z4,pair_count_bp3_HCT116_Z4,neg_ln_pval_bp3_293T_HCT116_Z2,pair_count_bp3_293T_HCT116_Z2,neg_ln_pval_bp3_293T_HCT116_Z4,pair_count_bp3_293T_HCT116_Z4,pred_prob
0,P04217,P04217,,,,,,,,,...,,,,,,,,,,0.982139
1,Q9UBI1,Q92667,,,,,,,,,...,,,,,,,,,,0.003521
2,Q9UBI1,P38432,,,,,,,,,...,1.0,,,,,0.498518,1.0,1.412862,1.0,0.003294
3,Q9UBI1,Q92804,,,,,,,,,...,,1.621166,2.0,,,0.909286,3.0,,,0.002296
4,Q9UBI1,Q01650,,,,,,,,,...,1.0,4.565785,5.0,,,7.99066,12.0,1.340686,1.0,0.006896


In [12]:
# rank the pairs from highest to lowest predicted probability, keep only the two
# identifier columns plus the score, and write them as a headerless tab-separated file
ranked_pairs = full_featmat.sort_values("pred_prob", ascending=False)
pairs_with_prob = ranked_pairs[['acc1', 'acc2', 'pred_prob']]
pairs_with_prob.to_csv(
    "huMAP3_unlabeledpredictions_first_500pairs.pairsWprob",
    sep='\t',
    header=False,
    index=False,
)
