In [1]:
import pandas as pd
import numpy as np
from protlearn.features import aaindex1
from protlearn.preprocessing import remove_unnatural
from bioservices import UniProt

#### Merge HEK293T and HCT116 PPIs

In [2]:
# BioPlex positives: protein pairs listed in both the 293T and the HCT116 network files.
t = pd.read_csv('BioPlex_293T_Network_10K_Dec_2019.tsv', sep='\t')[['UniprotA','UniprotB']]
h = pd.read_csv('BioPlex_HCT116_Network_5.5K_Dec_2019.tsv', sep='\t')[['UniprotA','UniprotB']]
th = pd.concat([t, h])
# duplicated() marks every repeat occurrence of an (UniprotA, UniprotB) row in the
# concatenation, so the rows kept here are pairs that occur more than once overall.
# NOTE(review): this equals "present in both cell lines" only if each file lists a
# pair at most once; reversed pairs (A,B)/(B,A) are not matched -- confirm.
th = th[th.duplicated()]
# NOTE(review): only UniprotA is screened for the 'UNKNOWN' placeholder; UniprotB is
# left untouched here -- confirm that is intended.
# .copy() gives an independent frame so the column assignment below does not write
# into a filtered slice (avoids pandas' SettingWithCopyWarning).
th = th[th.UniprotA != 'UNKNOWN'].copy()
th['Interaction'] = 1

# BioPlex negatives: unfiltered pairs with pInt < 0.1 that occur more than once
# across the two cell-line tables.
# NOTE(review): pInt is presumably the interaction probability -- confirm the 0.1 cutoff.
ut = pd.read_csv('BioPlex_BaitPreyPairs_noFilters_293T_10K_Dec_2019.tsv', sep='\t')
uh = pd.read_csv('BioPlex_BaitPreyPairs_noFilters_HCT116_5.5K_Dec_2019.tsv', sep='\t')
uth = pd.concat([
    ut.loc[ut.pInt < 0.1, ['bait_geneid', 'db_protein_id']],
    uh.loc[uh.pInt < 0.1, ['bait_geneid', 'db_protein_id']],
])
uth = uth[uth.duplicated()].copy()
# Second '|'-separated field of db_protein_id is used as the UniProt accession
# (presumably ids look like 'db|ACCESSION|NAME' -- confirm). The vectorised .str[1]
# yields NaN rather than raising when an id is missing or has no second field.
uth['UniprotB'] = uth.db_protein_id.str.split('|').str[1]
uth['Interaction'] = 0
th.shape, uth.shape

((14947, 3), (1714911, 4))

#### Map NCBI Entrez Gene IDs to UniProt accession numbers and sequences

In [3]:
u = UniProt(verbose=False)

# Map the bait Entrez Gene IDs to UniProt accessions with one space-separated query.
uth_geneid = u.mapping("P_ENTREZGENEID", "ACC", query=' '.join(map(str, uth.bait_geneid.unique())))
# One row per key of the returned mapping; column 0 is the key, column 1 its value.
uth_geneid = pd.DataFrame([(i, uth_geneid[i]) for i in uth_geneid])
# NOTE(review): the last two rows are dropped unconditionally -- presumably they are
# non-mapping entries in the service response; confirm against the bioservices output.
uth_geneid = uth_geneid.iloc[:-2]
# Expand list-valued accessions to one row each, then keep the first row per accession.
uth_geneid = uth_geneid.explode(1).drop_duplicates(subset=1)
uth_geneid.columns = ['bait_geneid','UniprotA']
# Persist the mapping: a later cell reloads 'uth_geneid.pkl.gz', so it must be
# written here for the notebook to run top-to-bottom on a fresh checkout.
uth_geneid.to_pickle('uth_geneid.pkl.gz')

# Every accession that appears on either side of a positive or negative pair.
uniprot_id = (
    set(th.UniprotA.unique())
    | set(th.UniprotB.unique())
    | set(uth_geneid.UniprotA.unique())
    | set(uth.UniprotB.unique())
)
# Download in 1446 batches rather than one request.
# NOTE(review): 1446 is a hard-coded batch count, presumably tuned to the size of
# this accession set -- confirm, or derive it from len(uniprot_id).
uniprot_id = np.array_split(list(uniprot_id), 1446)
uniprot_seq = pd.concat([u.get_df(list(i), limit=None) for i in uniprot_id])
uniprot_seq = uniprot_seq.drop_duplicates('Entry')

#### Compute AAindex

In [5]:
# Keep reviewed entries only. .copy() makes the filtered frame independent so the
# column assignments below do not write into a slice (avoids SettingWithCopyWarning).
uniprot_seq = uniprot_seq[uniprot_seq.Status=='reviewed'].copy()
# Literal replacement of 'U' by 'C' (presumably selenocysteine -> cysteine so that
# only standard residues reach aaindex1 -- confirm); sequences containing 'X' are dropped.
uniprot_seq['Sequence'] = uniprot_seq.Sequence.str.replace('U', 'C', regex=False)
uniprot_seq = uniprot_seq[~uniprot_seq.Sequence.str.contains('X', regex=False)].copy()

# One AAindex1 feature vector per sequence; the second return value is not used below.
aaind, inds = aaindex1(uniprot_seq.Sequence.tolist())
uniprot_seq['aaind'] = list(aaind)

# NOTE(review): this reload expects 'uth_geneid.pkl.gz' to already exist on disk and
# replaces any in-memory uth_geneid; nothing in this cell creates the file. Only
# unpickle files you produced yourself.
uth_geneid = pd.read_pickle('uth_geneid.pkl.gz')
# Align key dtypes before joining the gene-ID mapping onto the negative pairs.
uth['bait_geneid'] = uth.bait_geneid.astype(int)
uth_geneid['bait_geneid'] = uth_geneid.bait_geneid.astype(int)
# NOTE(review): uth is overwritten without its bait_geneid column, so re-running this
# cell without re-running the cell that builds uth will fail on the astype line above.
uth = pd.merge(uth_geneid, uth, on='bait_geneid')[['UniprotA','UniprotB','Interaction']]


# Attach sequence and features for UniprotA, then for UniprotB. In the second merge
# uniprot_seq is the left frame, so the '_x' columns belong to UniprotB and the '_y'
# columns to UniprotA.
df_int = pd.merge(uniprot_seq.rename(columns={'Entry':'UniprotA'})[['UniprotA','Sequence','aaind']],
                  pd.concat([th,uth]), on='UniprotA')
df_int = pd.merge(uniprot_seq.rename(columns={'Entry':'UniprotB'})[['UniprotB','Sequence','aaind']], df_int, on='UniprotB')
# NOTE(review): if a pair carries both labels, the first row in merge order is the
# one that survives -- confirm that is acceptable.
df_int = df_int.drop_duplicates(['UniprotA','UniprotB'])
# Pair feature = element-wise mean of the two proteins' AAindex1 vectors.
df_int['aaind'] = list(np.mean([df_int.aaind_x.tolist(), df_int.aaind_y.tolist()], axis=0))
df_int.to_pickle('bioplex_interactions_aaindex1.pkl.gz')
df_int.Interaction.value_counts()

0    1344588
1       8716
Name: Interaction, dtype: int64