In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import db_utils, sqlalchemy
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
from scipy.stats import fisher_exact
from scipy import stats
import math
import os
import gzip
import re
import glob
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import math

In [None]:
# Define a function that ensures an element is a list of strings
def ensure_list_of_strings(value):
    if isinstance(value, list):
        return [str(val) for val in value]
    elif isinstance(value, str):
        # Split the string by comma or any other delimiter used in the data
        return [val.strip() for val in value.split(',')]
    else:
        return []

In [3]:
# Connect to the MySQL database and read the preprocessed 3did into a DataFrame
eng = sqlalchemy.create_engine('mysql://', creator= db_utils.get_connection)
query= pd.read_sql_query('''select * from chopyan_db.3did_agg_DDI_PDB as df''', con= eng)
DDI_CS = pd.DataFrame(query)
DDI_CS

Unnamed: 0,DDI_type,PDB_ID,Chain_ID1,DomainID1,DomainName1,DomainStart1,DomainEnd1,Chain_ID2,DomainID2,DomainName2,DomainStart2,DomainEnd2,Score,Zscore,Chain1Iface,Chain2Iface,isInterchain
0,PF00001_PF00001,1gzm,A,PF00001,7tm_1,54,306,B,PF00001,7tm_1,54,306,26.18,7.82923,"E:196,E:196,E:196,E:196,E:197,N:199,E:201,E:20...","F:228,K:231,E:232,A:233,E:232,F:228,V:227,F:22...",1
1,PF00001_PF00001,1hzx,B,PF00001,7tm_1,54,246,B,PF00001,7tm_1,224,306,42.49,7.03413,"I:54,I:54,I:54,N:55,N:55,N:55,T:58,L:72,L:72,L...","V:300,P:303,V:304,A:299,V:300,P:303,Y:306,V:25...",0
2,PF00001_PF00001,1l9h,B,PF00001,7tm_1,54,246,B,PF00001,7tm_1,224,306,40.17,9.39410,"I:54,I:54,N:55,N:55,N:55,T:58,L:72,L:72,L:72,L...","V:300,P:303,A:299,V:300,P:303,Y:306,V:250,M:25...",0
3,PF00001_PF00001,2j4y,A,PF00001,7tm_1,54,306,B,PF00001,7tm_1,54,306,26.85,6.86878,"N:199,E:201,E:201,E:201,S:202,S:202,I:205,V:20...","Q:236,F:228,K:231,E:232,F:228,E:232,F:228,F:22...",1
4,PF00001_PF00001,2vt4,C,PF00001,7tm_1,58,343,D,PF00001,7tm_1,58,343,20.02,6.36751,"L:152,V:160,T:164,T:164,A:167,I:168,I:168,A:17...","F:315,A:206,A:206,A:210,W:181,A:210,I:214,W:18...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235443,PF18841_PF17433,1wmr,A,PF18841,B_solenoid_dext,321,354,A,PF17433,Glyco_hydro_49N,19,184,9.52,4.52235,"D:341,D:341,D:341,K:343,K:343,V:345,V:345,V:34...","R:153,S:182,P:183,S:101,S:182,Q:97,W:99,L:178,...",0
235444,PF18841_PF17433,1x0c,B,PF18841,B_solenoid_dext,321,354,B,PF17433,Glyco_hydro_49N,19,184,9.52,4.45403,"D:341,D:341,D:341,K:343,K:343,V:345,V:345,V:34...","R:153,S:182,P:183,S:101,S:182,Q:97,W:99,L:178,...",0
235445,PF18841_PF17433,2z8g,A,PF18841,B_solenoid_dext,321,354,A,PF17433,Glyco_hydro_49N,19,184,9.24,4.49346,"D:341,D:341,D:341,K:343,K:343,V:345,V:345,V:34...","R:153,S:182,P:183,S:101,S:182,Q:97,W:99,L:178,...",0
235446,PF18841_PF17433,3wwg,C,PF18841,B_solenoid_dext,321,354,C,PF17433,Glyco_hydro_49N,19,184,9.24,4.61412,"D:341,D:341,D:341,K:343,K:343,V:345,V:345,V:34...","R:153,S:182,P:183,S:101,S:182,Q:97,W:99,L:178,...",0


In [4]:
# Group the DataFrame based in 'DDI_type' so that there is one row per DDI type and keep domain names as columns
# Create a new column holding a list of the two domain names calles 'domain_IDs'
DDI_grouped = DDI_CS.groupby('DDI_type').agg({'DomainName1': 'first', 'DomainName2': 'first'})
DDI_grouped['domain_IDs'] = [[Name1, Name2] for Name1, Name2 in zip(DDI_grouped['DomainName1'].tolist(), DDI_grouped['DomainName2'].tolist())]
DDI_grouped.reset_index(inplace=True)

In [5]:
DDI_grouped

In [8]:
# Create a dictionary holding the Pfam-Pfam combination of a DDI as key and the associated domain names as values
# Should later be used to translate the combinations of short name into the aquivalent DDI type 
DDI_type_domainIDs = {}
for DDI_type, domain_IDs in zip(DDI_grouped['DDI_type'].tolist(), DDI_grouped['domain_IDs'].tolist()):
    DDI_type_domainIDs[DDI_type] = domain_IDs

In [9]:
DDI_type_domainIDs

{'PF00001_PF00001': ['7tm_1', '7tm_1'],
 'PF00001_PF00048': ['7tm_1', 'IL8'],
 'PF00001_PF00085': ['7tm_1', 'Thioredoxin'],
 'PF00001_PF00087': ['7tm_1', 'Toxin_TOLIP'],
 'PF00001_PF00322': ['7tm_1', 'Endothelin'],
 'PF00001_PF00339': ['7tm_1', 'Arrestin_N'],
 'PF00001_PF00400': ['7tm_1', 'WD40'],
 'PF00001_PF00503': ['7tm_1', 'G-alpha'],
 'PF00001_PF00516': ['7tm_1', 'GP120'],
 'PF00001_PF00534': ['7tm_1', 'Glycos_transf_1'],
 'PF00001_PF00959': ['7tm_1', 'Phage_lysozyme'],
 'PF00001_PF02202': ['7tm_1', 'Tachykinin'],
 'PF00001_PF02752': ['7tm_1', 'Arrestin_C'],
 'PF00001_PF07361': ['7tm_1', 'Cytochrom_B562'],
 'PF00001_PF07686': ['7tm_1', 'V-set'],
 'PF00001_PF10413': ['7tm_1', 'Rhodopsin_N'],
 'PF00002_PF00002': ['7tm_2', '7tm_2'],
 'PF00002_PF00123': ['7tm_2', 'Hormone_2'],
 'PF00002_PF00214': ['7tm_2', 'Calc_CGRP_IAPP'],
 'PF00002_PF00301': ['7tm_2', 'Rubredoxin'],
 'PF00002_PF00473': ['7tm_2', 'CRF'],
 'PF00002_PF00503': ['7tm_2', 'G-alpha'],
 'PF00002_PF00534': ['7tm_2', 'Glycos

In [10]:
# ProtCID files were received from the authors; there is one file for each domain combination that has
# at least one cluster
# Files were named in the way (ShortName Domain1)(ShortName Domain2)_xxxx.txt.gz
# Processing steps:
# Unzip the gzipped files in the source directory (dir_name) into another directory (dir1_name)

dir_name = '/Users/johgeist/Documents/AG_Luck/Project_Thesis/Files/ProtCID/DomainClusterData_20211020/'
dir1_name = '/Users/johgeist/Documents/AG_Luck/Project_Thesis/Files/ProtCID/DomainClusterData_20211020_unzipped/'


def decompress(infile, tofile):
    with open(infile, 'rb') as inf, open(tofile, 'w', encoding='utf8') as tof:
        decom_str = gzip.decompress(inf.read()).decode('utf-8')
        tof.write(decom_str)
        
#for infile in glob.glob(dir_name + '*.gz'):
    #tofile = dir1_name + '/' + infile.split('/')[-1][:-3]
    #print(tofile)
    #decompress(infile, tofile)


In [11]:
# ProtCID files contain a table-like structure that has a summary row before each cluster containing the surface 
# area and metrics from the individual 3d structures in one cluster
# Followed by that summarizing row there are rows referring to a single 3D structure within the cluster.
# Parse the ProtCID files and generate a DataFrame that stores all relevant information for each cluster of a DDI
# type.
# Make two temporary dataframes (one of the summarizing row and one grouping the single 3D structures of one 
# cluster together, that were joined to the final df_joined after dropping unnecessary information. 
# Add additional columns that save the domain names, the file name, the ClusterID and the DDI type from
# the dictionary I created before.
file = '/Users/johgeist/Documents/AG_Luck/Project_Thesis/Files/ProtCID/DomainClusterData_20211020_unzipped/(Adenine_deam_C)(Amidohydro_1)_1471.txt'
domain_names = re.findall(r'\((.*?)\)', file)
df = pd.read_csv(file, index_col=False, sep='\t', low_memory=False)
temp = df.drop_duplicates(subset='ClusterID',keep='first')
temp = temp.drop(labels = ['Relation', 'CFID', 'SpaceGroup', 'CrystForm', 'PdbID', 'DomainInterfaceID',
                           'InterfaceUnit', 'ChainPfamArch', 'InAsu', 'InPdb', 'InPisa', 'ASU', 'PDBBA',
                           'PDBBAID', 'PISABA', 'PISABAID', 'UnpCode', 'Name', 'Species', '#CfgRelation',
                           '#EntryRelation' , '#CfgCluster','#EntryCluster', '#EntryHomo', '#EntryHetero',
                           '#EntryIntra', 'ClusterInterface'], axis=1)
temp = temp[temp['ClusterID'].apply(lambda x: str(x).isdigit())]
temp_1 = df.dropna(subset=['CFID']).groupby('ClusterID').agg({'PdbID':lambda x: ','.join(set([i for i in x if type(i) == str])),
                        #'ChainPfamArch':lambda x: ','.join(set([i for i in x if type(i) == str])),
                        'UnpCode':lambda x: ','.join(set([i for i in x if type(i) == str])),
                        'Species':lambda x: ','.join(set([i for i in x if type(i) == str])), 
                        #'CrystForm':lambda x: ','.join(set([i for i in x if type(i) == str])),
                        'CFID': lambda x: ','.join(set([str(i) for i in x if type(i) == float]))})
#temp_1.reset_index(inplace = True)
#temp_1.set_index('ClusterID', inplace = True)
#temp.reset_index(inplace = True)
temp.set_index('ClusterID', inplace= True)
df_joined = temp.join(temp_1, on='ClusterID', how='left')
# #df_joined['DDI_type'] = DDI_type_domainIDs.get(':'.join(domain_names), np.nan)
df_joined.reset_index(inplace= True)
for DDI_type, IDs in DDI_type_domainIDs.items():
    if len(domain_names) == 1:
        if (domain_names[0] == IDs[0]) and (domain_names[0] == IDs[1]):
            df_joined.insert(0, 'DDI_type', DDI_type)   
    elif len(domain_names) > 1:   
        if (domain_names[0] in IDs) and (domain_names[1] in IDs):
            df_joined.insert(0, 'DDI_type', DDI_type)
df_joined.insert(0, 'DomainNames', ' '.join(domain_names))
df_joined.insert(1, 'FileName', file.split('/')[-1])



for file in glob.glob(dir1_name + '*.txt'):
    if file == '/Users/johgeist/Documents/AG_Luck/Project_Thesis/Files/ProtCID/DomainClusterData_20211020_unzipped/(Adenine_deam_C)(Amidohydro_1)_1471.txt':
        pass
    else:
        df_joined = df_joined.copy(deep= True)
        domain_names = re.findall(r'\((.*?)\)', file)
        
        df = pd.read_csv(file, index_col=False, sep='\t', low_memory=False)
        temp = df.drop_duplicates(subset='ClusterID',keep='first')
        temp = temp.drop(labels = ['Relation', 'CFID', 'SpaceGroup', 'CrystForm', 'PdbID', 'DomainInterfaceID',
                                   'InterfaceUnit', 'ChainPfamArch', 'InAsu', 'InPdb', 'InPisa', 'ASU', 'PDBBA',
                                   'PDBBAID', 'PISABA', 'PISABAID', 'UnpCode', 'Name', 'Species', '#CfgRelation',
                                   '#EntryRelation' , '#CfgCluster','#EntryCluster', '#EntryHomo', '#EntryHetero',
                                   '#EntryIntra', 'ClusterInterface'], axis=1)
        temp = temp[temp['ClusterID'].apply(lambda x: str(x).isdigit())]
        temp_1 = df.dropna(subset=['CFID']).groupby('ClusterID').agg({'PdbID':lambda x: ','.join(set([i for i in x if type(i) == str])),
                                #'ChainPfamArch':lambda x: ','.join(set([i for i in x if type(i) == str])),
                                'UnpCode':lambda x: ','.join(set([i for i in x if type(i) == str])),
                                'Species':lambda x: ','.join(set([i for i in x if type(i) == str])),
                                #'CrystForm':lambda x: ','.join(set([i for i in x if type(i) == str])),
                                'CFID': lambda x: ','.join(set([str(i) for i in x if type(i) == float]))})
        #temp_1.reset_index(inplace = True)
        #temp_1.set_index('ClusterID', inplace = True)
        #temp.reset_index(inplace = True)
        temp.set_index('ClusterID', inplace= True)
        df_joined1 = temp.join(temp_1, on='ClusterID', how='left' )
        #df_joined1['DDI_type'] = DDI_type_domainIDs.get(':'.join(domain_names), np.nan)
        df_joined1.reset_index(inplace= True)
        for DDI_type, IDs in DDI_type_domainIDs.items():
            if len(domain_names) == 1:
                if (domain_names[0] == IDs[0]) and (domain_names[0] == IDs[1]):
                    df_joined1['DDI_type'] = DDI_type
            elif len(domain_names) > 1:   
                if (domain_names[0] in IDs) and (domain_names[1] in IDs):
                    df_joined1['DDI_type'] = DDI_type
        df_joined1.insert(0, 'DomainNames', ' '.join(domain_names))
        df_joined1.insert(1, 'FileName', file.split('/')[-1])
        df_joined= pd.concat([df_joined,df_joined1], ignore_index = True)

In [12]:
df_joined

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID
0,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,1,1622.5920,78.0,1660.35,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
1,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,2,366.8667,78.0,380.60,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
2,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,3,159.1500,78.0,163.85,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
3,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"5oqm,5n9j","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0"
4,Synaptobrevin,(Synaptobrevin)_17022.txt,PF00957_PF00957,1,348.0500,100.0,366.90,"6wvw,1sfc",VAMP2_RAT,Rattus norvegicus,"8.0,5.0"
...,...,...,...,...,...,...,...,...,...,...,...
64561,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,4,181.0100,64.0,214.55,"2v41,5b6n,5b6m","Q1AN22_AREMA,PRDX6_HUMAN","Homo sapiens,ARENICOLA MARINA","9.0,30.0"
64562,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,5,351.3625,48.0,396.80,"3sbc,5jcg,5ucx","TSA1_YEAST,PRDX3_HUMAN","Homo sapiens,Saccharomyces cerevisiae","18.0,5.0"
64563,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,6,130.6750,48.0,133.70,"4k1f,2i81","A5K421_PLAVS,Q4QF76_LEIMA","Leishmania major,Plasmodium vivax SaI-1","16.0,15.0"
64564,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,7,179.2750,55.0,249.65,"4k1f,2z9s","PRDX1_RAT,Q4QF76_LEIMA","Leishmania major,Rattus norvegicus","16.0,24.0"


In [22]:
# In df_joined the ClusterIDs were provided by the ProtCID files partly as string -> change the data type of 
# the column to integers
df_joined['ClusterID'] = pd.to_numeric(df_joined['ClusterID'])

In [23]:
df_joined

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID
0,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,1,1622.5920,78.0,1660.35,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
1,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,2,366.8667,78.0,380.60,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
2,Adenine_deam_C Amidohydro_1,(Adenine_deam_C)(Amidohydro_1)_1471.txt,PF13382_PF01979,3,159.1500,78.0,163.85,"3t8l,3t81,3nqb",ADEC2_AGRFC;ADEC2_AGRFC,Agrobacterium tumefaciens;Agrobacterium tumefa...,1.0
3,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"5oqm,5n9j","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0"
4,Synaptobrevin,(Synaptobrevin)_17022.txt,PF00957_PF00957,1,348.0500,100.0,366.90,"6wvw,1sfc",VAMP2_RAT,Rattus norvegicus,"8.0,5.0"
...,...,...,...,...,...,...,...,...,...,...,...
64561,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,4,181.0100,64.0,214.55,"2v41,5b6n,5b6m","Q1AN22_AREMA,PRDX6_HUMAN","Homo sapiens,ARENICOLA MARINA","9.0,30.0"
64562,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,5,351.3625,48.0,396.80,"3sbc,5jcg,5ucx","TSA1_YEAST,PRDX3_HUMAN","Homo sapiens,Saccharomyces cerevisiae","18.0,5.0"
64563,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,6,130.6750,48.0,133.70,"4k1f,2i81","A5K421_PLAVS,Q4QF76_LEIMA","Leishmania major,Plasmodium vivax SaI-1","16.0,15.0"
64564,1-cysPrx_C,(1-cysPrx_C)_1.txt,PF10417_PF10417,7,179.2750,55.0,249.65,"4k1f,2z9s","PRDX1_RAT,Q4QF76_LEIMA","Leishmania major,Rattus norvegicus","16.0,24.0"


In [24]:
#df_joined.to_csv('/Users/johannageist/Documents/AG_Luck/Projektarbeit/Files/ProtCID/Master_Dataframe_ProtCID/master_df_ProtCID_20220201.csv', index= False, sep= '\t')

In [25]:
# Read in the previously generated 3did subset holding only those DDI types with at least one heterodimeric
# structure and one interchain interface as a DataFrame
df_inter_hetero = pd.read_csv('/Users/johgeist/Downloads/3did_project_dataframes_new/3did_metrics_inter_hetero_subset.csv', index_col=0)

In [26]:
df_inter_hetero

Unnamed: 0_level_0,DomainID1,DomainID2,DomainName1,DomainName2,Num_structures,Num_interchain_structures,Score,Zscore,contacts1,PDB_ID_max,Interchain_fraction,Num_intrachain_structures
DDI_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PF00001_PF00048,PF00001,PF00048,7tm_1,IL8,8,8,13.26,5.101720,51,5UIW,1.000000,0
PF00001_PF00085,PF00001,PF00085,7tm_1,Thioredoxin,7,6,1.57,1.963520,7,6H7O,0.857143,1
PF00001_PF00087,PF00001,PF00087,7tm_1,Toxin_TOLIP,1,1,15.76,5.828060,44,6WJC,1.000000,0
PF00001_PF00322,PF00001,PF00322,7tm_1,Endothelin,4,4,31.19,7.365980,78,5GLH,1.000000,0
PF00001_PF00339,PF00001,PF00339,7tm_1,Arrestin_N,6,3,18.24,5.657100,38,6TKO,0.500000,3
...,...,...,...,...,...,...,...,...,...,...,...,...
PF18782_PF00559,PF18782,PF00559,NAD2,Vif,1,1,7.74,3.639300,23,6NIL,1.000000,0
PF18784_PF00071,PF18784,PF00071,CRM1_repeat_2,Ras,34,34,8.76,4.306780,25,3NBZ,1.000000,0
PF18801_PF00072,PF18801,PF00072,RapH_N,Response_reg,1,1,10.48,4.961570,22,3Q15,1.000000,0
PF18806_PF00071,PF18806,PF00071,Importin_rep_3,Ras,1,1,0.05,0.237612,7,2X19,1.000000,0


In [27]:
# Filter df_joined for those clusters that include the max scoring structure in 3did
df_joined['PdbID'] = df_joined['PdbID'].str.split(',')

In [28]:
df_max_Score_joined = pd.merge(df_joined, df_inter_hetero.filter(['DDI_type', 'PDB_ID_max']), on= 'DDI_type')

In [29]:
df_max_Score_joined

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID,PDB_ID_max
0,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"[5oqm, 5n9j]","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0",5N9J
1,PI3Ka SH2,(PI3Ka)(SH2)_13501.txt,PF00613_PF00017,1,382.7833,97.0,347.40,"[5swo, 5swp, 5itd, 5ul1, 5xgh, 4l1b, 3hiz, 5sw...",PK3CA_HUMAN;P85A_HUMAN,Homo sapiens;Homo sapiens,"3.0,2.0,4.0,1.0,6.0",5SWR
2,Pox_Rap94 RNA_pol_Rpb2_7,(Pox_Rap94)(RNA_pol_Rpb2_7)_35453.txt,PF03294_PF04560,1,480.2750,99.0,527.35,"[6rfl, 6ric]","Q1PIU7_9POXV;B9U1Q1_9POXV,B9U1I7_9POXV;B9U1Q1_...",Vaccinia virus GLV-1h68;Vaccinia virus GLV-1h68,"2.0,1.0",6RFL
3,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,1,396.3250,33.0,408.70,"[6qi5, 6z7n, 5tx1, 6b1t, 3zif, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,7.0,4.0,5.0,1.0,6.0",6CGV
4,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,2,298.4818,25.0,374.75,"[6qi5, 6z7n, 5tx1, 6b1t, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,4.0,5.0,1.0,6.0",6CGV
...,...,...,...,...,...,...,...,...,...,...,...,...
15273,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,4,190.0111,82.0,180.90,"[3vh8, 7k81, 3wuw, 5t6z, 7k80, 5b39, 6v3j]","I3ZN84_HUMAN;KI3L1_HUMAN,A0A411J078_HUMAN;I6LE...",Homo sapiens;Homo sapiens,"10.0,12.0,3.0,4.0,11.0,16.0,9.0",5T70
15274,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,5,170.8200,82.0,164.70,"[3vh8, 5t70, 7k81, 3wuw, 5t6z, 7k80, 5b39, 6v3j]","KI3L1_HUMAN;HLAB_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,3.0,4.0,11.0,16.0,9.0",5T70
15275,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,6,228.2500,82.0,260.05,"[3vh8, 7k81, 3wuw, 7k80, 6v3j]","KI3L1_HUMAN;HLAB_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,3.0,4.0,11.0,9.0",5T70
15276,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,7,184.0000,82.0,190.45,"[7k80, 3vh8, 7k81, 3wuw]","A0A411J078_HUMAN;I6LEL9_HUMAN,A0A411J078_HUMAN...",Homo sapiens;Homo sapiens,"10.0,9.0,3.0,4.0",5T70


In [30]:
data_test = {'Column1': [70, 65, 80, 95],
            'Column2': [['A', 'B', 'C'], ['B', 'C', 'D'], ['E', 'F', 'G'], ['X', 'Y', 'Z']],
            'Column3': ['PF1_PF2', 'PF1_PF2', 'PF3_PF4', 'PF3_PF4'],
            'Column4': ['B', 'B', 'E', 'E']}
df_test = pd.DataFrame(data_test)
#mask = (~df_test['Column4'].isin(df_test['Column2']))
mask= df_test.apply(lambda row: row['Column4'] in row['Column2'], axis=1)
df_test_filtered = df_test[mask]
df_test_filtered

Unnamed: 0,Column1,Column2,Column3,Column4
0,70,"[A, B, C]",PF1_PF2,B
1,65,"[B, C, D]",PF1_PF2,B
2,80,"[E, F, G]",PF3_PF4,E


In [31]:
df_max_Score_joined['PDB_ID_max'] = df_max_Score_joined['PDB_ID_max'].astype(str)
df_max_Score_joined['PDB_ID_max'] = df_max_Score_joined['PDB_ID_max'].str.lower()

# Apply the ensure_list_of_strings function to the column to ensure that it holds lists of strings
df_max_Score_joined['PdbID'] = df_max_Score_joined['PdbID'].apply(ensure_list_of_strings)

In [32]:
df_max_Score_joined

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID,PDB_ID_max
0,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"[5oqm, 5n9j]","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0",5n9j
1,PI3Ka SH2,(PI3Ka)(SH2)_13501.txt,PF00613_PF00017,1,382.7833,97.0,347.40,"[5swo, 5swp, 5itd, 5ul1, 5xgh, 4l1b, 3hiz, 5sw...",PK3CA_HUMAN;P85A_HUMAN,Homo sapiens;Homo sapiens,"3.0,2.0,4.0,1.0,6.0",5swr
2,Pox_Rap94 RNA_pol_Rpb2_7,(Pox_Rap94)(RNA_pol_Rpb2_7)_35453.txt,PF03294_PF04560,1,480.2750,99.0,527.35,"[6rfl, 6ric]","Q1PIU7_9POXV;B9U1Q1_9POXV,B9U1I7_9POXV;B9U1Q1_...",Vaccinia virus GLV-1h68;Vaccinia virus GLV-1h68,"2.0,1.0",6rfl
3,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,1,396.3250,33.0,408.70,"[6qi5, 6z7n, 5tx1, 6b1t, 3zif, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,7.0,4.0,5.0,1.0,6.0",6cgv
4,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,2,298.4818,25.0,374.75,"[6qi5, 6z7n, 5tx1, 6b1t, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,4.0,5.0,1.0,6.0",6cgv
...,...,...,...,...,...,...,...,...,...,...,...,...
15273,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,4,190.0111,82.0,180.90,"[3vh8, 7k81, 3wuw, 5t6z, 7k80, 5b39, 6v3j]","I3ZN84_HUMAN;KI3L1_HUMAN,A0A411J078_HUMAN;I6LE...",Homo sapiens;Homo sapiens,"10.0,12.0,3.0,4.0,11.0,16.0,9.0",5t70
15274,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,5,170.8200,82.0,164.70,"[3vh8, 5t70, 7k81, 3wuw, 5t6z, 7k80, 5b39, 6v3j]","KI3L1_HUMAN;HLAB_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,3.0,4.0,11.0,16.0,9.0",5t70
15275,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,6,228.2500,82.0,260.05,"[3vh8, 7k81, 3wuw, 7k80, 6v3j]","KI3L1_HUMAN;HLAB_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,3.0,4.0,11.0,9.0",5t70
15276,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,7,184.0000,82.0,190.45,"[7k80, 3vh8, 7k81, 3wuw]","A0A411J078_HUMAN;I6LEL9_HUMAN,A0A411J078_HUMAN...",Homo sapiens;Homo sapiens,"10.0,9.0,3.0,4.0",5t70


In [33]:
# Create a boolean mask based on the condition
mask = df_max_Score_joined.apply(lambda row: row['PDB_ID_max'] in row['PdbID'], axis=1)
# Use the boolean mask to filter and keep only the desired rows
df_max_Score_joined_filtered = df_max_Score_joined[mask]

In [34]:
df_max_Score_joined_filtered

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID,PDB_ID_max
0,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"[5oqm, 5n9j]","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0",5n9j
1,PI3Ka SH2,(PI3Ka)(SH2)_13501.txt,PF00613_PF00017,1,382.7833,97.0,347.40,"[5swo, 5swp, 5itd, 5ul1, 5xgh, 4l1b, 3hiz, 5sw...",PK3CA_HUMAN;P85A_HUMAN,Homo sapiens;Homo sapiens,"3.0,2.0,4.0,1.0,6.0",5swr
2,Pox_Rap94 RNA_pol_Rpb2_7,(Pox_Rap94)(RNA_pol_Rpb2_7)_35453.txt,PF03294_PF04560,1,480.2750,99.0,527.35,"[6rfl, 6ric]","Q1PIU7_9POXV;B9U1Q1_9POXV,B9U1I7_9POXV;B9U1Q1_...",Vaccinia virus GLV-1h68;Vaccinia virus GLV-1h68,"2.0,1.0",6rfl
3,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,1,396.3250,33.0,408.70,"[6qi5, 6z7n, 5tx1, 6b1t, 3zif, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,7.0,4.0,5.0,1.0,6.0",6cgv
4,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,2,298.4818,25.0,374.75,"[6qi5, 6z7n, 5tx1, 6b1t, 6cgv, 6yba]","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,4.0,5.0,1.0,6.0",6cgv
...,...,...,...,...,...,...,...,...,...,...,...,...
15260,Fer4_7 NADH_Oxid_Nqo15,(Fer4_7)(NADH_Oxid_Nqo15)_8080.txt,PF12838_PF11497,1,323.8731,99.0,326.20,"[6i0d, 4hea, 6zjn, 6q8w, 3iam, 2ybb, 6ziy, 6zj...",NQO9_THET8;NQO15_THET8,"THERMUS THERMOPHILUS;THERMUS THERMOPHILUS,Ther...","3.0,2.0,8.0,9.0,4.0,5.0,1.0,6.0",4hea
15269,Chordopox_RPO7 Pox_RNA_pol_35,(Chordopox_RPO7)(Pox_RNA_pol_35)_35486.txt,PF05864_PF03396,1,1492.7250,98.0,1503.20,"[6rid, 6rfl, 6rie, 6ric]","B9U1G3_9POXV;B9U1R2_9POXV,RP07_VACCA;RP35_VACCA","Vaccinia virus;Vaccinia virus,Vaccinia virus G...","3.0,2.0,4.0,1.0",6rie
15270,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,1,311.3024,79.0,297.50,"[6pa1, 1im9, 3vh8, 4n8v, 1efx, 5t70, 7k81, 3wu...","KI2L2_HUMAN;HLAC_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,14.0,3.0,8.0,4.0,11.0,5.0,16.0,1.0,9...",5t70
15271,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,2,243.0559,30.0,231.25,"[1im9, 3vh8, 4n8v, 5t70, 7k81, 3wuw, 5t6z, 7k8...","HLAB_HUMAN;KI3L1_HUMAN,KI2L1_HUMAN;HLAC_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,14.0,3.0,4.0,11.0,5.0,16.0,9.0,15.0",5t70


In [35]:
df_max_Score_joined_filtered = df_max_Score_joined_filtered.copy()
df_max_Score_joined_filtered['PdbID'] = df_max_Score_joined_filtered['PdbID'].str.join(',')

In [36]:
df_max_Score_joined_filtered

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PdbID,UnpCode,Species,CFID,PDB_ID_max
0,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,"5oqm,5n9j","MED21_SCHPO;MED4_SCHPO,MED21_YEAST;MED4_YEAST",Saccharomyces cerevisiae (strain ATCC 204508 /...,"2.0,6.0",5n9j
1,PI3Ka SH2,(PI3Ka)(SH2)_13501.txt,PF00613_PF00017,1,382.7833,97.0,347.40,"5swo,5swp,5itd,5ul1,5xgh,4l1b,3hiz,5swr,4jps,4...",PK3CA_HUMAN;P85A_HUMAN,Homo sapiens;Homo sapiens,"3.0,2.0,4.0,1.0,6.0",5swr
2,Pox_Rap94 RNA_pol_Rpb2_7,(Pox_Rap94)(RNA_pol_Rpb2_7)_35453.txt,PF03294_PF04560,1,480.2750,99.0,527.35,"6rfl,6ric","Q1PIU7_9POXV;B9U1Q1_9POXV,B9U1I7_9POXV;B9U1Q1_...",Vaccinia virus GLV-1h68;Vaccinia virus GLV-1h68,"2.0,1.0",6rfl
3,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,1,396.3250,33.0,408.70,"6qi5,6z7n,5tx1,6b1t,3zif,6cgv,6yba","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,7.0,4.0,5.0,1.0,6.0",6cgv
4,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,2,298.4818,25.0,374.75,"6qi5,6z7n,5tx1,6b1t,6cgv,6yba","CAPSH_ADE05;CAP8_ADE05,B2ZX09_ADE41;B5SNS9_ADE...","Lizard adenovirus 2;Lizard adenovirus 2,Human ...","3.0,2.0,4.0,5.0,1.0,6.0",6cgv
...,...,...,...,...,...,...,...,...,...,...,...,...
15260,Fer4_7 NADH_Oxid_Nqo15,(Fer4_7)(NADH_Oxid_Nqo15)_8080.txt,PF12838_PF11497,1,323.8731,99.0,326.20,"6i0d,4hea,6zjn,6q8w,3iam,2ybb,6ziy,6zjl,6q8x,3...",NQO9_THET8;NQO15_THET8,"THERMUS THERMOPHILUS;THERMUS THERMOPHILUS,Ther...","3.0,2.0,8.0,9.0,4.0,5.0,1.0,6.0",4hea
15269,Chordopox_RPO7 Pox_RNA_pol_35,(Chordopox_RPO7)(Pox_RNA_pol_35)_35486.txt,PF05864_PF03396,1,1492.7250,98.0,1503.20,"6rid,6rfl,6rie,6ric","B9U1G3_9POXV;B9U1R2_9POXV,RP07_VACCA;RP35_VACCA","Vaccinia virus;Vaccinia virus,Vaccinia virus G...","3.0,2.0,4.0,1.0",6rie
15270,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,1,311.3024,79.0,297.50,"6pa1,1im9,3vh8,4n8v,1efx,5t70,7k81,3wuw,5t6z,7...","KI2L2_HUMAN;HLAC_HUMAN,HLAB_HUMAN;KI3L1_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,14.0,3.0,8.0,4.0,11.0,5.0,16.0,1.0,9...",5t70
15271,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,2,243.0559,30.0,231.25,"1im9,3vh8,4n8v,5t70,7k81,3wuw,5t6z,7k80,5b39,6...","HLAB_HUMAN;KI3L1_HUMAN,KI2L1_HUMAN;HLAC_HUMAN,...",Homo sapiens;Homo sapiens,"10.0,12.0,14.0,3.0,4.0,11.0,5.0,16.0,9.0,15.0",5t70


In [37]:
df_max_Score_joined_filtered.to_csv('/Users/johgeist/Downloads/3did_project_dataframes_new/ProtCID_parsed_files/Master_df_ProtCID_20231017.csv', index=False)

In [38]:
# Create another datarame with the same columns but holding oonly the counts of the ProtCID features as values, as
# numeric features will be needed later
df_counts = df_max_Score_joined_filtered.drop(labels= ['UnpCode', 'CFID', 'PdbID', 'Species'], axis = 1)
df_counts['UnpCode'] = df_max_Score_joined_filtered['UnpCode'].str.count(",") + 1
df_counts['PdbID'] = df_max_Score_joined_filtered['PdbID'].str.count(",") + 1
df_counts['CFID'] = df_max_Score_joined_filtered['CFID'].str.count(",") + 1

df_counts

Unnamed: 0,DomainNames,FileName,DDI_type,ClusterID,SurfaceArea,MinSeqIdentity,MediumSurfaceArea,PDB_ID_max,UnpCode,PdbID,CFID
0,Med21 Med4,(Med21)(Med4)_19935.txt,PF11221_PF10018,1,481.3500,27.0,506.75,5n9j,2,2,2
1,PI3Ka SH2,(PI3Ka)(SH2)_13501.txt,PF00613_PF00017,1,382.7833,97.0,347.40,5swr,1,30,5
2,Pox_Rap94 RNA_pol_Rpb2_7,(Pox_Rap94)(RNA_pol_Rpb2_7)_35453.txt,PF03294_PF04560,1,480.2750,99.0,527.35,6rfl,2,2,2
3,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,1,396.3250,33.0,408.70,6cgv,6,7,7
4,Adeno_hexon_C Adeno_PVIII,(Adeno_hexon_C)(Adeno_PVIII)_1493.txt,PF01310_PF03678,2,298.4818,25.0,374.75,6cgv,5,6,6
...,...,...,...,...,...,...,...,...,...,...,...
15260,Fer4_7 NADH_Oxid_Nqo15,(Fer4_7)(NADH_Oxid_Nqo15)_8080.txt,PF12838_PF11497,1,323.8731,99.0,326.20,4hea,1,14,8
15269,Chordopox_RPO7 Pox_RNA_pol_35,(Chordopox_RPO7)(Pox_RNA_pol_35)_35486.txt,PF05864_PF03396,1,1492.7250,98.0,1503.20,6rie,2,4,4
15270,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,1,311.3024,79.0,297.50,5t70,9,13,12
15271,ig MHC_I,(ig)(MHC_I)_18510.txt,PF00129_PF00047,2,243.0559,30.0,231.25,5t70,7,11,10


In [39]:
df_counts.to_csv('/Users/johgeist/Downloads/3did_project_dataframes_new/ProtCID_parsed_files/Master_counts_df_ProtCID_20231017.csv', index= False)