In [1]:
import pandas as pd
import numpy as np
# from rdkit import DataStructs
# from rdkit import Chem
# from rdkit.Chem import AllChem
import time
import random
import os
from os.path import join
# from cd_clustering import *

## 1. Loading data from TCDB and GOA database and merging them:

### (a) Loading database created from TCDB:

In [2]:
df_transporter = pd.read_csv("database.csv")
display(df_transporter.head(2))
print(len(df_transporter))
print("Number of different sequences: %s\
\nNumber of different InChI strings: %s\
\nNumber of different Uniprot IDs: %s" 
      % (len(set(list(df_transporter["Sequence"]))), len(set(list(df_transporter["InChI"]))),len(set(list(df_transporter["UniProt"]))) ))

Unnamed: 0,ChEBI,InChI,TCNumber,Name,UniProt,Sequence,Substrate
0,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.6,TransporterOS=SchistosomamansoniGN=Slc6a3PE=2SV=1,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,(R)-noradrenaline
1,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.2,SODIUM-DEPENDENTNORADRENALINETRANSPORTER(NOREP...,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,(R)-noradrenaline


11298
Number of different sequences: 6796
Number of different InChI strings: 1051
Number of different Uniprot IDs: 6797


### (b) Loading database created from the GOA database:

In [3]:
df_transporter_GOA = pd.read_pickle("df_GOA_Transporter_exp.pkl")
display(df_transporter.head(2))
print(len(df_transporter_GOA))
print("Number of different KEGG IDs: %s\
\nNumber of different Uniprot IDs: %s" 
      % ( len(set(list(df_transporter_GOA["molecule ID"]))),len(set(list(df_transporter_GOA["Uniprot ID"]))) ))

Unnamed: 0,ChEBI,InChI,TCNumber,Name,UniProt,Sequence,Substrate
0,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.6,TransporterOS=SchistosomamansoniGN=Slc6a3PE=2SV=1,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,(R)-noradrenaline
1,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.2,SODIUM-DEPENDENTNORADRENALINETRANSPORTER(NOREP...,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,(R)-noradrenaline


4613
Number of different KEGG IDs: 273
Number of different Uniprot IDs: 3300


### (c) Checking how many Uniprot IDs are in both datasets:

In [4]:
GOA_UIDs = list(set(df_transporter_GOA["Uniprot ID"]))
TCDB_UIDs = list(set(df_transporter["UniProt"]))
duplicated_UIDs = []

for UID in GOA_UIDs:
    if UID in TCDB_UIDs:
        duplicated_UIDs.append(UID)
len(duplicated_UIDs)

934

### (d) Mapping InChI strings to ECFP vectors and KEGG IDs to ECFP vectors:

In [5]:
df_transporter

Unnamed: 0,ChEBI,InChI,TCNumber,Name,UniProt,Sequence,Substrate
0,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.6,TransporterOS=SchistosomamansoniGN=Slc6a3PE=2SV=1,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,(R)-noradrenaline
1,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.2,SODIUM-DEPENDENTNORADRENALINETRANSPORTER(NOREP...,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,(R)-noradrenaline
2,CHEBI:10008,InChI=1S/C27H44O/c1-19(2)8-6-9-21(4)25-15-16-2...,9.B.208.1.1,VitaminD3receptorOS=HomosapiensGN=VDRPE=1SV=1,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,calciol
3,CHEBI:10022,"InChI=1S/C15H20O6/c1-7-3-9-14(5-16,11(19)10(7)...",2.A.1.3.84,TrichotheceneeffluxpumpOS=GibberellazeaeOX=551...,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,Vomitoxin
4,CHEBI:10023,InChI=1S/C16H14F3N5O/c1-10(15-14(19)5-20-7-22-...,3.A.1.205.32,ABCmultidrugtransporterMDR3OS=Trichophytonrubr...,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,voriconazole
...,...,...,...,...,...,...,...
11293,CHEBI:9948,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",3.A.1.208.3,OligomycinresistanceATP-dependentpermeaseYOR1-...,P53049,MTITVGDAVSETELENKSQNVVLSPKASASSDISTDVDKDTSSSWD...,verapamil
11294,CHEBI:9948,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",3.A.1.106.2,PutativemultidrugexportATP-binding/permeasepro...,Q2G2M9,MIKRYLQFVKPYKYRIFATIIVGIIKFGIPMLIPLLIKYAIDGVIN...,verapamil
11295,CHEBI:9983,InChI=1S/C46H58N4O9/c1-8-42(54)23-28-24-45(40(...,3.A.1.106.2,PutativemultidrugexportATP-binding/permeasepro...,Q2G2M9,MIKRYLQFVKPYKYRIFATIIVGIIKFGIPMLIPLLIKYAIDGVIN...,vincaleukoblastine
11296,CHEBI:9987,InChI=1S/C46H56N4O10/c1-7-42(55)22-28-23-45(40...,3.A.1.208.1,Multidrugresistance-associatedprotein6(MRP-lik...,O88269,MNGEHSMATPGESCAGLRVWNQTEQEPVAYHLLNLCFLRAAGSWVP...,vincristine


In [6]:
df_transporter["ECFP"] = ""
for ind in df_transporter.index:
    mol = Chem.inchi.MolFromInchi(df_transporter["InChI"][ind])
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()
    df_transporter["ECFP"][ind] = ecfp

In [7]:
mol_folder ="C:\\Users\\alexk\\substrateprediction-main\\data\\mol-files"

df_transporter_GOA["ECFP"] = ""
for ind in df_transporter_GOA.index:
    try:
        mol = Chem.MolFromMolFile(join(mol_folder, df_transporter_GOA["molecule ID"][ind] + '.mol'))
        ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024).ToBitString()
        df_transporter_GOA["ECFP"][ind] = ecfp
    except:
        pass
    
#Remove all entries without ECFP:
df_transporter_GOA = df_transporter_GOA.loc[df_transporter_GOA["ECFP"] != ""]

### (e) Merging both datasets:

In [8]:
df_transporter["KEGG ID"] = np.nan

for ind in df_transporter_GOA.index:
    seq, uid = df_transporter_GOA["Sequence"][ind], df_transporter_GOA["Uniprot ID"][ind]
    ecfp, kegg_id = df_transporter_GOA["ECFP"][ind], df_transporter_GOA["molecule ID"][ind]
    #check if combination of ECFP and Sequence is already in TCDB:
    help_df = df_transporter.loc[df_transporter["Sequence"] == seq].loc[df_transporter["ECFP"] == ecfp]
    if len(help_df) == 0:
        df_transporter = df_transporter.append({"UniProt" : uid, "Sequence" : seq, "KEGG ID" : kegg_id, "ECFP": ecfp},
                             ignore_index = True)
df_transporter

Unnamed: 0,ChEBI,InChI,TCNumber,Name,UniProt,Sequence,Substrate,ECFP,KEGG ID
0,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.6,TransporterOS=SchistosomamansoniGN=Slc6a3PE=2SV=1,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,(R)-noradrenaline,0100000000000000001000000000000000000000000000...,
1,CHEBI:1,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,2.A.22.1.2,SODIUM-DEPENDENTNORADRENALINETRANSPORTER(NOREP...,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,(R)-noradrenaline,0100000000000000001000000000000000000000000000...,
2,CHEBI:10008,InChI=1S/C27H44O/c1-19(2)8-6-9-21(4)25-15-16-2...,9.B.208.1.1,VitaminD3receptorOS=HomosapiensGN=VDRPE=1SV=1,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,calciol,0100100010000000000000000000000001011000010000...,
3,CHEBI:10022,"InChI=1S/C15H20O6/c1-7-3-9-14(5-16,11(19)10(7)...",2.A.1.3.84,TrichotheceneeffluxpumpOS=GibberellazeaeOX=551...,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,Vomitoxin,0100000000000000000000100000000001001000000000...,
4,CHEBI:10023,InChI=1S/C16H14F3N5O/c1-10(15-14(19)5-20-7-22-...,3.A.1.205.32,ABCmultidrugtransporterMDR3OS=Trichophytonrubr...,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,voriconazole,1100000000000000000000000000000001000000000100...,
...,...,...,...,...,...,...,...,...,...
14812,,,,,P02693,MAFDGTWKVDRNENYEKFMEKMGINVVKRKLGAHDNLKLTITQEGN...,,0000000000000000000000000000000000000000000000...,C00162
14813,,,,,P35396,MEQPQEETPEAREEEKEEVAMGDGAPELNGGPEHTLPSSSCADLSQ...,,0000000000000000000000000000000000000000000000...,C00162
14814,,,,,Q0GMA8,MGPPYSDLRESDEDRPAEAVGSVSGSRNALQPLPGEDDEEPFTTYF...,,0000000000000000000000000000000000000000000000...,C19610
14815,,,,,Q84W56,MMKPASLQGFSSHASSSIYSDVRRPATTPSKMAAFSALSLCPYTFT...,,0000000000000000000000000000000000000000000000...,C00954


In [9]:
df_transporter.to_csv("database_TCDB_and_GOA.csv", index = False)

In [10]:
df_transporter = pd.read_csv("database_TCDB_and_GOA.csv")

## 2.Spitting dataset into training and test set:
We want to make sure that proteins in the training set and in the test set are not very similar. To be more explicit: There should exist no protein in the training set with a sequence identity score >80% compared to and protein in the test set.

Getting input for cd-hit algorithm:

In [11]:
df_Uniprot = pd.DataFrame({"Uniprot ID" : df_transporter["UniProt"], "Sequence" : df_transporter["Sequence"]})
df_Uniprot.drop_duplicates(inplace = True)
df_Uniprot.reset_index(inplace = True)
df_Uniprot

Unnamed: 0,index,Uniprot ID,Sequence
0,0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...
1,1,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...
2,2,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...
3,3,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...
4,4,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...
...,...,...,...
8971,14810,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...
8972,14811,P00505,MALLHSGRVLPGIAAAFHPGLAAAASARASSWWTHVEMGPPDPILG...
8973,14813,P35396,MEQPQEETPEAREEEKEEVAMGDGAPELNGGPEHTLPSSSCADLSQ...
8974,14814,Q0GMA8,MGPPYSDLRESDEDRPAEAVGSVSGSRNALQPLPGEDDEEPFTTYF...


In [16]:
ofile = open(join(".", "protein_data", 'clusters', "all_sequences.fasta"), "w")
for ind in df_Uniprot.index:
    seq = df_Uniprot["Sequence"][ind]
    if not pd.isnull(seq):
        seq_end = seq.find("#")
        seq = seq[:seq_end]
        ofile.write(">" + str(ind) + "\n" + seq  + "\n")
ofile.close()

In [17]:
df_Uniprot

# cluster the fasta files
cluster_folder = join(".", "protein_data", 'clusters')
start_folder = cluster_folder
cluster_all_levels(start_folder, 
                   cluster_folder, 
                   filename='all_sequences')

cd-hit -i .\protein_data\clusters\all_sequences.fasta -o .\protein_data\clusters\all_sequences_clustered_sequences_100.fasta -c 1.0 -n 5 -T 1 -M 2000 -d 0
cd-hit -i .\protein_data\clusters\all_sequences_clustered_sequences_100.fasta -o .\protein_data\clusters\all_sequences_clustered_sequences_90.fasta -c 0.9 -n 5 -T 1 -M 2000 -d 0
cd-hit -i .\protein_data\clusters\all_sequences_clustered_sequences_90.fasta -o .\protein_data\clusters\all_sequences_clustered_sequences_80.fasta -c 0.8 -n 5 -T 1 -M 2000 -d 0
cd-hit -i .\protein_data\clusters\all_sequences_clustered_sequences_80.fasta -o .\protein_data\clusters\all_sequences_clustered_sequences_70.fasta -c 0.7 -n 5 -T 1 -M 2000 -d 0
cd-hit -i .\protein_data\clusters\all_sequences_clustered_sequences_70.fasta -o .\protein_data\clusters\all_sequences_clustered_sequences_60.fasta -c 0.6 -n 4 -T 1 -M 2000 -d 0
cd-hit -i .\protein_data\clusters\all_sequences_clustered_sequences_60.fasta -o .\protein_data\clusters\all_sequences_clustered_sequence

In [18]:
###We first cluster in such a way that two different clusters do not contain two enzymes
###with a sequence identity higher than 80%:

# collect cluster members
df_80 = find_cluster_members_80(folder=cluster_folder, 
                          filename='all_sequences')

display(df_80.describe())
display(df_80.head())
display(df_80.tail())



###We first cluster in such a way that two different clusters do not contain two enzymes
###with a sequence identity higher than 60%:

cluster_all_levels_60(start_folder, 
                   cluster_folder, 
                   filename='all_sequences')

# collect cluster members
df_60 = find_cluster_members_60(folder=cluster_folder, 
                       filename='all_sequences')
display(df_60.describe())


###We first cluster in such a way that two different clusters do not contain two enzymes
###with a sequence identity higher than 40%:

# cluster the fasta files
cluster_all_levels(start_folder, 
                   cluster_folder, 
                   filename='all_sequences')

# collect cluster members
df_40 = find_cluster_members(folder=cluster_folder, 
                          filename='all_sequences')

display(df_40.describe())

Unnamed: 0,cluster
count,8920.0
mean,3688.330605
std,2244.383812
min,0.0
25%,1713.75
50%,3607.5
75%,5614.25
max,7718.0


Unnamed: 0,cluster,member
0,0,3297
1,0,3317
2,0,2617
3,1,2646
4,2,3312


Unnamed: 0,cluster,member
8915,7714,6220
8916,7715,5195
8917,7716,5103
8918,7717,3928
8919,7718,3190


Unnamed: 0,cluster
count,8920.0
mean,3178.513453
std,1978.947702
min,0.0
25%,1435.75
50%,3066.5
75%,4865.25
max,6815.0


Unnamed: 0,cluster
count,8920.0
mean,2244.081951
std,1465.087092
min,0.0
25%,952.0
50%,2097.0
75%,3455.25
max,5080.0


#### Splitting the dataset in train, validation and test set with a sequence identity cutoff of 80%. Later, we divide the test set in three subparts with identity cutoffs of <40%, 40-60% and 60-80%

In [19]:
df_Uniprot["cluster"] = np.nan
for ind in df_80.index:
    member = int(df_80["member"][ind])
    cluster = df_80["cluster"][ind]
    df_Uniprot["cluster"][member] = cluster

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [20]:
clusters = list(set(df_Uniprot["cluster"]))
random.seed(1)
random.shuffle(clusters)
print(len(clusters))

n = int(len(clusters)*0.8)
train_clusters = clusters[:n]
test_clusters = clusters[n:]

training_UIDs = df_Uniprot["Uniprot ID"].loc[df_Uniprot["cluster"].isin(train_clusters)]
test_UIDs = df_Uniprot["Uniprot ID"].loc[df_Uniprot["cluster"].isin(test_clusters)]

df_80["split"] = np.nan
df_80["split"].loc[df_80["cluster"].isin(train_clusters)] = "train"
df_80["split"].loc[df_80["cluster"].isin(test_clusters)] = "test"

train_members = list(df_80["member"].loc[df_80["split"] == "train"])
test_members = list(df_80["member"].loc[df_80["split"] == "test"])

df_60["split"] = np.nan
df_40["split"] = np.nan
df_60["split"].loc[df_60["member"].isin(train_members)] = "train"
df_60["split"].loc[df_60["member"].isin(test_members)] = "test"
df_40["split"].loc[df_40["member"].isin(train_members)] = "train"
df_40["split"].loc[df_40["member"].isin(test_members)] = "test"

len(training_UIDs), len(test_UIDs)

7775


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


(7182, 1850)

In [21]:
df_UID_MID_train = df_transporter.loc[df_transporter["UniProt"].isin(training_UIDs)]
df_UID_MID_test = df_transporter.loc[df_transporter["UniProt"].isin(test_UIDs)]
len(df_UID_MID_test), len(df_UID_MID_train)

(3066, 11875)

Calculating for every sequence in the validation and test set the maximum accuracy compared to sequences in the training set:

In [22]:
df_80["identity"] = np.nan
df_80["identity"].loc[df_80["split"].isin(["test"])] =  "60-80%"

test_indices = list(df_80.loc[~pd.isnull(df_80["identity"])].index)


for ind in test_indices:

    member = df_80["member"][ind]
    cluster = list(df_40["cluster"].loc[df_40["member"] == member])[0]
    cluster_splits = list(df_40["split"].loc[df_40["cluster"] == cluster])
    if not "train" in cluster_splits:
        df_80["identity"][ind] = "<40%"
    else:
        cluster = list(df_60["cluster"].loc[df_60["member"] == member])[0]
        cluster_splits = list(df_60["split"].loc[df_60["cluster"] == cluster])
        if not "train" in cluster_splits:
            df_80["identity"][ind] = "40-60%"
            
    if ind % 1000 == 0:
        print(ind)
                    
                    
ind = 0
df_Uniprot["identity"] = np.nan
for ind in df_Uniprot.index:
    try:
        df_Uniprot["identity"][ind] = list(df_80["identity"].loc[df_80["member"] == str(ind)])[0]
    except:
        None
        
df_Uniprot.to_pickle(join(".", "protein_data", "Uniprot_df_with_seq_identities.pkl"))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.py

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.py

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.py

6000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.py

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.py

8000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gui

In [23]:
df_Uniprot.loc[~pd.isnull(df_Uniprot["identity"])]

Unnamed: 0,index,Uniprot ID,Sequence,cluster,identity
0,0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,1134.0,40-60%
5,5,Q8U2X0,MSEKTTKGVQLLRGDPKKAIVRLSIPMMIGMSVQTLYNLADGIWVS...,3191.0,<40%
12,12,Q9ZPR7,MYMIESKGGAIACMLLALLFLGTWPAIMTLTERRGRLPQHTYLDYT...,4103.0,60-80%
14,14,P48777,MDGPDQIGPDVRPRRTFGDRVRRAARAFTTRDGLIGDYDYGFLFTP...,1910.0,60-80%
16,16,Q93Z75,MMIAQELGIYVVESKGGAILCLLLSLLCLGTWPALMALLERRGRLP...,3826.0,60-80%
...,...,...,...,...,...
8953,14785,Q8W4P8,MDPTMNPTPTPSSAGNSVCTDELTNLPPEDSPLDSEKDDSVDFSQE...,1701.0,<40%
8965,14800,Q9SJP9,MNSKNRINNVGEGVDIEIPDTAHQISSDSWFQAAFVLTTSINSAYV...,3447.0,<40%
8966,14801,Q9V471,MSRNEDTPIAKRDSGRTRRSNYGTAPSFHLMEQGQPGVNVVAGNGN...,3052.0,<40%
8967,14802,Q9VFN7,MKLSLGLFVIFAALIGFTSSTDVSQCPKSKSKALAAGDVSISNCPK...,6995.0,<40%


## 3.Sampling negative data points:

In [24]:
df_Uniprot = pd.read_pickle(join(".", "protein_data", "Uniprot_df_with_seq_identities.pkl"))

### Find all small molcules (mostly ions):

In [25]:
small_molecules = []
small_ecfps = []

for ind in df_transporter.index:
    ecfp = df_transporter["ECFP"][ind]
    if sum(np.array(list(ecfp)).astype(int)) <=2:
        if not pd.isnull(df_transporter["InChI"][ind]):
            print(df_transporter["InChI"][ind])
            small_molecules.append(df_transporter["InChI"][ind])
            small_ecfps.append(ecfp)
        else:
            print(df_transporter["KEGG ID"][ind])
            small_molecules.append(df_transporter["KEGG ID"][ind])
            small_ecfps.append(ecfp)
small_molecules = list(set(small_molecules))

InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S/Zn/q+2
InChI=1S

InChI=1S/ClH.Na/h1H;/q;+1/p-1
InChI=1S/Mn/q+2
InChI=1S/Mn/q+2
InChI=1S/Mn/q+2
InChI=1S/Mn/q+2
InChI=1S/Hg/q+2
InChI=1S/Hg/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/Ni/q+2
InChI=1S/N

InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S/Ca/q+2
InChI=1S

InChI=1S/Ca/q+2
InChI=1S/Cd/q+2
InChI=1S/Co/q+2
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Ag/q+1
InChI=1S/Al/q+3
InChI=1S/Al/q+3
InChI=1S/Al/q+3
InChI=1S/Al/q+3
InChI=1S/Al/q+3
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S/Cu/q+1
InChI=1S

InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1
InChI=1S/p+1

InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H2O/h1H2
InChI=1S/H

InChI=1S/H2Se/h1H2
InChI=1S/H2Se/h1H2
InChI=1S/H2Se/h1H2
InChI=1S/Se
InChI=1S/Se
InChI=1S/Si
InChI=1S/Si
InChI=1S/Si
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InChI=1S/Na/q+1
InC

C00698
C00698
C00238
C00238
C00698
C00238
C00238
C00238
C00238
C00238
C00238
C00238
C00238
C00238
C00238
C00238
C00698
C00698
C00238
C00238
C00238
C00076
C00076
C00238
C00238
C00698
C00076
C00698
C00076
C00001
C01330
C00001
C01330
C00001
C01330
C00001
C00001
C00001
C00698
C00698
C00001
C00038
C00238
C01330
C01342
C00038
C01330
C01330
C01330
C01330
C01330
C01330
C00001
C01330
C00076
C00238
C00238
C00038
C00238
C00001
C00238
C00001
C00001
C01342
C01330
C00076
C00076
C00238
C00698
C00001
C00238
C00238
C00238
C00080
C00238
C00698
C01330
C00076
C00001
C00001
C00001
C00001
C00001
C00001
C00001
C00238
C00001
C00001
C00698
C00698
C00238
C00238
C00238
C00698
C00076
C00076
C00076
C00238
C00076
C00698
C00001
C00001
C00698
C00698
C00001
C00076
C00001
C00001
C00076
C00698
C00238
C00080
C00076
C00076
C00076
C00076
C00076
C00238
C01330
C00038
C00080
C01342
C01342
C00076
C00034
C00076
C19610
C00698
C00238
C00238
C01330
C00698
C00238
C00034
C00076
C19610
C00238
C00698
C00238
C00698
C00001
C00238
C00034

In [26]:
mol_folder ="C:\\Users\\alexk\\substrateprediction-main\\data\\mol-files\\"

def get_mol(met_ID):
    is_InChI = (met_ID[0:5] == "InChI")  
    if is_InChI:
        try:
            mol = Chem.inchi.MolFromInchi(met_ID)
        except:
            mol = None
        
    else:
        try:
            mol = Chem.MolFromMolFile(mol_folder + met_ID + '.mol')
        except OSError:
            mol = None
            
    return(mol)

def drop_samples_without_mol_file(df):
    droplist = []
    for ind in df.index:
        if get_mol(met_ID = df["molecule ID"][ind]) is None:
            droplist.append(ind)

    df.drop(droplist, inplace = True)
    return(df)

def get_metabolites_and_similarities(df):
    df_metabolites = pd.DataFrame(data = {"ECFP": df["ECFP"], "ID": df["molecule ID"]})
    df_metabolites = df_metabolites.drop_duplicates()
    df_metabolites.reset_index(inplace = True, drop = True)


    ms = [get_mol(met_ID = df_metabolites["ID"][ind]) for ind in df_metabolites.index]
    fps = [Chem.RDKFingerprint(x) for x in ms]

    similarity_matrix = np.zeros((len(ms), len(ms)))
    for i in range(len(ms)):
        for j in range(len(ms)):
            similarity_matrix[i,j] = DataStructs.FingerprintSimilarity(fps[i],fps[j])
            
    return(df_metabolites, similarity_matrix)



def get_valid_list(met_ID, UID, forbidden_metabolites, df_metabolites, similarity_matrix, lower_bound =0.7, upper_bound =0.9):
    binding_met_IDs = list(df_transporter["molecule ID"].loc[df_transporter["UniProt"] == UID])
    k = df_metabolites.loc[df_metabolites["ID"] == met_ID].index[0]

    similarities = similarity_matrix[k,:]
    selection = (similarities< upper_bound) * (similarities >lower_bound) 
    metabolites = list(df_metabolites["ID"].loc[selection])
    
    no_mets = list(set(binding_met_IDs + forbidden_metabolites))
    
    metabolites = [met for met in metabolites if (met not in no_mets)]
    return(metabolites)

def get_valid_list_small(met_ID, UID, allowed_small_molecules):
    binding_met_IDs = list(df_transporter["molecule ID"].loc[df_transporter["UniProt"] == UID])
    metabolites = [met for met in allowed_small_molecules if (met not in binding_met_IDs)]
    return(metabolites)


def create_negative_samples(df, df_metabolites, similarity_matrix):
    start = time.time()
    UID_list = []
    MID_list = []
    forbidden_mets = []

    for ind in df.index:
        if ind % 100 ==0:
            print(ind)
            print("Time: %s [min]" % np.round(float((time.time()-start)/60),2))

            df2 = pd.DataFrame(data = {"Uniprot ID": UID_list, "molecule ID" : MID_list})
            df2["outcome"] = 0
            df = pd.concat([df, df2], ignore_index=True)

            UID_list, MID_list = [], []

            forbidden_mets_old = forbidden_mets.copy()
            all_mets = list(set(df["molecule ID"]))
            all_mets = [met for met in all_mets if not met in forbidden_mets_old]
            forbidden_mets = list(set([met for met in all_mets if 
                                       (np.mean(df["outcome"].loc[df["molecule ID"] == met]) < 1/4)]))
            forbidden_mets = forbidden_mets + forbidden_mets_old
            print(len(forbidden_mets))

        UID = df["Uniprot ID"][ind]
        met_ID = df["molecule ID"][ind]

        metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                     df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                     lower_bound =0.7, upper_bound =0.95)
        lower_bound = 0.7
        while len(metabolites) < 2:
            lower_bound = lower_bound - 0.2
            metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                     df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                     lower_bound =lower_bound, upper_bound =0.95)
            if lower_bound <0:
                break
        
        new_metabolites =  random.sample(metabolites, min(3,len(metabolites)))

        for met in new_metabolites:
            UID_list.append(UID), MID_list.append(met)

    df2 = pd.DataFrame(data = {"Uniprot ID": UID_list, "molecule ID" : MID_list})
    df2["outcome"] = 0

    df = pd.concat([df, df2], ignore_index = True)
    return(df)

def create_negative_samples_V2(df, df_metabolites, similarity_matrix):
    start = time.time()
    UID_list = []
    MID_list = []
    forbidden_mets = []

    for ind in df.index:
        if ind % 100 ==0:
            print(ind)
            print("Time: %s [min]" % np.round(float((time.time()-start)/60),2))

            df2 = pd.DataFrame(data = {"Uniprot ID": UID_list, "molecule ID" : MID_list})
            df2["outcome"] = 0
            df = pd.concat([df, df2], ignore_index=True)

            UID_list, MID_list = [], []

            forbidden_mets_old = forbidden_mets.copy()
            all_mets = list(set(df["molecule ID"]))
            all_mets = [met for met in all_mets if not met in forbidden_mets_old]
            forbidden_mets = list(set([met for met in all_mets if 
                                       (np.mean(df["outcome"].loc[df["molecule ID"] == met]) < 1/4)]))
            forbidden_mets = forbidden_mets + forbidden_mets_old
            print(len(forbidden_mets))
            
            allowed_small_molecules = [mol for mol in small_molecules if not mol in forbidden_mets]
            print(len(allowed_small_molecules))

        UID = df["Uniprot ID"][ind]
        met_ID = df["molecule ID"][ind]
        
        #check if we have small molecule:
        if met_ID in small_molecules:
            #sample 2 small molecules:
            metabolites = get_valid_list_small(met_ID = met_ID, UID = UID, allowed_small_molecules = allowed_small_molecules)
            n = min(2,len(metabolites))
            new_metabolites =  random.sample(metabolites, n)
            
            #smale one bigger molecule:
            metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                         df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                         lower_bound =-0.1, upper_bound =0.95)
            new_metabolites = new_metabolites + list(random.sample(metabolites, 3-n))
            
            for met in new_metabolites:
                UID_list.append(UID), MID_list.append(met)
            
            
        else:
            metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                         df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                         lower_bound =0.7, upper_bound =0.95)
            lower_bound = 0.7
            while len(metabolites) < 2:
                lower_bound = lower_bound - 0.2
                metabolites = get_valid_list(met_ID = met_ID, UID = UID, forbidden_metabolites= forbidden_mets,
                                         df_metabolites = df_metabolites, similarity_matrix = similarity_matrix,
                                         lower_bound =lower_bound, upper_bound =0.95)
                if lower_bound <0:
                    break

            new_metabolites =  random.sample(metabolites, min(3,len(metabolites)))

            for met in new_metabolites:
                UID_list.append(UID), MID_list.append(met)

    df2 = pd.DataFrame(data = {"Uniprot ID": UID_list, "molecule ID" : MID_list})
    df2["outcome"] = 0

    df = pd.concat([df, df2], ignore_index = True)
    return(df)

In [27]:
df_UID_MID_train = df_transporter.loc[df_transporter["UniProt"].isin(training_UIDs)]
df_UID_MID_test = df_transporter.loc[df_transporter["UniProt"].isin(test_UIDs)]
len(df_UID_MID_test), len(df_UID_MID_train)

(3066, 11875)

In [28]:
df_transporter["molecule ID"] = [df_transporter["InChI"][ind] if not pd.isnull(df_transporter["InChI"][ind])
                                      else df_transporter["KEGG ID"][ind] for ind in df_transporter.index]



df_UID_MID_train["molecule ID"] = [df_UID_MID_train["InChI"][ind] if not pd.isnull(df_UID_MID_train["InChI"][ind])
                                      else df_UID_MID_train["KEGG ID"][ind] for ind in df_UID_MID_train.index]

df_UID_MID_train = pd.DataFrame({"Uniprot ID" : df_UID_MID_train["UniProt"],
                   "Sequence" : df_UID_MID_train["Sequence"],
                  "molecule ID" : df_UID_MID_train["molecule ID"],
                  "ECFP" : df_UID_MID_train["ECFP"],
                  "outcome" : 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


#### Creating negative data points for the training set (experimental evidence):

In [29]:
df_UID_MID_train = drop_samples_without_mol_file(df = df_UID_MID_train)
#calculating similarity matrix for all metabolites in the set:
df_metabolites_train, similarity_matrix_train = get_metabolites_and_similarities(df = df_UID_MID_train)
print(len(df_metabolites_train))

df_UID_MID_train.reset_index(inplace = True, drop = True)

df_UID_MID_train = create_negative_samples_V2(df = df_UID_MID_train, df_metabolites = df_metabolites_train,
                                          similarity_matrix = similarity_matrix_train)
df_UID_MID_train

1201
0
Time: 0.0 [min]
0
81
100
Time: 0.02 [min]
6
75
200
Time: 0.04 [min]
14
67
300
Time: 0.05 [min]
22
67
400
Time: 0.07 [min]
31
67
500
Time: 0.09 [min]
50
65
600
Time: 0.11 [min]
64
65
700
Time: 0.13 [min]
79
65
800
Time: 0.15 [min]
97
64
900
Time: 0.17 [min]
115
64
1000
Time: 0.19 [min]
126
64
1100
Time: 0.21 [min]
140
60
1200
Time: 0.23 [min]
145
56
1300
Time: 0.25 [min]
158
54
1400
Time: 0.27 [min]
179
54
1500
Time: 0.29 [min]
192
50
1600
Time: 0.31 [min]
206
50
1700
Time: 0.33 [min]
218
47
1800
Time: 0.36 [min]
226
45
1900
Time: 0.38 [min]
233
45
2000
Time: 0.4 [min]
247
44
2100
Time: 0.42 [min]
258
43
2200
Time: 0.45 [min]
268
40
2300
Time: 0.47 [min]
282
40
2400
Time: 0.49 [min]
291
40
2500
Time: 0.51 [min]
295
40
2600
Time: 0.53 [min]
307
40
2700
Time: 0.55 [min]
332
39
2800
Time: 0.58 [min]
339
38
2900
Time: 0.6 [min]
343
38
3000
Time: 0.63 [min]
349
37
3100
Time: 0.66 [min]
357
36
3200
Time: 0.68 [min]
377
34
3300
Time: 0.71 [min]
385
33
3400
Time: 0.73 [min]
399
33
3500
T

Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome
0,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1
1,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,InChI=1S/C27H44O/c1-19(2)8-6-9-21(4)25-15-16-2...,0100100010000000000000000000000001011000010000...,1
2,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,"InChI=1S/C15H20O6/c1-7-3-9-14(5-16,11(19)10(7)...",0100000000000000000000100000000001001000000000...,1
3,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,InChI=1S/C16H14F3N5O/c1-10(15-14(19)5-20-7-22-...,1100000000000000000000000000000001000000000100...,1
4,WP_068464567.1,MKIKDWNRSLKVRLVGEFFMNTSFWMVFPFLAIYFAEEFGKGLAGM...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1
...,...,...,...,...,...
45792,Q84W56,,InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h...,,0
45793,Q84W56,,InChI=1S/C15H31N3O13P2/c16-13-1-7(20)11(28-13)...,,0
45794,Q9CAT6,,InChI=1S/NO3/c2-1(3)4/q-1,,0
45795,Q9CAT6,,"InChI=1S/CH4N2O/c2-1(3)4/h(H4,2,3,4)",,0


#### Creating negative data points for the test set

In [30]:
df_UID_MID_test["molecule ID"] = [df_UID_MID_test["InChI"][ind] if not pd.isnull(df_UID_MID_test["InChI"][ind])
                                      else df_UID_MID_test["KEGG ID"][ind] for ind in df_UID_MID_test.index]

df_UID_MID_test = pd.DataFrame({"Uniprot ID" : df_UID_MID_test["UniProt"],
                   "Sequence" : df_UID_MID_test["Sequence"],
                  "molecule ID" : df_UID_MID_test["molecule ID"],
                  "ECFP" : df_UID_MID_test["ECFP"],
                  "outcome" : 1})


df_UID_MID_test = drop_samples_without_mol_file(df = df_UID_MID_test)
#calculating similarity matrix for all metabolites in the set:
df_metabolites_test, similarity_matrix_test = get_metabolites_and_similarities(df = df_UID_MID_test)
print(len(df_metabolites_test))

df_UID_MID_test.reset_index(inplace = True, drop = True)

df_UID_MID_test = create_negative_samples_V2(df = df_UID_MID_test, df_metabolites = df_metabolites_test,
                                          similarity_matrix = similarity_matrix_test)
df_UID_MID_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


678
0
Time: 0.0 [min]
0
81
100
Time: 0.01 [min]
18
67
200
Time: 0.02 [min]
31
65
300
Time: 0.02 [min]
45
64
400
Time: 0.03 [min]
65
61
500
Time: 0.04 [min]
90
55
600
Time: 0.05 [min]
109
48
700
Time: 0.06 [min]
127
46
800
Time: 0.07 [min]
138
40
900
Time: 0.08 [min]
165
33
1000
Time: 0.09 [min]
192
32
1100
Time: 0.1 [min]
209
32
1200
Time: 0.1 [min]
233
32
1300
Time: 0.11 [min]
256
29
1400
Time: 0.12 [min]
273
29
1500
Time: 0.13 [min]
291
25
1600
Time: 0.14 [min]
314
25
1700
Time: 0.15 [min]
341
24
1800
Time: 0.16 [min]
373
24
1900
Time: 0.17 [min]
396
23
2000
Time: 0.18 [min]
424
20
2100
Time: 0.19 [min]
444
16
2200
Time: 0.2 [min]
468
15
2300
Time: 0.2 [min]
488
15
2400
Time: 0.21 [min]
510
13
2500
Time: 0.22 [min]
530
11
2600
Time: 0.23 [min]
553
11
2700
Time: 0.24 [min]
569
11
2800
Time: 0.25 [min]
596
11
2900
Time: 0.26 [min]
615
11
3000
Time: 0.27 [min]
628
11


Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome
0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1
1,Q8U2X0,MSEKTTKGVQLLRGDPKKAIVRLSIPMMIGMSVQTLYNLADGIWVS...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1
2,Q9ZPR7,MYMIESKGGAIACMLLALLFLGTWPAIMTLTERRGRLPQHTYLDYT...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
3,P48777,MDGPDQIGPDVRPRRTFGDRVRRAARAFTTRDGLIGDYDYGFLFTP...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
4,Q93Z75,MMIAQELGIYVVESKGGAILCLLLSLLCLGTWPALMALLERRGRLP...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
...,...,...,...,...,...
11866,Q9X1G6,,InChI=1S/C15H31N3O13P2/c16-13-1-7(20)11(28-13)...,,0
11867,Q9X1G6,,InChI=1S/C9H11NO3/c10-8(9(12)13)5-6-1-3-7(11)4...,,0
11868,O61967,,C00195,,0
11869,O61967,,"InChI=1S/C9H17NO5/c1-9(2,5-11)7(14)8(15)10-4-3...",,0


Adding ECFPs and Sequence for all newly added data points:

In [31]:
for ind in df_UID_MID_train.index:
    if df_UID_MID_train["outcome"][ind] == 0:
        UID, met_ID = df_UID_MID_train["Uniprot ID"][ind], df_UID_MID_train["molecule ID"][ind]
        df_UID_MID_train["Sequence"][ind] = list(df_transporter["Sequence"].loc[df_transporter["UniProt"] == UID])[0]
        df_UID_MID_train["ECFP"][ind] = list(df_transporter["ECFP"].loc[df_transporter["molecule ID"] == met_ID])[0]
df_UID_MID_train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome
0,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1
1,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,InChI=1S/C27H44O/c1-19(2)8-6-9-21(4)25-15-16-2...,0100100010000000000000000000000001011000010000...,1
2,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,"InChI=1S/C15H20O6/c1-7-3-9-14(5-16,11(19)10(7)...",0100000000000000000000100000000001001000000000...,1
3,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,InChI=1S/C16H14F3N5O/c1-10(15-14(19)5-20-7-22-...,1100000000000000000000000000000001000000000100...,1
4,WP_068464567.1,MKIKDWNRSLKVRLVGEFFMNTSFWMVFPFLAIYFAEEFGKGLAGM...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1
...,...,...,...,...,...
45792,Q84W56,MMKPASLQGFSSHASSSIYSDVRRPATTPSKMAAFSALSLCPYTFT...,InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h...,0100000000000000000000000000000000000000000000...,0
45793,Q84W56,MMKPASLQGFSSHASSSIYSDVRRPATTPSKMAAFSALSLCPYTFT...,InChI=1S/C15H31N3O13P2/c16-13-1-7(20)11(28-13)...,0000000000000000000000000000000000000000000000...,0
45794,Q9CAT6,MEPSKQEVPKLMETPPNISNDSSATEKGEATRQQQLPNNRYALTVD...,InChI=1S/NO3/c2-1(3)4/q-1,0000000010000000000000000000000000000000000000...,0
45795,Q9CAT6,MEPSKQEVPKLMETPPNISNDSSATEKGEATRQQQLPNNRYALTVD...,"InChI=1S/CH4N2O/c2-1(3)4/h(H4,2,3,4)",0000000000000000000000000000000000000000000000...,0


In [32]:
for ind in df_UID_MID_test.index:
    if df_UID_MID_test["outcome"][ind] == 0:
        UID, met_ID = df_UID_MID_test["Uniprot ID"][ind], df_UID_MID_test["molecule ID"][ind]
        df_UID_MID_test["Sequence"][ind] = list(df_transporter["Sequence"].loc[df_transporter["UniProt"] == UID])[0]
        df_UID_MID_test["ECFP"][ind] = list(df_transporter["ECFP"].loc[df_transporter["molecule ID"] == met_ID])[0]
df_UID_MID_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome
0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1
1,Q8U2X0,MSEKTTKGVQLLRGDPKKAIVRLSIPMMIGMSVQTLYNLADGIWVS...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1
2,Q9ZPR7,MYMIESKGGAIACMLLALLFLGTWPAIMTLTERRGRLPQHTYLDYT...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
3,P48777,MDGPDQIGPDVRPRRTFGDRVRRAARAFTTRDGLIGDYDYGFLFTP...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
4,Q93Z75,MMIAQELGIYVVESKGGAILCLLLSLLCLGTWPALMALLERRGRLP...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1
...,...,...,...,...,...
11866,Q9X1G6,MSSIKKISFVGIFSALATLVMFLEFPIFPQASFLKYDPSEIPALIV...,InChI=1S/C15H31N3O13P2/c16-13-1-7(20)11(28-13)...,0000000000000000000000000000000000000000000000...,0
11867,Q9X1G6,MSSIKKISFVGIFSALATLVMFLEFPIFPQASFLKYDPSEIPALIV...,InChI=1S/C9H11NO3/c10-8(9(12)13)5-6-1-3-7(11)4...,0100000000000000000000100000000000000000000000...,0
11868,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...,C00195,0100000000001000000000000000000001000000000000...,0
11869,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...,"InChI=1S/C9H17NO5/c1-9(2,5-11)7(14)8(15)10-4-3...",0100000000000100000000000000000001000000000000...,0


In [33]:
df_UID_MID_train.to_pickle(join(".", "protein_data", "df_UID_MID_train_V3.pkl"))
df_UID_MID_test.to_pickle(join(".", "protein_data", "df_UID_MID_test_V3.pkl"))

## 4.Calculating enzyme representations for all Sequences:

In [34]:
df_Uniprot.drop_duplicates(inplace = True)
df_Uniprot.reset_index(inplace = True)
df_Uniprot

Unnamed: 0,level_0,index,Uniprot ID,Sequence,cluster,identity
0,0,0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,1134.0,40-60%
1,1,1,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,1666.0,
2,2,2,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,3635.0,
3,3,3,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,1827.0,
4,4,4,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,162.0,
...,...,...,...,...,...,...
8971,8971,14810,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...,1263.0,<40%
8972,8972,14811,P00505,MALLHSGRVLPGIAAAFHPGLAAAASARASSWWTHVEMGPPDPILG...,3612.0,
8973,8973,14813,P35396,MEQPQEETPEAREEEKEEVAMGDGAPELNGGPEHTLPSSSCADLSQ...,3478.0,
8974,8974,14814,Q0GMA8,MGPPYSDLRESDEDRPAEAVGSVSGSRNALQPLPGEDDEEPFTTYF...,2039.0,


Creating FASTA file will all sequences as input for ESM1b model

In [35]:
ofile = open(join(".", "protein_data", "all_transporter_sequences.fasta"), "w")
for ind in df_Uniprot.index:
    seq = df_Uniprot["Sequence"][ind]
    if not pd.isnull(seq):
        seq_end = seq.find("#")
        seq = seq[:seq_end]
        ofile.write(">" + str(ind) + "\n" + seq  + "\n")
ofile.close()

Creating enzyme representations on HILBERT. Using the created .pt file to map ESM1b vectors to Sequences:

In [36]:
import torch 

df_Uniprot["ESM1b"] = ""
rep_dict = torch.load(join(".", "protein_data", "all_transporter_sequences.pt"))

for ind in df_Uniprot.index:
    try:
        df_Uniprot["ESM1b"][ind] = rep_dict[str(ind) +".pt"]
    except:
        print(ind)
df_Uniprot

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


4
39
45
48
54
61
62
66
74
75
79
92
153
218
277
317
340
341
342
369
371
442
524
564
565
588
602
650
659
660
682
705
719
852
884
907
943
979
980
981
983
984
1004
1020
1099
1101
1104
1105
1109
1110
1113
1125
1126
1127
1130
1131
1133
1147
1192
1224
1234
1243
1255
1257
1337
1341
1344
1349
1354
1386
1402
1416
1594
1597
1708
1716
1752
1762
1764
1765
1768
1854
1861
1864
1885
1894
1917
1932
1943
1963
1965
1968
1973
1976
1977
1978
1979
1983
1985
1987
1988
1994
2009
2017
2026
2031
2044
2045
2067
2074
2075
2077
2083
2091
2092
2095
2208
2213
2230
2292
2294
2311
2359
2380
2421
2424
2425
2446
2448
2453
2454
2458
2472
2474
2484
2492
2496
2500
2501
2507
2510
2515
2519
2520
2527
2533
2534
2538
2544
2547
2549
2551
2552
2556
2558
2559
2566
2567
2570
2577
2579
2580
2584
2588
2592
2593
2594
2595
2597
2598
2601
2603
2605
2608
2609
2611
2613
2614
2615
2617
2619
2627
2629
2633
2635
2638
2642
2645
2646
2647
2651
2658
2659
2661
2662
2665
2671
2681
2685
2691
2692
2694
2699
2700
2708
2713
2714
2715
2721
2723
2725


Unnamed: 0,level_0,index,Uniprot ID,Sequence,cluster,identity,ESM1b
0,0,0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,1134.0,40-60%,"[0.03662403, 0.14126705, -0.055604078, 0.01308..."
1,1,1,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,1666.0,,"[-0.07108224, 0.096061245, 0.06852332, 0.03081..."
2,2,2,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,3635.0,,"[-0.074354, 0.2844502, 0.062469013, -0.0139034..."
3,3,3,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,1827.0,,"[-0.08255166, 0.14464356, 0.08608833, 0.076414..."
4,4,4,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,162.0,,
...,...,...,...,...,...,...,...
8971,8971,14810,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...,1263.0,<40%,"[-0.010590305, 0.17112774, 0.06293734, -0.0026..."
8972,8972,14811,P00505,MALLHSGRVLPGIAAAFHPGLAAAASARASSWWTHVEMGPPDPILG...,3612.0,,"[-0.064013295, 0.23662551, 0.11381381, 0.00630..."
8973,8973,14813,P35396,MEQPQEETPEAREEEKEEVAMGDGAPELNGGPEHTLPSSSCADLSQ...,3478.0,,"[-0.018252473, 0.25217032, 0.045643948, -0.008..."
8974,8974,14814,Q0GMA8,MGPPYSDLRESDEDRPAEAVGSVSGSRNALQPLPGEDDEEPFTTYF...,2039.0,,"[-0.09520724, 0.0897981, 0.15097311, -0.013405..."


Mapping ESM1b vectors to positive and negative data points:

In [37]:
df_UID_MID_train = df_UID_MID_train.merge(df_Uniprot, how = "left", on = ["Uniprot ID", "Sequence"])
df_UID_MID_test = df_UID_MID_test.merge(df_Uniprot, how = "left", on = ["Uniprot ID", "Sequence"])

In [38]:
#Removing all data points without an ESM1b vector:
df_UID_MID_train = df_UID_MID_train.loc[df_UID_MID_train["ESM1b"] != ""]
df_UID_MID_test = df_UID_MID_test.loc[df_UID_MID_test["ESM1b"] != ""]

  result = libops.scalar_compare(x.ravel(), y, op)


#### Mapping sequence idenitity level to all proteins in the test set:

In [39]:
df_Uniprot

Unnamed: 0,level_0,index,Uniprot ID,Sequence,cluster,identity,ESM1b
0,0,0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,1134.0,40-60%,"[0.03662403, 0.14126705, -0.055604078, 0.01308..."
1,1,1,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,1666.0,,"[-0.07108224, 0.096061245, 0.06852332, 0.03081..."
2,2,2,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,3635.0,,"[-0.074354, 0.2844502, 0.062469013, -0.0139034..."
3,3,3,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,1827.0,,"[-0.08255166, 0.14464356, 0.08608833, 0.076414..."
4,4,4,F2SG60,MAPTEEANVTKPTGELRPDEKLNYEEDVKCSGSSSTTVGKTAYDTD...,162.0,,
...,...,...,...,...,...,...,...
8971,8971,14810,O61967,MPAFFCLPMACQRQVDSIDRSQSNLQAIPSDIFRFRKLEDLNLTMN...,1263.0,<40%,"[-0.010590305, 0.17112774, 0.06293734, -0.0026..."
8972,8972,14811,P00505,MALLHSGRVLPGIAAAFHPGLAAAASARASSWWTHVEMGPPDPILG...,3612.0,,"[-0.064013295, 0.23662551, 0.11381381, 0.00630..."
8973,8973,14813,P35396,MEQPQEETPEAREEEKEEVAMGDGAPELNGGPEHTLPSSSCADLSQ...,3478.0,,"[-0.018252473, 0.25217032, 0.045643948, -0.008..."
8974,8974,14814,Q0GMA8,MGPPYSDLRESDEDRPAEAVGSVSGSRNALQPLPGEDDEEPFTTYF...,2039.0,,"[-0.09520724, 0.0897981, 0.15097311, -0.013405..."


In [40]:
df_UID_MID_test["Sequence identity"] = np.nan
for ind in df_UID_MID_test.index:
    UID = df_UID_MID_test["Uniprot ID"][ind]
    help_df = df_Uniprot.loc[df_Uniprot["Uniprot ID"] == UID]
    df_UID_MID_test["Sequence identity"][ind] = list(help_df["identity"])[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [41]:
df_UID_MID_test.head(3)

Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome,level_0,index,cluster,identity,ESM1b,Sequence identity
0,E9LD23,MAEESNKNNMTAHLNKINTYKNNLIISNNSINNNNNSINNNNDIID...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1,0,0,1134.0,40-60%,"[0.03662403, 0.14126705, -0.055604078, 0.01308...",40-60%
1,Q8U2X0,MSEKTTKGVQLLRGDPKKAIVRLSIPMMIGMSVQTLYNLADGIWVS...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1,5,5,3191.0,<40%,"[-0.12811443, 0.05226953, 0.060862567, -0.0099...",<40%
2,Q9ZPR7,MYMIESKGGAIACMLLALLFLGTWPAIMTLTERRGRLPQHTYLDYT...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1,12,12,4103.0,60-80%,"[-0.13493022, 0.1466448, 0.010632281, -0.18228...",60-80%


In [42]:
df_UID_MID_train.to_pickle(join(".", "training_data_V3.pkl"))
df_UID_MID_test.to_pickle(join(".", "test_data_V3.pkl"))

## 5. Trying to fit a first very simple model:

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_UID_MID_train = pd.read_pickle(join(".", "training_data_V3.pkl"))
df_UID_MID_test = pd.read_pickle(join(".", "test_data_V3.pkl"))

In [9]:
df_UID_MID_train["ECFP", 1]

Unnamed: 0,Uniprot ID,Sequence,molecule ID,ECFP,outcome,level_0,index,cluster,identity,ESM1b
0,P23975,MLLARMNPQVQPENNGADTGPEQPLRARKTAELLVVKERNGVQCLL...,InChI=1S/C8H11NO3/c9-4-8(12)5-1-2-6(10)7(11)3-...,0100000000000000001000000000000000000000000000...,1,1,1,1666.0,,"[-0.07108224, 0.096061245, 0.06852332, 0.03081..."
1,P11473,MEAMAASTSLPDPGDFDRNVPRICGVCGDRATGFHFNAMTCEGCKG...,InChI=1S/C27H44O/c1-19(2)8-6-9-21(4)25-15-16-2...,0100100010000000000000000000000001011000010000...,1,2,2,3635.0,,"[-0.074354, 0.2844502, 0.062469013, -0.0139034..."
2,Q96W86,MTATVPQEGVVDLESQPDDRLRAEALATTAAELPEGYYTSARVMAS...,"InChI=1S/C15H20O6/c1-7-3-9-14(5-16,11(19)10(7)...",0100000000000000000000100000000001001000000000...,1,3,3,1827.0,,"[-0.08255166, 0.14464356, 0.08608833, 0.076414..."
4,WP_068464567.1,MKIKDWNRSLKVRLVGEFFMNTSFWMVFPFLAIYFAEEFGKGLAGM...,InChI=1S/C16H18FN3O3/c1-2-19-9-11(16(22)23)15(...,0100000000000000000100010000000001100010000001...,1,6,6,3689.0,,"[0.13784999, 0.2672458, 0.11770853, -0.1454757..."
5,Q86MB6,MALGFSSAGEVYMYATCILLGVSLLMPLNALVSAPRFMVDYYKYVS...,InChI=1S/C5H4N4O2/c10-4-2-3(7-1-6-2)8-5(11)9-4...,0000000000000000000000000000000000000000000000...,1,7,7,3540.0,,"[0.064317204, 0.21982138, 0.1075682, -0.073877..."
...,...,...,...,...,...,...,...,...,...,...
45792,Q84W56,MMKPASLQGFSSHASSSIYSDVRRPATTPSKMAAFSALSLCPYTFT...,InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h...,0100000000000000000000000000000000000000000000...,0,8975,14815,665.0,,"[-0.065544195, 0.3663752, 0.1645551, -0.034514..."
45793,Q84W56,MMKPASLQGFSSHASSSIYSDVRRPATTPSKMAAFSALSLCPYTFT...,InChI=1S/C15H31N3O13P2/c16-13-1-7(20)11(28-13)...,0000000000000000000000000000000000000000000000...,0,8975,14815,665.0,,"[-0.065544195, 0.3663752, 0.1645551, -0.034514..."
45794,Q9CAT6,MEPSKQEVPKLMETPPNISNDSSATEKGEATRQQQLPNNRYALTVD...,InChI=1S/NO3/c2-1(3)4/q-1,0000000010000000000000000000000000000000000000...,0,803,927,2280.0,,"[-0.07403659, 0.12013931, 0.092609845, -0.1040..."
45795,Q9CAT6,MEPSKQEVPKLMETPPNISNDSSATEKGEATRQQQLPNNRYALTVD...,"InChI=1S/CH4N2O/c2-1(3)4/h(H4,2,3,4)",0000000000000000000000000000000000000000000000...,0,803,927,2280.0,,"[-0.07403659, 0.12013931, 0.092609845, -0.1040..."


##### Splitting dataset in 80% training data and 20% test data (splitting by uniprot ID):

In [91]:
train_UIDs = list(set(df_UID_MID_train["Uniprot ID"]))
len(train_UIDs)

6136

In [5]:
df_train = df_UID_MID_train.copy()
df_test = df_UID_MID_test.copy()

In [9]:
14817/(10659 + 41880)

0.2820190715468509

In [93]:
from os.path import join
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
#from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef



def create_input_and_output_data(df):
    X = ();
    y = ();
    
    for ind in df.index:
        emb = df["ESM1b"][ind]
        ecfp = np.array(list(df["ECFP"][ind])).astype(int)
                
        X = X +(np.concatenate([ecfp, emb]), );
        y = y + (df["outcome"][ind], );

    return(X,y)

train_X, train_y =  create_input_and_output_data(df = df_train)
test_X, test_y =  create_input_and_output_data(df = df_test)


feature_names =  ["ECFP_" + str(i) for i in range(1024)]
feature_names = feature_names + ["ESM1b_" + str(i) for i in range(1280)]

train_X = np.array(train_X)
test_X  = np.array(test_X)

train_y = np.array(train_y)
test_y  = np.array(test_y)

In [101]:
param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 542.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}

num_round = param["num_rounds"]

param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"

param['objective'] = 'binary:logistic'
weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["outcome"]])

del param["num_rounds"]
del param["weight"]


dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y))
dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y))

evallist = [(dtest, 'eval'), (dtrain, 'train')]

bst = xgb.train(param,  dtrain, int(num_round), evallist)

y_test_pred = np.round(bst.predict(dtest))
acc_test = np.mean(y_test_pred == np.array(test_y))

roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))
mcc = matthews_corrcoef(np.array(test_y), y_test_pred)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"  % (acc_test, roc_auc, mcc))


[0]	eval-error:0.54359	train-error:0.33881
[1]	eval-error:0.51644	train-error:0.30927
[2]	eval-error:0.47889	train-error:0.27987
[3]	eval-error:0.43384	train-error:0.25327
[4]	eval-error:0.42839	train-error:0.24121
[5]	eval-error:0.41924	train-error:0.23573
[6]	eval-error:0.40835	train-error:0.22345
[7]	eval-error:0.40105	train-error:0.21954
[8]	eval-error:0.38636	train-error:0.20369
[9]	eval-error:0.37644	train-error:0.19678
[10]	eval-error:0.37138	train-error:0.19297
[11]	eval-error:0.36534	train-error:0.18771
[12]	eval-error:0.36116	train-error:0.18304
[13]	eval-error:0.35523	train-error:0.17742
[14]	eval-error:0.34365	train-error:0.16988
[15]	eval-error:0.33800	train-error:0.16530
[16]	eval-error:0.33110	train-error:0.15876
[17]	eval-error:0.32720	train-error:0.15394
[18]	eval-error:0.32302	train-error:0.15004
[19]	eval-error:0.32146	train-error:0.14586
[20]	eval-error:0.31670	train-error:0.14126
[21]	eval-error:0.31349	train-error:0.13914
[22]	eval-error:0.30784	train-error:0.1357

[185]	eval-error:0.17883	train-error:0.02556
[186]	eval-error:0.17815	train-error:0.02553
[187]	eval-error:0.17805	train-error:0.02550
[188]	eval-error:0.17824	train-error:0.02543
[189]	eval-error:0.17795	train-error:0.02527
[190]	eval-error:0.17786	train-error:0.02513
[191]	eval-error:0.17795	train-error:0.02505
[192]	eval-error:0.17805	train-error:0.02494
[193]	eval-error:0.17776	train-error:0.02492
[194]	eval-error:0.17883	train-error:0.02488
[195]	eval-error:0.17834	train-error:0.02476
[196]	eval-error:0.17795	train-error:0.02449
[197]	eval-error:0.17815	train-error:0.02430
[198]	eval-error:0.17854	train-error:0.02433
[199]	eval-error:0.17795	train-error:0.02424
[200]	eval-error:0.17756	train-error:0.02426
[201]	eval-error:0.17698	train-error:0.02417
[202]	eval-error:0.17698	train-error:0.02411
[203]	eval-error:0.17756	train-error:0.02401
[204]	eval-error:0.17747	train-error:0.02389
[205]	eval-error:0.17717	train-error:0.02387
[206]	eval-error:0.17766	train-error:0.02400
[207]	eval

[368]	eval-error:0.16180	train-error:0.01808
[369]	eval-error:0.16239	train-error:0.01812
[370]	eval-error:0.16239	train-error:0.01811
[371]	eval-error:0.16219	train-error:0.01806
[372]	eval-error:0.16209	train-error:0.01802
[373]	eval-error:0.16112	train-error:0.01799
[374]	eval-error:0.16151	train-error:0.01799
[375]	eval-error:0.16161	train-error:0.01797
[376]	eval-error:0.16161	train-error:0.01797
[377]	eval-error:0.16170	train-error:0.01797
[378]	eval-error:0.16180	train-error:0.01793
[379]	eval-error:0.16180	train-error:0.01794
[380]	eval-error:0.16190	train-error:0.01788
[381]	eval-error:0.16170	train-error:0.01790
[382]	eval-error:0.16248	train-error:0.01788
[383]	eval-error:0.16278	train-error:0.01787
[384]	eval-error:0.16297	train-error:0.01787
[385]	eval-error:0.16287	train-error:0.01788
[386]	eval-error:0.16229	train-error:0.01788
[387]	eval-error:0.16209	train-error:0.01787
[388]	eval-error:0.16209	train-error:0.01785
[389]	eval-error:0.16190	train-error:0.01784
[390]	eval

In [100]:
df_UID_MID_test["prediction"] = y_test_pred

seq_identity = ["60-80%", "40-60%", "<40%"]

for identity in seq_identity:
    y_true = np.array(df_UID_MID_test["outcome"].loc[df_UID_MID_test["Sequence identity"] == identity])
    y_pred = np.array(df_UID_MID_test["prediction"].loc[df_UID_MID_test["Sequence identity"] == identity])
    acc = np.mean(y_pred == np.array(y_true))
    mcc = matthews_corrcoef(np.array(y_true), y_pred)
    print("Sequence identity %s, Accuracy: %s, MCC: %s \n" % (identity, acc, mcc))

Sequence identity 60-80%, Accuracy: 0.8461873638344226, MCC: 0.6140625864438229 

Sequence identity 40-60%, Accuracy: 0.8399518652226233, MCC: 0.5777100133860964 

Sequence identity <40%, Accuracy: 0.8193774660236738, MCC: 0.5039163841394687 

