### Import ids for randomization

In [1]:
import pandas as pd

pos_id = pd.read_csv("pos_cluster/positive_representative_data.tsv" , sep="\t")
neg_id = pd.read_csv("neg_cluster/negative_representative_data.tsv" , sep="\t")
pos_id['Class']="Positive"
neg_id['Class']="Negative"
pos_id.head()

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class
0,O43155,Homo sapiens,Metazoa,660,1,35,Positive
1,O43866,Homo sapiens,Metazoa,347,1,19,Positive
2,O75629,Homo sapiens,Metazoa,220,1,31,Positive
3,O94985,Homo sapiens,Metazoa,981,1,28,Positive
4,P01579,Homo sapiens,Metazoa,166,1,23,Positive


### Randomization of positive and negative ids

In [2]:
#DO NOT RERUN THIS CELL

pos_random = pos_id.sample(frac=1).reset_index(drop=True)
neg_random = neg_id.sample(frac=1).reset_index(drop=True)

In [3]:
pos_random.head()

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class
0,P82358,Acanthoscurria gomesiana,Metazoa,84,1,23,Positive
1,Q6UW56,Homo sapiens,Metazoa,229,1,30,Positive
2,P04639,Rattus norvegicus,Metazoa,259,1,18,Positive
3,Q29381,Sus scrofa,Metazoa,439,1,25,Positive
4,P52706,Prunus serotina,Viridiplantae,563,1,27,Positive


In [4]:
neg_random.head()

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,HelixDomain,Class
0,O74792,Schizosaccharomyces pombe (strain 972 / ATCC 2...,Fungi,515,False,Negative
1,O95476,Homo sapiens,Metazoa,244,True,Negative
2,O35465,Mus musculus,Metazoa,402,False,Negative
3,Q9VGL8,Drosophila melanogaster,Metazoa,363,False,Negative
4,Q9Y4R8,Homo sapiens,Metazoa,837,False,Negative


### Divide 80% training and 20% benchmark sets

In [5]:
#training set creation

n_pos = int(len(pos_random) * 0.8)
n_neg = int(len(neg_random) * 0.8)

training_pos = pos_random[:n_pos] 
training_neg = neg_random[:n_neg]

In [6]:
print(n_pos , n_neg)

874 7147


In [7]:
training_pos

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class
0,P82358,Acanthoscurria gomesiana,Metazoa,84,1,23,Positive
1,Q6UW56,Homo sapiens,Metazoa,229,1,30,Positive
2,P04639,Rattus norvegicus,Metazoa,259,1,18,Positive
3,Q29381,Sus scrofa,Metazoa,439,1,25,Positive
4,P52706,Prunus serotina,Viridiplantae,563,1,27,Positive
...,...,...,...,...,...,...,...
869,P13691,Hordeum vulgare,Viridiplantae,152,1,30,Positive
870,Q9Y2I2,Homo sapiens,Metazoa,539,1,28,Positive
871,A0A0R4I951,Scolopendra dehaani,Metazoa,70,1,23,Positive
872,P32004,Homo sapiens,Metazoa,1257,1,19,Positive


In [8]:
training_neg

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,HelixDomain,Class
0,O74792,Schizosaccharomyces pombe (strain 972 / ATCC 2...,Fungi,515,False,Negative
1,O95476,Homo sapiens,Metazoa,244,True,Negative
2,O35465,Mus musculus,Metazoa,402,False,Negative
3,Q9VGL8,Drosophila melanogaster,Metazoa,363,False,Negative
4,Q9Y4R8,Homo sapiens,Metazoa,837,False,Negative
...,...,...,...,...,...,...
7142,P47120,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,325,False,Negative
7143,P08212,Pisum sativum,Viridiplantae,81,True,Negative
7144,P53064,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,558,False,Negative
7145,Q80TH2,Mus musculus,Metazoa,1402,False,Negative


In [9]:
#benchmark set creation

benchmark_pos = pos_random[n_pos:] 
benchmark_neg = neg_random[n_neg:]

In [10]:
benchmark_pos

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class
874,Q9H7M9,Homo sapiens,Metazoa,311,1,32,Positive
875,Q8SXL2,Drosophila melanogaster,Metazoa,136,1,24,Positive
876,P27731,Gallus gallus,Metazoa,150,1,20,Positive
877,P31419,Manduca sexta,Metazoa,161,1,20,Positive
878,Q8MVA6,Ixodes scapularis,Metazoa,69,1,23,Positive
...,...,...,...,...,...,...,...
1088,Q14802,Homo sapiens,Metazoa,87,1,20,Positive
1089,Q8NI17,Homo sapiens,Metazoa,732,1,19,Positive
1090,Q62813,Rattus norvegicus,Metazoa,338,1,28,Positive
1091,I3LGZ3,Sus scrofa,Metazoa,81,1,24,Positive


In [11]:
benchmark_neg

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,HelixDomain,Class
7147,P32266,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,881,False,Negative
7148,Q8N1N2,Homo sapiens,Metazoa,210,False,Negative
7149,Q94JY0,Arabidopsis thaliana,Viridiplantae,330,False,Negative
7150,Q8LDU1,Arabidopsis thaliana,Viridiplantae,321,False,Negative
7151,Q9ZNT7,Arabidopsis thaliana,Viridiplantae,286,False,Negative
...,...,...,...,...,...,...
8929,Q9LFP6,Arabidopsis thaliana,Viridiplantae,367,False,Negative
8930,Q7S5M7,Neurospora crassa (strain ATCC 24698 / 74-OR23...,Fungi,64,True,Negative
8931,Q9Y4P9,Homo sapiens,Metazoa,236,False,Negative
8932,Q1K7P9,Neurospora crassa (strain ATCC 24698 / 74-OR23...,Fungi,140,False,Negative


In [12]:
# Check if we have intersection between datasets

check1 = pd.merge(benchmark_neg , training_neg , how="inner")
check2 = pd.merge(benchmark_pos , training_pos , how="inner")
print(check1)
print(check2)

Empty DataFrame
Columns: [EntryID, OrganismName, Kingdom, SequenceLength, HelixDomain, Class]
Index: []
Empty DataFrame
Columns: [EntryID, OrganismName, Kingdom, SequenceLength, SPStart, SPEnd, Class]
Index: []


In [13]:
# Merge positive and negative benchmark sets

benchmark = pd.concat([benchmark_neg , benchmark_pos] , ignore_index=True)
#len(benchmark)
print(len(benchmark_pos),'+',len(benchmark_neg),'=',len(benchmark_neg)+len(benchmark_pos))

219 + 1787 = 2006


In [14]:
benchmark

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,HelixDomain,Class,SPStart,SPEnd
0,P32266,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,881,False,Negative,,
1,Q8N1N2,Homo sapiens,Metazoa,210,False,Negative,,
2,Q94JY0,Arabidopsis thaliana,Viridiplantae,330,False,Negative,,
3,Q8LDU1,Arabidopsis thaliana,Viridiplantae,321,False,Negative,,
4,Q9ZNT7,Arabidopsis thaliana,Viridiplantae,286,False,Negative,,
...,...,...,...,...,...,...,...,...
2001,Q14802,Homo sapiens,Metazoa,87,,Positive,1.0,20.0
2002,Q8NI17,Homo sapiens,Metazoa,732,,Positive,1.0,19.0
2003,Q62813,Rattus norvegicus,Metazoa,338,,Positive,1.0,28.0
2004,I3LGZ3,Sus scrofa,Metazoa,81,,Positive,1.0,24.0


### Adding sequences for benchmark set from the fasta file

In [15]:
from Bio import SeqIO
import subprocess
import shlex
'''
Takes as input a list of ids 
and  a dataset of fasta sequences (for example the entire swissprot)
'''

#merge all the negative and positive representative sequences fasta files 
command = "cat neg_cluster/neg_rep_seq.fasta pos_cluster/pos_rep_seq.fasta" 
with open("rep_dataset.fasta", "w") as f:
    subprocess.run(shlex.split(command), stdout=f)
    
benchmark['Set']="Benchmark"
rep_dataset='rep_dataset.fasta'    
ids_to_get=benchmark['EntryID'].tolist() #convert the ids of the benchmark  to a list

found = 0 #keeps track of single ids.
counter = len(ids_to_get) #number of the ids inside the benchmark list


#find all the sequences inside the all representative sequences fasta file that corresponds to the ids inside the benchmark set
with open(rep_dataset , 'r') as file1:
    for prot in SeqIO.parse(file1 , 'fasta'):
        if str(prot.id) in ids_to_get:
            found = found + 1
            benchmark.loc[benchmark['EntryID'] == str(prot.id), 'Sequence'] = str(prot.seq)
        else:
            pass
            
print("how many ids are in the benchmark set: ", counter)
print("total number of sequences recovered: ",found)

how many ids are in the benchmark set:  2006
total number of sequences recovered:  2006


In [16]:
benchmark

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,HelixDomain,Class,SPStart,SPEnd,Set,Sequence
0,P32266,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,881,False,Negative,,,Benchmark,MNASPVRLLILRRQLATHPAILYSSPYIKSPLVHLHSRMSNVHRSA...
1,Q8N1N2,Homo sapiens,Metazoa,210,False,Negative,,,Benchmark,MVADIKGNEQIEKYSWREACDTGSSRMDRKHGKYILNVEHSENQPP...
2,Q94JY0,Arabidopsis thaliana,Viridiplantae,330,False,Negative,,,Benchmark,MGSISMHITPSTALPIRHFRARVSCCSSGHVSFIKDVAATEPPMHL...
3,Q8LDU1,Arabidopsis thaliana,Viridiplantae,321,False,Negative,,,Benchmark,MKSLVAHFSTPLITARLVPRCIIHRASISAVSFSTVRRRFSPLTMA...
4,Q9ZNT7,Arabidopsis thaliana,Viridiplantae,286,False,Negative,,,Benchmark,MSFNKVPNIPGAPALSALLKVSVIGGLGVYALTNSLYNVDGGHRAV...
...,...,...,...,...,...,...,...,...,...,...
2001,Q14802,Homo sapiens,Metazoa,87,,Positive,1.0,20.0,Benchmark,MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG...
2002,Q8NI17,Homo sapiens,Metazoa,732,,Positive,1.0,19.0,Benchmark,MMWTWALWMLPSLCKFSLAALPAKPENISCVYYYRKNLTCTWSPGK...
2003,Q62813,Rattus norvegicus,Metazoa,338,,Positive,1.0,28.0,Benchmark,MVGRVQPDRKQLPLVLLRLLCLLPTGLPVRSVDFNRGTDNITVRQG...
2004,I3LGZ3,Sus scrofa,Metazoa,81,,Positive,1.0,24.0,Benchmark,MRFLALTSLLCILLLCLSFFSAEGRRHPRNPAKPGKIRICCPRLPG...


### Adding sequences for training subset set from the fasta file

In [39]:
start=0
step_pos=n_pos//5
for i in range(1,6):
    if i==5:
        subset = pd.concat([subset , training_pos[start:]] , ignore_index=True)
        ids_to_get=training_pos[start:]['EntryID'].tolist()
    elif i==1:
        stop=i*step_pos
        subset=training_pos[start:stop]
        ids_to_get=training_pos[start:stop]['EntryID'].tolist()
        start=i*step_pos
    else:
        stop=i*step_pos
        subset = pd.concat([subset , training_pos[start:stop]] , ignore_index=True)
        ids_to_get=training_pos[start:stop]['EntryID'].tolist()
        start=i*step_pos
    print(f"Total number positive sequences in the subset {i}: ",len(subset))
    with open(rep_dataset , 'r') as file1:
        for prot in SeqIO.parse(file1 , 'fasta'):
            if str(prot.id) in ids_to_get:
                found = found + 1
                subset.loc[subset['EntryID'] == str(prot.id), 'Sequence'] = str(prot.seq)
                subset.loc[subset['EntryID'] == str(prot.id), 'Set'] = f"{i}"
            else:
                pass

###now work on negatives
step_neg=n_neg//5
start=0
for i in range(1,6):
    if i==5:
        subset = pd.concat([subset , training_neg[start:]], ignore_index=True)
        ids_to_get=training_neg[start:]['EntryID'].tolist()
    else:
        stop=i*step_neg
        subset = pd.concat([subset , training_neg[start:stop]] , ignore_index=True)
        ids_to_get=training_neg[start:stop]['EntryID'].tolist()
        start=i*step_neg
    print(f"Total number negative sequences in the subset {i}: ",len(subset))
    with open(rep_dataset , 'r') as file1:
        for prot in SeqIO.parse(file1 , 'fasta'):
            if str(prot.id) in ids_to_get:
                    found = found + 1
                    subset.loc[subset['EntryID'] == str(prot.id), 'Sequence'] = str(prot.seq)
                    subset.loc[subset['EntryID'] == str(prot.id), 'Set'] = f"{i}"
            else:
                pass

Total number positive sequences in the subset 1:  174


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset.loc[subset['EntryID'] == str(prot.id), 'Sequence'] = str(prot.seq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset.loc[subset['EntryID'] == str(prot.id), 'Set'] = f"{i}"


Total number positive sequences in the subset 2:  348
Total number positive sequences in the subset 3:  522
Total number positive sequences in the subset 4:  696
Total number positive sequences in the subset 5:  874
Total number negative sequences in the subset 1:  2303
Total number negative sequences in the subset 2:  3732
Total number negative sequences in the subset 3:  5161
Total number negative sequences in the subset 4:  6590
Total number negative sequences in the subset 5:  8021


In [40]:
len(subset)

8021

In [41]:
subset

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class,Sequence,Set,HelixDomain
0,P82358,Acanthoscurria gomesiana,Metazoa,84,1.0,23.0,Positive,MNRTRLFACLLLAVLILVHESNAQCRRLCYKQRCVTYCRGRGKRSL...,1,
1,Q6UW56,Homo sapiens,Metazoa,229,1.0,30.0,Positive,MAPHDPGSLTTLVPWAAALLLALGVERALALPEICTQCPGSVQNLS...,1,
2,P04639,Rattus norvegicus,Metazoa,259,1.0,18.0,Positive,MKAAVLAVALVFLTGCQAWEFWQQDEPQSQWDRVKDFATVYVDAVK...,1,
3,Q29381,Sus scrofa,Metazoa,439,1.0,25.0,Positive,MELGAAARAWSLLWLLLPLLGLVGASGPRTLVLLDNLNLRETHSLF...,1,
4,P52706,Prunus serotina,Viridiplantae,563,1.0,27.0,Positive,MEKSTMSAILLVLHLFVLLLQYSEVHSLATTSNHDFSYLRFAYDAT...,1,
...,...,...,...,...,...,...,...,...,...,...
8016,P47120,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,325,,,Negative,MSTNFEKHFQENVDECTLEQLRDILVNKSGKTVLANRFRALFNLKT...,5,False
8017,P08212,Pisum sativum,Viridiplantae,81,,,Negative,MNPLIAAASVIAAGLAVGLASIGPGVGQGTAAGQAVEGIARQPEAE...,5,True
8018,P53064,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,558,,,Negative,MSDLDEDLLALAGADESEEEDQVLTTTSAKRAKNNDQSLSKKRRIE...,5,False
8019,Q80TH2,Mus musculus,Metazoa,1402,,,Negative,MTTKRSLFVRLVPCRCLRGEEETVTTLDYSHCSLEQVPKEIFTFEK...,5,False


In [48]:
subset.loc[subset['Set'] == "5"]

Unnamed: 0,EntryID,OrganismName,Kingdom,SequenceLength,SPStart,SPEnd,Class,Sequence,Set,HelixDomain
696,Q7SCE9,Neurospora crassa (strain ATCC 24698 / 74-OR23...,Fungi,385,1.0,18.0,Positive,MKFSIISVALASAITVDAHGYLTIPFSRTRLGAEAGLDTCPECSIL...,5,
697,Q40975,Papaver rhoeas,Viridiplantae,139,1.0,19.0,Positive,MNIFYVIVLLSFFLSKSSGFFPVIEVRIMNRRGNGRSIGIHCRSKD...,5,
698,P79073,Hypocrea jecorina,Fungi,86,1.0,15.0,Positive,MQFFAVALFATSALAAVCPTGLFSNPLCCATNVLDLIGVDCKTPTI...,5,
699,P20507,Chlamydomonas reinhardtii,Viridiplantae,377,1.0,20.0,Positive,MARTGALLLVALALAGCAQACIYKFGTSPDSKATVSGDHWDHGLNG...,5,
700,Q99013,Pseudopleuronectes americanus,Metazoa,82,1.0,23.0,Positive,MALSLFTVGQLIFLFWTMRITEARPDPAAKAAPAAAAVPAAAAPDT...,5,
...,...,...,...,...,...,...,...,...,...,...
8016,P47120,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,325,,,Negative,MSTNFEKHFQENVDECTLEQLRDILVNKSGKTVLANRFRALFNLKT...,5,False
8017,P08212,Pisum sativum,Viridiplantae,81,,,Negative,MNPLIAAASVIAAGLAVGLASIGPGVGQGTAAGQAVEGIARQPEAE...,5,True
8018,P53064,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,558,,,Negative,MSDLDEDLLALAGADESEEEDQVLTTTSAKRAKNNDQSLSKKRRIE...,5,False
8019,Q80TH2,Mus musculus,Metazoa,1402,,,Negative,MTTKRSLFVRLVPCRCLRGEEETVTTLDYSHCSLEQVPKEIFTFEK...,5,False


In [50]:
train_bench = pd.concat([benchmark , subset] , ignore_index=True)
train_bench.to_csv("train_bench.tsv",sep="\t", index=False)

In [55]:
print("Training Positives",len(subset.loc[subset['Class'] == "Positive"]))
print("Training Negatives",len(subset.loc[subset['Class'] == "Negative"]))
print("Benchmark Positives",len(benchmark.loc[benchmark['Class'] == "Positive"]))
print("Benchmark Positives",len(benchmark.loc[benchmark['Class'] == "Negative"]))

Training Positives 874
Training Negatives 7147
Benchmark Positives 219
Benchmark Positives 1787
