### Import ids for randomization

In [1]:
import pandas as pd

# Both representative tables are tab-separated; only their first column is kept,
# renamed to "id", and the original header row is skipped.
_id_column_opts = dict(sep="\t", usecols=[0], names=["id"], skiprows=1)

pos_id = pd.read_csv("pos_cluster/positive_representative_data.tsv", **_id_column_opts)
neg_id = pd.read_csv("neg_cluster/negative_representative_data.tsv", **_id_column_opts)

# Preview the first positive ids.
pos_id.head()

Unnamed: 0,id
0,O43155
1,O43866
2,O75629
3,O94985
4,P01579


In [2]:
# Preview the first five negative ids.
neg_id.head(n=5)

Unnamed: 0,id
0,Q6YN16
1,Q9NTK1
2,P50542
3,P49419
4,Q9Y3I1


### Randomization of positive and negative ids

In [3]:
# Shuffle the positive and negative ids once, before the training/benchmark split.
# A fixed seed makes the permutation identical on every run, so re-running this
# cell (or Restart & Run All) always yields the same split.
# NOTE(review): artifacts produced by an earlier, unseeded run will not match this
# shuffle -- regenerate the downstream files once after adopting the seed.
SEED = 42

# sample(frac=1) returns every row in random order; reset_index(drop=True)
# renumbers the rows 0..n-1 so later positional slices are easy to read.
pos_random = pos_id.sample(frac=1, random_state=SEED).reset_index(drop=True)
neg_random = neg_id.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [4]:
# Preview the first five shuffled positive ids.
pos_random.head(n=5)

Unnamed: 0,id
0,P13671
1,P18684
2,Q94K85
3,Q56S59
4,Q9UBN6


In [5]:
# Preview the first five shuffled negative ids.
neg_random.head(n=5)

Unnamed: 0,id
0,P47164
1,P50402
2,Q5TEU4
3,P61581
4,Q9P000


### Split into 80% training and 20% benchmark sets

In [6]:
# Training set: the leading 80% of each shuffled id table.
TRAIN_FRACTION = 0.8

# int() truncates, so the training share is rounded down to a whole row count.
n_pos = int(len(pos_random) * TRAIN_FRACTION)
n_neg = int(len(neg_random) * TRAIN_FRACTION)

# Positional slices: rows 0 .. n-1 of each shuffled table.
training_pos = pos_random.iloc[:n_pos]
training_neg = neg_random.iloc[:n_neg]

In [7]:
# Number of positive and negative ids assigned to the training set.
print(n_pos , n_neg)

874 7147


In [37]:
# Inspect the positive training ids.
training_pos

Unnamed: 0,id
0,P13671
1,P18684
2,Q94K85
3,Q56S59
4,Q9UBN6
...,...
869,Q09022
870,Q2ULM2
871,Q8N766
872,Q8I7X1


In [38]:
# Inspect the negative training ids.
training_neg

Unnamed: 0,id
0,P47164
1,P50402
2,Q5TEU4
3,P61581
4,Q9P000
...,...
7142,Q3UHH1
7143,P39103
7144,Q9SU91
7145,Q33DK1


In [10]:
# Benchmark set: every row that was not taken by the training slice
# (positions n_pos / n_neg onwards of the shuffled tables).
benchmark_pos = pos_random.iloc[n_pos:]
benchmark_neg = neg_random.iloc[n_neg:]

In [35]:
# Inspect the positive benchmark ids (index continues from the training slice).
benchmark_pos

Unnamed: 0,id
874,P07981
875,P32004
876,Q06KA5
877,Q9NS68
878,P04557
...,...
1088,Q8BPM6
1089,P81941
1090,P30659
1091,O97939


In [36]:
# Inspect the negative benchmark ids (index continues from the training slice).
benchmark_neg

Unnamed: 0,id
7147,Q62384
7148,Q32P44
7149,Q5IRJ6
7150,Q949Q5
7151,Q9NUQ2
...,...
8929,Q15760
8930,Q91Z67
8931,Q8BG21
8932,P53507


In [13]:
# Check if we have intersection between datasets
# An inner merge on the shared "id" column keeps only the ids present in both
# frames, so an empty result means the two sets are disjoint.

check1 = pd.merge(benchmark_neg , training_neg , how="inner")
check2 = pd.merge(benchmark_pos , training_pos , how="inner")
print(check1)
print(check2)

# Enforce the invariant instead of relying on someone reading the printed frames:
# any shared id would leak benchmark data into training.
assert check1.empty, f"{len(check1)} negative ids are in both benchmark and training sets"
assert check2.empty, f"{len(check2)} positive ids are in both benchmark and training sets"

Empty DataFrame
Columns: [id]
Index: []
Empty DataFrame
Columns: [id]
Index: []


In [14]:
# Build the full benchmark set: negatives first, then positives, renumbered 0..n-1.
benchmark = pd.concat([benchmark_neg, benchmark_pos], ignore_index=True)

# Report the size of each part and their sum.
n_bench_pos = len(benchmark_pos)
n_bench_neg = len(benchmark_neg)
print(f"{n_bench_pos} + {n_bench_neg} = {n_bench_neg + n_bench_pos}")

219 + 1787 = 2006


In [31]:
# Inspect the merged benchmark ids.
benchmark

Unnamed: 0,id
0,Q62384
1,Q32P44
2,Q5IRJ6
3,Q949Q5
4,Q9NUQ2
...,...
2001,Q8BPM6
2002,P81941
2003,P30659
2004,O97939


### Create the benchmark FASTA file

In [30]:
from sys import argv
from Bio import SeqIO
import subprocess
import shlex
# Extract the benchmark sequences into their own FASTA file.
# Inputs: the list of benchmark ids (compared against each record's `id` as parsed
# by SeqIO) and a FASTA file holding all representative sequences.

#merge all the negative and positive representative sequences fasta files 
command = "cat neg_cluster/neg_rep_seq.fasta pos_cluster/pos_rep_seq.fasta" 
with open("rep_dataset.fasta", "w") as f:
    # check=True: stop here if cat fails (e.g. a missing input file) instead of
    # silently continuing with an empty or truncated merged file.
    subprocess.run(shlex.split(command), stdout=f, check=True)


rep_dataset='rep_dataset.fasta'    
output_file_name = 'benchmark_sequences.fasta'
ids_to_get=benchmark['id'].tolist() #convert the ids of the benchmark  to a list
wanted_ids = set(ids_to_get) #set gives constant-time membership tests in the loop below

found = 0 #number of records written to the output file
counter = len(ids_to_get) #number of the ids inside the benchmark list

#find all the sequences inside the all representative sequences fasta file that corresponds to the ids inside the benchmark set
with open(rep_dataset , 'r') as file1 , open(output_file_name , 'w') as file2:
    for prot in SeqIO.parse(file1 , 'fasta'):
        if str(prot.id) in wanted_ids:
            found = found + 1
            SeqIO.write(prot , file2 , 'fasta')
            
print("how many ids are in the benchmark set: ", counter)
print("total number of sequences recovered: ",found)

how many ids are in the benchmark set:  2006
total number of sequences recovered:  2006


## Creating the subsets of the training set

In [95]:
def _write_matching_records(ids, fasta_in, fasta_out, mode):
    """Copy the FASTA records whose id is in `ids` from `fasta_in` to `fasta_out`.

    ids       -- iterable of record ids, compared against str(record.id)
    fasta_in  -- path of the FASTA file to scan
    fasta_out -- path of the FASTA file to write
    mode      -- open() mode for fasta_out: 'w' starts the file, 'a' extends it

    Returns the number of records written.
    """
    wanted = set(ids)  # constant-time membership instead of scanning a list per record
    written = 0
    with open(fasta_in , 'r') as src , open(fasta_out , mode) as dst:
        for prot in SeqIO.parse(src , 'fasta'):
            if str(prot.id) in wanted:
                SeqIO.write(prot , dst , 'fasta')
                written += 1
    return written


# Size of the first four subsets of each class; the fifth takes what is left.
step_pos=n_pos//5
step_neg=n_neg//5

# Positives are processed first and create subset{i}.fasta ('w');
# negatives are then appended to the same files ('a').
for label, training, step, mode in (
    ("positive", training_pos, step_pos, 'w'),
    ("negative", training_neg, step_neg, 'a'),
):
    for i in range(1,6):
        start=(i-1)*step
        if i==5:
            # last subset absorbs the remainder of the integer division
            subset=training[start:]
        else:
            subset=training[start:i*step]
        print(f"Total number {label} sequences in the subset {i}: ",len(subset))
        _write_matching_records(subset['id'].tolist(), rep_dataset, f"subset{i}.fasta", mode)

# Count the FASTA headers of each finished subset file.
for i in range(1,6):
    command = f"grep -c '^>' subset{i}.fasta" 
    print(f"Total number sequences in the total subset {i}: ", subprocess.run(shlex.split(command),capture_output=True, text=True).stdout.strip())

Total number positive sequences in the subset 1 174
Total number positive sequences in the subset 2 174
Total number positive sequences in the subset 3 174
Total number positive sequences in the subset 4 174
Total number positive sequences in the subset 5 178
Total number negative sequences in the subset 1 1429
Total number negative sequences in the subset 2 1429
Total number negative sequences in the subset 3 1429
Total number negative sequences in the subset 4 1429
Total number negative sequences in the subset 5 1431
Total number sequences in the total subset 1:  1603
Total number sequences in the total subset 2:  1603
Total number sequences in the total subset 3:  1603
Total number sequences in the total subset 4:  1603
Total number sequences in the total subset 5:  1609
