In [1]:
import pandas as pd

positive_filtered = pd.read_csv("positive_cluster/positive_filtered.tsv" , sep="\t")
negative_filtered = pd.read_csv("neg_cluster/negative_filtered.tsv" , sep="\t")
positive_filtered['Label']="Positive"
negative_filtered['Label']="Negative"
positive_filtered.head()

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label
0,O43155,Homo sapiens,Metazoa,660,35,Positive
1,O43866,Homo sapiens,Metazoa,347,19,Positive
2,O75629,Homo sapiens,Metazoa,220,31,Positive
3,O94985,Homo sapiens,Metazoa,981,28,Positive
4,P01579,Homo sapiens,Metazoa,166,23,Positive


In [2]:
from sklearn.utils import shuffle

positive_filtered_random = shuffle(positive_filtered, random_state=42).reset_index(drop=True)
negative_filtered_random = shuffle(negative_filtered, random_state=42).reset_index(drop=True)


In [3]:
from sklearn.model_selection import train_test_split

X_positive_train, X_positive_test= train_test_split(positive_filtered_random, test_size=0.2, random_state=42)
X_negative_train, X_negative_test= train_test_split(negative_filtered_random, test_size=0.2, random_state=42)


In [4]:
X_positive_train = X_positive_train.reset_index(drop=True)
X_positive_test = X_positive_test.reset_index(drop=True)
X_negative_train = X_negative_train.reset_index(drop=True)
X_negative_test = X_negative_test.reset_index(drop=True)

In [5]:
X_train = pd.concat([X_positive_train , X_negative_train], axis=0)
X_test = pd.concat([X_positive_test, X_negative_test], axis=0)

In [6]:
X_test

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label,N-term transmembrane
0,P51654,Homo sapiens,Metazoa,580,24.0,Positive,
1,P10562,Canavalia gladiata,Viridiplantae,445,26.0,Positive,
2,O35806,Rattus norvegicus,Metazoa,1764,35.0,Positive,
3,P01189,Homo sapiens,Metazoa,267,26.0,Positive,
4,Q9NR16,Homo sapiens,Metazoa,1453,40.0,Positive,
...,...,...,...,...,...,...,...
1782,Q8VY52,Arabidopsis thaliana,Viridiplantae,232,,Negative,False
1783,P22798,Spinacia oleracea,Viridiplantae,271,,Negative,False
1784,Q24143,Drosophila melanogaster,Metazoa,723,,Negative,False
1785,Q99417,Homo sapiens,Metazoa,103,,Negative,False


In [7]:
!pip install biopython



In [8]:
from Bio import SeqIO
import subprocess
import shlex

In [9]:
neg_sequences = SeqIO.parse("neg_cluster/negative_cluster_rep_seq.fasta", "fasta")
pos_sequences = SeqIO.parse("positive_cluster/positive_cluster_rep_seq.fasta", "fasta")

all_sequences = list(neg_sequences) + list(pos_sequences)

with open("repressive_dataset.fasta", "w") as output_handle:
    SeqIO.write(all_sequences, output_handle, "fasta")


X_test['Set']="Benchmark"
X_train['Set']="Train"
rep_dataset='repressive_dataset.fasta'
ids=X_test['EntryID'].tolist()


with open(rep_dataset , 'r') as file:
    for protein in SeqIO.parse(file , 'fasta'):
        if str(protein.id) in ids:
            X_test.loc[X_test['EntryID'] == str(protein.id), 'Sequence'] = str(protein.seq)
        else:
            pass


In [10]:
X_test

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label,N-term transmembrane,Set,Sequence
0,P51654,Homo sapiens,Metazoa,580,24.0,Positive,,Benchmark,MAGTVRTACLVVAMLLSLDFPGQAQPPPPPPDATCHQVRSFFQRLQ...
1,P10562,Canavalia gladiata,Viridiplantae,445,26.0,Positive,,Benchmark,MAFSARFPLWLLLGVVLLASVSASFAHSGHSGGEAEDESEESRAQN...
2,O35806,Rattus norvegicus,Metazoa,1764,35.0,Positive,,Benchmark,MRAPTTVRCSGRIQRARWRGFLPLVLALLMGTSHAQRDSVGRYEPA...
3,P01189,Homo sapiens,Metazoa,267,26.0,Positive,,Benchmark,MPRSCCSRSGALLLALLLQASMEVRGWCLESSQCQDLTTESNLLEC...
4,Q9NR16,Homo sapiens,Metazoa,1453,40.0,Positive,,Benchmark,MMLPQNSWHIDFGRCCCHQNLFSAVVTCILLLNSCFLISSFNGTDL...
...,...,...,...,...,...,...,...,...,...
1782,Q8VY52,Arabidopsis thaliana,Viridiplantae,232,,Negative,False,Benchmark,MWSQSFLGSAPKLCLFSSSLPPFSHHKIHKFFCFAQNPSSTVSINL...
1783,P22798,Spinacia oleracea,Viridiplantae,271,,Negative,False,Benchmark,MASLLSLSSTPPSTANSNNYPSSTFKGNINNFRINPFNFAPLKLHL...
1784,Q24143,Drosophila melanogaster,Metazoa,723,,Negative,False,Benchmark,MSPPKNCAVCGDKALGYNFNAVTCESCKAFFRRNALAKKQFTCPFN...
1785,Q99417,Homo sapiens,Metazoa,103,,Negative,False,Benchmark,MAHYKAADSKREQFRRYLEKSGVLDTLTKVLVALYEEPEKPNSALD...


In [11]:
ids=X_train['EntryID'].tolist()


with open(rep_dataset , 'r') as file:
    for protein in SeqIO.parse(file , 'fasta'):
        if str(protein.id) in ids:
            X_train.loc[X_train['EntryID'] == str(protein.id), 'Sequence'] = str(protein.seq)
        else:
            pass

In [12]:
X_train

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label,N-term transmembrane,Set,Sequence
0,Q16552,Homo sapiens,Metazoa,155,23.0,Positive,,Train,MTPGKTSLVSLLLLLSLEAIVKAGITIPRNPGCPNSEDKNFPRTVM...
1,Q9M373,Arabidopsis thaliana,Viridiplantae,74,26.0,Positive,,Train,MASRNSVAVIALFAFVFAVISPFAGAQSLAPAPSPTSDGTSIDQGI...
2,P62520,Chilobrachys guangxiensis,Metazoa,63,27.0,Positive,,Train,MKNTSILFILGLALLLVLAFEAQVGESDGECGGFWWKCGRGKPPCC...
3,Q8I7X1,Porcellio scaber,Metazoa,145,21.0,Positive,,Train,MKGLLFIVSLLCLTLHQRVWAYQVIGMKSDVICADIRFTVHCICNE...
4,P14625,Homo sapiens,Metazoa,803,21.0,Positive,,Train,MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGSRTDD...
...,...,...,...,...,...,...,...,...,...
7142,B7WN96,Caenorhabditis elegans,Metazoa,317,,Negative,False,Train,METANYYLPSPPYSSTSSSDSRESRMNTPIPTTYSEENVNSLFHLM...
7143,P42838,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,414,,Negative,True,Train,MVNFDLGQVGEVFRRKDKGAIVSGDNPEEEEDVDASEFEEDEVKPV...
7144,Q96MF7,Homo sapiens,Metazoa,247,,Negative,False,Train,MPGRSSSNSGSTGFISFSGVESALSSLKNFQACINSGMDTASSVAL...
7145,Q9UJS0,Homo sapiens,Metazoa,675,,Negative,False,Train,MAAAKVALTKRADPAELRTIFLKYASIEKNGEFFMSPNDFVTRYLN...


In [13]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X_train["fold"] = -1

X_train = X_train.reset_index(drop=True)

for fold, (_, val_idx) in enumerate(skf.split(X_train, X_train["Label"])):
    X_train.loc[val_idx, "fold"] = fold


In [14]:
X_test

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label,N-term transmembrane,Set,Sequence
0,P51654,Homo sapiens,Metazoa,580,24.0,Positive,,Benchmark,MAGTVRTACLVVAMLLSLDFPGQAQPPPPPPDATCHQVRSFFQRLQ...
1,P10562,Canavalia gladiata,Viridiplantae,445,26.0,Positive,,Benchmark,MAFSARFPLWLLLGVVLLASVSASFAHSGHSGGEAEDESEESRAQN...
2,O35806,Rattus norvegicus,Metazoa,1764,35.0,Positive,,Benchmark,MRAPTTVRCSGRIQRARWRGFLPLVLALLMGTSHAQRDSVGRYEPA...
3,P01189,Homo sapiens,Metazoa,267,26.0,Positive,,Benchmark,MPRSCCSRSGALLLALLLQASMEVRGWCLESSQCQDLTTESNLLEC...
4,Q9NR16,Homo sapiens,Metazoa,1453,40.0,Positive,,Benchmark,MMLPQNSWHIDFGRCCCHQNLFSAVVTCILLLNSCFLISSFNGTDL...
...,...,...,...,...,...,...,...,...,...
1782,Q8VY52,Arabidopsis thaliana,Viridiplantae,232,,Negative,False,Benchmark,MWSQSFLGSAPKLCLFSSSLPPFSHHKIHKFFCFAQNPSSTVSINL...
1783,P22798,Spinacia oleracea,Viridiplantae,271,,Negative,False,Benchmark,MASLLSLSSTPPSTANSNNYPSSTFKGNINNFRINPFNFAPLKLHL...
1784,Q24143,Drosophila melanogaster,Metazoa,723,,Negative,False,Benchmark,MSPPKNCAVCGDKALGYNFNAVTCESCKAFFRRNALAKKQFTCPFN...
1785,Q99417,Homo sapiens,Metazoa,103,,Negative,False,Benchmark,MAHYKAADSKREQFRRYLEKSGVLDTLTKVLVALYEEPEKPNSALD...


In [15]:
X_train

Unnamed: 0,Accession,Organism,Kingdom,Sequence length,SP cleavage,Label,N-term transmembrane,Set,Sequence,fold
0,Q16552,Homo sapiens,Metazoa,155,23.0,Positive,,Train,MTPGKTSLVSLLLLLSLEAIVKAGITIPRNPGCPNSEDKNFPRTVM...,4
1,Q9M373,Arabidopsis thaliana,Viridiplantae,74,26.0,Positive,,Train,MASRNSVAVIALFAFVFAVISPFAGAQSLAPAPSPTSDGTSIDQGI...,1
2,P62520,Chilobrachys guangxiensis,Metazoa,63,27.0,Positive,,Train,MKNTSILFILGLALLLVLAFEAQVGESDGECGGFWWKCGRGKPPCC...,2
3,Q8I7X1,Porcellio scaber,Metazoa,145,21.0,Positive,,Train,MKGLLFIVSLLCLTLHQRVWAYQVIGMKSDVICADIRFTVHCICNE...,4
4,P14625,Homo sapiens,Metazoa,803,21.0,Positive,,Train,MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGSRTDD...,2
...,...,...,...,...,...,...,...,...,...,...
8016,B7WN96,Caenorhabditis elegans,Metazoa,317,,Negative,False,Train,METANYYLPSPPYSSTSSSDSRESRMNTPIPTTYSEENVNSLFHLM...,4
8017,P42838,Saccharomyces cerevisiae (strain ATCC 204508 /...,Fungi,414,,Negative,True,Train,MVNFDLGQVGEVFRRKDKGAIVSGDNPEEEEDVDASEFEEDEVKPV...,1
8018,Q96MF7,Homo sapiens,Metazoa,247,,Negative,False,Train,MPGRSSSNSGSTGFISFSGVESALSSLKNFQACINSGMDTASSVAL...,0
8019,Q9UJS0,Homo sapiens,Metazoa,675,,Negative,False,Train,MAAAKVALTKRADPAELRTIFLKYASIEKNGEFFMSPNDFVTRYLN...,1


In [16]:
data = pd.concat([X_train , X_test] , ignore_index=True)
data.to_csv("data.tsv",sep="\t", index=False)

In [17]:
print("Training Positives",len(X_train.loc[X_train['Label'] == "Positive"]))
print("Training Negatives",len(X_train.loc[X_train['Label'] == "Negative"]))
print("Benchmark Positives",len(X_test.loc[X_test['Label'] == "Positive"]))
print("Benchmark Positives",len(X_test.loc[X_test['Label'] == "Negative"]))

Training Positives 874
Training Negatives 7147
Benchmark Positives 219
Benchmark Positives 1787
