In [16]:
# OBJETIVO: Predicción de las lipoproteínas presentes en el genoma de Mycobacterium tuberculosis a partir de la secuencia genómica
# de la cepa H37Rv utilizando la API de SignalP 6 y, posteriormente, comparación con las lipoproteínas ya identificadas bajo el mismo
# criterio en Mycobrowser, una base de datos de referencia para micobacterias

In [2]:
!pip3 install pandas
!pip3 install pyrodigal
!pip3 install requests
!pip3 install biopython



In [110]:
import matplotlib.pyplot as plt
import pandas as pd
import pyrodigal
import requests
import subprocess
from Bio import SeqIO
from io                 import StringIO
from requests.adapters  import HTTPAdapter, Retry
from typing import TextIO, Union
from pathlib import Path
from time import sleep
from Bio import Entrez
import credenciales




In [None]:
#1. Obtención de una secuencia genómica

In [5]:
# Accession of the Mycobacterium tuberculosis genome to analyse
genoma= "NC_000962.3"

In [9]:
#1.1 Vamos a descargar un genoma usando la biblioteca de NCBI - Entrez

def fetch_sequences(acc: str, retries: int = 3, wait_seconds: float = 60) -> TextIO:
    """Request one nucleotide record from NCBI Entrez in FASTA format.

    Args:
        acc: Accession passed to ``Entrez.efetch`` as ``id``.
        retries: Maximum number of attempts before giving up (at least one
            attempt is always made).
        wait_seconds: Pause between a failed attempt and the next one.

    Returns:
        The open handle returned by ``Entrez.efetch``; the caller is
        responsible for reading and closing it.

    Raises:
        urllib.error.HTTPError or requests.HTTPError: re-raised when the last
            attempt fails, instead of silently returning ``None``.
    """
    # Entrez talks to NCBI through urllib, so that is the HTTPError it raises.
    # requests.HTTPError is still accepted so the original handler keeps working.
    from urllib.error import HTTPError as UrllibHTTPError

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            return Entrez.efetch(db="nucleotide", id=acc, rettype="fasta")
        except (UrllibHTTPError, requests.HTTPError) as err:
            # urllib errors expose `.code`; requests errors expose `.response`.
            status = getattr(err, "code", None)
            response = getattr(err, "response", None)
            if status is None and response is not None:
                status = response.status_code
            print(f"Entrez request for {acc} failed (attempt {attempt}/{attempts}): {status} {err}")
            if attempt == attempts:
                raise
            sleep(wait_seconds)





def get_fasta_sequences(acc_id: Union[str, list, tuple]) -> None:
    """Download one or several accessions and save each as ``<accession>.fasta``.

    Files are written to the current working directory.

    Args:
        acc_id: A single accession string, or a list/tuple of accession strings.

    Returns:
        None. The result is the file(s) written to disk.

    Raises:
        RuntimeError: if ``fetch_sequences`` yields no handle for an accession.
    """
    def save_fasta(x: str) -> None:
        """Fetch accession ``x`` and write its FASTA text to ``<cwd>/<x>.fasta``."""
        handle = fetch_sequences(x)
        if handle is None:
            raise RuntimeError(f"No data received from Entrez for accession {x!r}")

        # Read first and always close the network handle, so a failed download
        # neither leaks the connection nor leaves an empty FASTA file behind.
        try:
            fasta_text = handle.read()
        finally:
            handle.close()

        fasta_file = f"{Path.cwd() / x}.fasta"
        with open(fasta_file, "w") as f:
            f.write(fasta_text)

    accessions = acc_id if isinstance(acc_id, (list, tuple)) else [acc_id]
    for accession in accessions:
        save_fasta(accession)

In [10]:
# Download the genome and save it as <genoma>.fasta in the working directory.
# get_fasta_sequences is declared "-> None" and has no return statement, so
# genoma_tuberculosis is None; the sequence itself is read back from the file.
genoma_tuberculosis = get_fasta_sequences(genoma)

In [43]:
#2. Predicción de genes usando pyrodigal


In [11]:
# Column names, in file order, used when a GFF file is loaded into pandas.
gff3_header = "seqid source type start end score strand phase attributes".split()

In [12]:
# Train pyrodigal on the downloaded genome, predict the genes of every FASTA
# record, write them to <accession>.gff and load that GFF into `genes`.
tb_prefix = str(Path.cwd() / genoma)
input_file = SeqIO.read(f"{tb_prefix}.fasta", "fasta")
orf_finder = pyrodigal.OrfFinder()
orf_finder.train(bytes(input_file.seq))

with open(f"{tb_prefix}.gff", "w") as dst:
    fasta_parser = SeqIO.parse(f"{tb_prefix}.fasta", "fasta")
    for i, record in enumerate(fasta_parser):
        predicted_genes = orf_finder.find_genes(bytes(record.seq))
        # Emit the GFF header only once, before the first record.
        predicted_genes.write_gff(dst, sequence_id=record.id, header=(i == 0))

# Read the GFF back only after the `with` block has closed (and flushed) the
# file: reading it while it is still open for writing can miss buffered lines.
# A plain assignment replaces the former globals()["genes"] indirection.
genes = pd.read_table(f"{tb_prefix}.gff", names=gff3_header, comment="#")

In [13]:
# Preview the first rows of the predicted-gene table
genes.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,NC_000962.3,pyrodigal_v2.1.0,CDS,1,1524,160.1,+,0,ID=NC_000962.3_1;partial=10;start_type=Edge;rb...
1,NC_000962.3,pyrodigal_v2.1.0,CDS,2052,3260,83.9,+,0,ID=NC_000962.3_2;partial=00;start_type=ATG;rbs...
2,NC_000962.3,pyrodigal_v2.1.0,CDS,3280,4437,45.6,+,0,ID=NC_000962.3_3;partial=00;start_type=GTG;rbs...
3,NC_000962.3,pyrodigal_v2.1.0,CDS,4482,4997,13.8,+,0,ID=NC_000962.3_4;partial=00;start_type=ATG;rbs...
4,NC_000962.3,pyrodigal_v2.1.0,CDS,5123,7267,312.9,+,0,ID=NC_000962.3_5;partial=00;start_type=ATG;rbs...


In [14]:
#3. Obtain the protein sequences by translating the predicted genes
aa_file = genoma + ".faa"
tb_prefix = str(Path.cwd() / genoma)
prefix = genoma

# The protein FASTA is opened once, so the translations of every record are
# kept instead of each record overwriting the previous one.
# The .gff file is deliberately NOT opened here: opening it with mode "w"
# without writing to it truncates the gene predictions to an empty file.
with open(aa_file, "w") as orf_gene:
    fasta_parser = SeqIO.parse(f"{tb_prefix}.fasta", "fasta")
    for record in fasta_parser:
        # NOTE: this rebinds `genes` to the pyrodigal result of the current
        # record, replacing whatever object held that name before this cell.
        genes = orf_finder.find_genes(bytes(record.seq))
        genes.write_translations(orf_gene, sequence_id=prefix)

In [1]:
#4. librerias para conexión con servidor SignalP_6
!pip3 install -U pybiolib
import biolib
signalp_6 = biolib.load('DTU/SignalP_6')
biolib.login()
# token b4jwHu3lRao15Nafh6u5tGVzIG3l3u79j6ULZU2HEhs
biolib.BIOLIB_TOKEN = 'UM6ImC8v6xsMyg41faxfTAuTFfXjymKyam0aPF4KTIg'



2023-03-14 11:26:30,187 | INFO : Loaded project DTU/SignalP-6:0.0.56
2023-03-14 11:26:30,188 | INFO : Already signed in


In [2]:
%%bash
# Grant permissions so the results can be downloaded into the project folder.
# The %%bash cell magic has to be the very first line of the cell; with a
# comment above it the cell is not handed to bash.
. ~/.bashrc
chmod -R 775 /home/mebigi/WBDSLA_Camp


In [17]:
# Check the connection by requesting the help text of the SignalP 6 app
print(signalp_6.cli(args='--help'))

NameError: name 'signalp_6' is not defined

In [191]:
#6. Preparación de los archivos
# A continuación vamos a dividir el archivo .faa de referencia en archivos con menos de 500 genes cada uno para
# luego enviarlos a SignalP 6 y hacer la predicción de las secuencias señal, a fin de evitar saturar el servidor
#Comandos usados en terminal bash
# $csplit -f tb NC_000962.3.faa '/^>/' '{*}'
# luego armado de aprox 400 genes por archivo

#!/bin/bash 
# s=1
# x=482
# t=4082
# f=1
# while [ $x -le $t ]
# do
# while [ $s -le $x ]
# do cat tb$s >> fa_"$f".txt
# s=$(( $s + 1 ))
# done
# x=$(( $x + 400 ))
# f=$(( $f + 1 ))
# done

#luego los reagrupamos en subgrupos de 400 secuencias, ya que el envío de más secuencias a SignalP 6 está limitado

In [90]:
# Send each batch FASTA file to the SignalP 6 server. The results for the 4000+
# genes are stored in separate sub-folders results_files/1 ... results_files/10.
N_BATCHES = 10  # batches fa_1 ... fa_10; the later merge step reads folders 1 to 10
INPUT_DIR = '/home/mebigi/WBDSLA_Camp/imput_signalp6'

# range(1, 10, 1) stopped at batch 9, so the tenth batch was never submitted.
for n in range(1, N_BATCHES + 1):
    job = signalp_6.cli(args=f'--fastafile {INPUT_DIR}/fa_{n} --output_dir output --organism other --format txt --mode fast')

    status = job.get_status()
    if status == 'completed':
        job.save_files(f'results_files/{n}')
    else:
        # Report failures instead of skipping them silently: a missing folder
        # would leave a gap in the merged predictions.
        print(f'Batch {n} was not saved: job status is {status!r}')

2023-03-13 10:09:13,671 | INFO : Job "46122c72-ff75-431b-857c-99034736c7ef" is starting...
2023-03-13 10:09:23,079 | INFO : Cloud: Initializing
2023-03-13 10:09:23,080 | INFO : Cloud: Downloading Source Files...
2023-03-13 10:09:23,081 | INFO : Cloud: Pulling images...
2023-03-13 10:09:23,081 | INFO : Cloud: Computing...
Predicting: 100% 400/400 [23:18<00:00,  3.50s/sequences]
Writing files: 100% 400/400 [00:00<00:00, 407.04it/s]
2023-03-13 10:32:49,086 | INFO : Cloud: Computation finished
2023-03-13 10:32:49,087 | INFO : Cloud: Result Ready
2023-03-13 10:33:08,894 | INFO : Saving 406 files to results_files/6...
2023-03-13 10:33:13,149 | INFO :   - results_files/6/output.gff3
2023-03-13 10:34:21,722 | INFO :   - results_files/6/output.json
2023-03-13 10:34:24,038 | INFO :   - results_files/6/output_NC_000962_3_2083___2233723___2233980___1___ID_4_2083_partial_00_start_type_ATG_rbs_motif_AGGA_GGAG_GAGG_rbs_spacer_11_12bp_gc_cont_0_663_plot.txt
2023-03-13 10:34:25,028 | INFO :   - results

2023-03-13 10:35:58,483 | INFO :   - results_files/6/output_NC_000962_3_2121___2270750___2271709____1___ID_4_2121_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_664_plot.txt
2023-03-13 10:36:01,931 | INFO :   - results_files/6/output_NC_000962_3_2122___2271863___2272747____1___ID_4_2122_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_628_plot.txt
2023-03-13 10:36:05,619 | INFO :   - results_files/6/output_NC_000962_3_2123___2272787___2274508____1___ID_4_2123_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_653_plot.txt
2023-03-13 10:36:08,093 | INFO :   - results_files/6/output_NC_000962_3_2124___2274569___2275408____1___ID_4_2124_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_680_plot.txt
2023-03-13 10:36:09,228 | INFO :   - results_files/6/output_NC_000962_3_2125___2275405___2276424____1___ID_4_2125_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_649_plot.txt
20

2023-03-13 10:38:05,549 | INFO :   - results_files/6/output_NC_000962_3_2163___2325886___2326809____1___ID_4_2163_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_661_plot.txt
2023-03-13 10:38:10,240 | INFO :   - results_files/6/output_NC_000962_3_2164___2326938___2327501___1___ID_4_2164_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_677_plot.txt
2023-03-13 10:38:16,486 | INFO :   - results_files/6/output_NC_000962_3_2165___2327491___2328225____1___ID_4_2165_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_665_plot.txt
2023-03-13 10:38:20,011 | INFO :   - results_files/6/output_NC_000962_3_2166___2328222___2328977____1___ID_4_2166_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_675_plot.txt
2023-03-13 10:38:24,505 | INFO :   - results_files/6/output_NC_000962_3_2167___2328974___2330146____1___ID_4_2167_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_683_plot.txt
2023-03-13 10:38:25,664 

2023-03-13 10:40:04,185 | INFO :   - results_files/6/output_NC_000962_3_2205___2368983___2369693____1___ID_4_2205_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_640_plot.txt
2023-03-13 10:40:06,440 | INFO :   - results_files/6/output_NC_000962_3_2206___2369726___2370601____1___ID_4_2206_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_636_plot.txt
2023-03-13 10:40:07,493 | INFO :   - results_files/6/output_NC_000962_3_2207___2370598___2370792____1___ID_4_2207_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_656_plot.txt
2023-03-13 10:40:08,666 | INFO :   - results_files/6/output_NC_000962_3_2208___2370905___2372569____1___ID_4_2208_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_673_plot.txt
2023-03-13 10:40:12,128 | INFO :   - results_files/6/output_NC_000962_3_2209___2372630___2373823___1___ID_4_2209_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_670_plot.txt
2

2023-03-13 10:41:35,479 | INFO :   - results_files/6/output_NC_000962_3_2247___2406840___2407616____1___ID_4_2247_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_636_plot.txt
2023-03-13 10:41:38,987 | INFO :   - results_files/6/output_NC_000962_3_2248___2407622___2408425____1___ID_4_2248_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_684_plot.txt
2023-03-13 10:41:40,047 | INFO :   - results_files/6/output_NC_000962_3_2249___2408385___2409524____1___ID_4_2249_partial_00_start_type_ATG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_663_plot.txt
2023-03-13 10:41:45,076 | INFO :   - results_files/6/output_NC_000962_3_2250___2409697___2410641____1___ID_4_2250_partial_00_start_type_ATG_rbs_motif_GGxGG_rbs_spacer_5_10bp_gc_cont_0_684_plot.txt
2023-03-13 10:41:48,839 | INFO :   - results_files/6/output_NC_000962_3_2251___2410638___2412122____1___ID_4_2251_partial_00_start_type_GTG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_676_plot.txt
202

2023-03-13 10:43:19,649 | INFO :   - results_files/6/output_NC_000962_3_2289___2452115___2453296____1___ID_4_2289_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_660_plot.txt
2023-03-13 10:43:20,773 | INFO :   - results_files/6/output_NC_000962_3_2290___2453458___2453736____1___ID_4_2290_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_681_plot.txt
2023-03-13 10:43:21,837 | INFO :   - results_files/6/output_NC_000962_3_2291___2453813___2455756___1___ID_4_2291_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_696_plot.txt
2023-03-13 10:43:22,824 | INFO :   - results_files/6/output_NC_000962_3_2292___2455631___2456674____1___ID_4_2292_partial_00_start_type_GTG_rbs_motif_AGxAG_rbs_spacer_5_10bp_gc_cont_0_698_plot.txt
2023-03-13 10:43:23,871 | INFO :   - results_files/6/output_NC_000962_3_2293___2456970___2457512___1___ID_4_2293_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_584_plot.txt
2023-03-13 10

2023-03-13 10:44:20,433 | INFO :   - results_files/6/output_NC_000962_3_2331___2502735___2503472____1___ID_4_2331_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_656_plot.txt
2023-03-13 10:44:21,428 | INFO :   - results_files/6/output_NC_000962_3_2332___2503469___2504608____1___ID_4_2332_partial_00_start_type_ATG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_684_plot.txt
2023-03-13 10:44:22,427 | INFO :   - results_files/6/output_NC_000962_3_2333___2504605___2505699____1___ID_4_2333_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_674_plot.txt
2023-03-13 10:44:24,512 | INFO :   - results_files/6/output_NC_000962_3_2334___2506521___2507153___1___ID_4_2334_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_689_plot.txt
2023-03-13 10:44:25,576 | INFO :   - results_files/6/output_NC_000962_3_2335___2507146___2507637___1___ID_4_2335_partial_00_start_type_GTG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_661_plot.txt
2023-03-13 10:44:26

2023-03-13 10:45:08,859 | INFO :   - results_files/6/output_NC_000962_3_2373___2545332___2545631___1___ID_4_2373_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_600_plot.txt
2023-03-13 10:45:09,865 | INFO :   - results_files/6/output_NC_000962_3_2374___2545737___2546105___1___ID_4_2374_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_683_plot.txt
2023-03-13 10:45:11,826 | INFO :   - results_files/6/output_NC_000962_3_2375___2546102___2546431___1___ID_4_2375_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_667_plot.txt
2023-03-13 10:45:12,835 | INFO :   - results_files/6/output_NC_000962_3_2376___2546883___2547752___1___ID_4_2376_partial_00_start_type_ATG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_615_plot.txt
2023-03-13 10:45:13,933 | INFO :   - results_files/6/output_NC_000962_3_2377___2547749___2548939___1___ID_4_2377_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_622_plot.txt
2023-03-13

2023-03-13 10:45:57,040 | INFO :   - results_files/6/output_NC_000962_3_2415___2582477___2582644____1___ID_4_2415_partial_00_start_type_ATG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_631_plot.txt
2023-03-13 10:45:58,018 | INFO :   - results_files/6/output_NC_000962_3_2416___2583277___2583429___1___ID_4_2416_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_725_plot.txt
2023-03-13 10:46:00,185 | INFO :   - results_files/6/output_NC_000962_3_2417___2583438___2583779___1___ID_4_2417_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_702_plot.txt
2023-03-13 10:46:01,300 | INFO :   - results_files/6/output_NC_000962_3_2418___2583812___2584408___1___ID_4_2418_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_668_plot.txt
2023-03-13 10:46:02,298 | INFO :   - results_files/6/output_NC_000962_3_2419___2584486___2584755___1___ID_4_2419_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_607_plot.txt
2023-03-13

2023-03-13 10:46:44,806 | INFO :   - results_files/6/output_NC_000962_3_2457___2627172___2628725____1___ID_4_2457_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_618_plot.txt
2023-03-13 10:46:45,954 | INFO :   - results_files/6/output_NC_000962_3_2458___2628781___2630346____1___ID_4_2458_partial_00_start_type_GTG_rbs_motif_AGGAGG_rbs_spacer_3_4bp_gc_cont_0_629_plot.txt
2023-03-13 10:46:47,061 | INFO :   - results_files/6/output_NC_000962_3_2459___2630537___2632075____1___ID_4_2459_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_628_plot.txt
2023-03-13 10:46:48,064 | INFO :   - results_files/6/output_NC_000962_3_2460___2632619___2632792___1___ID_4_2460_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_655_plot.txt
2023-03-13 10:46:49,071 | INFO :   - results_files/6/output_NC_000962_3_2461___2632923___2634191____1___ID_4_2461_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_690_plot.txt
2023-03-13 1

2023-03-13 11:11:37,254 | INFO :   - results_files/7/output_NC_000962_3_2493___2675936___2677633___1___ID_4_2493_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_692_plot.txt
2023-03-13 11:11:38,370 | INFO :   - results_files/7/output_NC_000962_3_2494___2677729___2678649___1___ID_4_2494_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_662_plot.txt
2023-03-13 11:11:39,417 | INFO :   - results_files/7/output_NC_000962_3_2495___2678653___2680005____1___ID_4_2495_partial_00_start_type_GTG_rbs_motif_3Base_5BMM_rbs_spacer_13_15bp_gc_cont_0_642_plot.txt
2023-03-13 11:11:40,502 | INFO :   - results_files/7/output_NC_000962_3_2496___2680458___2680667___1___ID_4_2496_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_633_plot.txt
2023-03-13 11:11:41,617 | INFO :   - results_files/7/output_NC_000962_3_2497___2680765___2682018___1___ID_4_2497_partial_00_start_type_ATG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_617_plot.txt
2023-03-13 1

2023-03-13 11:12:26,655 | INFO :   - results_files/7/output_NC_000962_3_2535___2721866___2723308____1___ID_4_2535_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_665_plot.txt
2023-03-13 11:12:27,702 | INFO :   - results_files/7/output_NC_000962_3_2536___2723308___2724183____1___ID_4_2536_partial_00_start_type_GTG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_671_plot.txt
2023-03-13 11:12:28,787 | INFO :   - results_files/7/output_NC_000962_3_2537___2724230___2725477____1___ID_4_2537_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_677_plot.txt
2023-03-13 11:12:29,803 | INFO :   - results_files/7/output_NC_000962_3_2538___2725595___2725828____1___ID_4_2538_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_675_plot.txt
2023-03-13 11:12:30,797 | INFO :   - results_files/7/output_NC_000962_3_2539___2726193___2726780___1___ID_4_2539_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_3_4bp_gc_cont_0_607_plot.txt
2023-03-13 11:12:

2023-03-13 11:13:15,138 | INFO :   - results_files/7/output_NC_000962_3_2577___2772098___2772331____1___ID_4_2577_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_624_plot.txt
2023-03-13 11:13:16,268 | INFO :   - results_files/7/output_NC_000962_3_2578___2772367___2773035____1___ID_4_2578_partial_00_start_type_ATG_rbs_motif_GGAGG_rbs_spacer_3_4bp_gc_cont_0_682_plot.txt
2023-03-13 11:13:17,322 | INFO :   - results_files/7/output_NC_000962_3_2579___2773178___2773564___1___ID_4_2579_partial_00_start_type_ATG_rbs_motif_AGGAG_GGAGG_rbs_spacer_11_12bp_gc_cont_0_623_plot.txt
2023-03-13 11:13:19,062 | INFO :   - results_files/7/output_NC_000962_3_2580___2773642___2775204___1___ID_4_2580_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_672_plot.txt
2023-03-13 11:13:20,179 | INFO :   - results_files/7/output_NC_000962_3_2581___2775820___2776284___1___ID_4_2581_partial_00_start_type_GTG_rbs_motif_GGxGG_rbs_spacer_5_10bp_gc_cont_0_649_plot.txt
2023-03-13 

2023-03-13 11:14:01,960 | INFO :   - results_files/7/output_NC_000962_3_2619___2829954___2830139___1___ID_4_2619_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_538_plot.txt
2023-03-13 11:14:03,063 | INFO :   - results_files/7/output_NC_000962_3_2620___2830161___2830583___1___ID_4_2620_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_629_plot.txt
2023-03-13 11:14:04,096 | INFO :   - results_files/7/output_NC_000962_3_2621___2830877___2831338____1___ID_4_2621_partial_00_start_type_GTG_rbs_motif_4Base_6BMM_rbs_spacer_13_15bp_gc_cont_0_621_plot.txt
2023-03-13 11:14:05,194 | INFO :   - results_files/7/output_NC_000962_3_2622___2831344___2832573____1___ID_4_2622_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_659_plot.txt
2023-03-13 11:14:06,298 | INFO :   - results_files/7/output_NC_000962_3_2623___2832710___2833513____1___ID_4_2623_partial_00_start_type_GTG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_612_plot.txt
202

2023-03-13 11:14:46,647 | INFO :   - results_files/7/output_NC_000962_3_2661___2872012___2873265____1___ID_4_2661_partial_00_start_type_ATG_rbs_motif_AGxAGG_AGGxGG_rbs_spacer_5_10bp_gc_cont_0_652_plot.txt
2023-03-13 11:14:47,679 | INFO :   - results_files/7/output_NC_000962_3_2662___2873258___2873686____1___ID_4_2662_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_699_plot.txt
2023-03-13 11:14:48,783 | INFO :   - results_files/7/output_NC_000962_3_2663___2873771___2876485____1___ID_4_2663_partial_00_start_type_GTG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_657_plot.txt
2023-03-13 11:14:49,812 | INFO :   - results_files/7/output_NC_000962_3_2664___2876576___2876977____1___ID_4_2664_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_659_plot.txt
2023-03-13 11:14:50,904 | INFO :   - results_files/7/output_NC_000962_3_2665___2877072___2877746___1___ID_4_2665_partial_00_start_type_TTG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_650_plot.txt
2023-0

2023-03-13 11:15:41,464 | INFO :   - results_files/7/output_NC_000962_3_2703___2924817___2925383____1___ID_4_2703_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_665_plot.txt
2023-03-13 11:15:42,567 | INFO :   - results_files/7/output_NC_000962_3_2704___2925525___2925737___1___ID_4_2704_partial_00_start_type_GTG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_653_plot.txt
2023-03-13 11:15:45,285 | INFO :   - results_files/7/output_NC_000962_3_2705___2925734___2926138___1___ID_4_2705_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_647_plot.txt
2023-03-13 11:15:47,729 | INFO :   - results_files/7/output_NC_000962_3_2706___2926355___2926975___1___ID_4_2706_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_667_plot.txt
2023-03-13 11:15:50,220 | INFO :   - results_files/7/output_NC_000962_3_2707___2926986___2927480___1___ID_4_2707_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_699_plot.txt
2023-03-1

2023-03-13 11:16:33,728 | INFO :   - results_files/7/output_NC_000962_3_2745___2960105___2962441____1___ID_4_2745_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_744_plot.txt
2023-03-13 11:16:34,840 | INFO :   - results_files/7/output_NC_000962_3_2746___2962761___2963390___1___ID_4_2746_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_633_plot.txt
2023-03-13 11:16:35,929 | INFO :   - results_files/7/output_NC_000962_3_2747___2963586___2964242___1___ID_4_2747_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_676_plot.txt
2023-03-13 11:16:36,964 | INFO :   - results_files/7/output_NC_000962_3_2748___2964405___2964851___1___ID_4_2748_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_682_plot.txt
2023-03-13 11:16:38,065 | INFO :   - results_files/7/output_NC_000962_3_2749___2965026___2965358____1___ID_4_2749_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_664_plot.txt
2023-03-

2023-03-13 11:17:21,635 | INFO :   - results_files/7/output_NC_000962_3_2787___2995115___2995945___1___ID_4_2787_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_681_plot.txt
2023-03-13 11:17:22,721 | INFO :   - results_files/7/output_NC_000962_3_2788___2996003___2996737___1___ID_4_2788_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_694_plot.txt
2023-03-13 11:17:23,778 | INFO :   - results_files/7/output_NC_000962_3_2789___2996739___2998055___1___ID_4_2789_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_686_plot.txt
2023-03-13 11:17:24,785 | INFO :   - results_files/7/output_NC_000962_3_2790___2998052___2999968____1___ID_4_2790_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_672_plot.txt
2023-03-13 11:17:25,823 | INFO :   - results_files/7/output_NC_000962_3_2791___3000112___3000609___1___ID_4_2791_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_627_plot.txt
2023-03-13 11:17:26,810 | I

2023-03-13 11:18:09,664 | INFO :   - results_files/7/output_NC_000962_3_2829___3032520___3034619____1___ID_4_2829_partial_00_start_type_GTG_rbs_motif_GGAGG_rbs_spacer_3_4bp_gc_cont_0_653_plot.txt
2023-03-13 11:18:10,656 | INFO :   - results_files/7/output_NC_000962_3_2830___3034909___3036102___1___ID_4_2830_partial_00_start_type_ATG_rbs_motif_AGGA_GGAG_GAGG_rbs_spacer_11_12bp_gc_cont_0_599_plot.txt
2023-03-13 11:18:11,716 | INFO :   - results_files/7/output_NC_000962_3_2831___3036131___3037291____1___ID_4_2831_partial_00_start_type_ATG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_604_plot.txt
2023-03-13 11:18:12,820 | INFO :   - results_files/7/output_NC_000962_3_2832___3037427___3038914____1___ID_4_2832_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_675_plot.txt
2023-03-13 11:18:13,922 | INFO :   - results_files/7/output_NC_000962_3_2833___3038931___3039800____1___ID_4_2833_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_680_plot.txt
2

2023-03-13 11:19:08,505 | INFO :   - results_files/7/output_NC_000962_3_2871___3075588___3076370____1___ID_4_2871_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_664_plot.txt
2023-03-13 11:19:09,601 | INFO :   - results_files/7/output_NC_000962_3_2872___3076894___3078078____1___ID_4_2872_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_663_plot.txt
2023-03-13 11:19:10,620 | INFO :   - results_files/7/output_NC_000962_3_2873___3078158___3078985____1___ID_4_2873_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_687_plot.txt
2023-03-13 11:19:11,623 | INFO :   - results_files/7/output_NC_000962_3_2874___3079309___3080415____1___ID_4_2874_partial_00_start_type_ATG_rbs_motif_AGGAG_G__GGAGG_rbs_spacer_13_15bp_gc_cont_0_669_plot.txt
2023-03-13 11:19:12,644 | INFO :   - results_files/7/output_NC_000962_3_2875___3080581___3081033____1___ID_4_2875_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_63

2023-03-13 11:43:55,113 | INFO :   - results_files/8/output_NC_000962_3_2908___3110780___3111796____1___ID_4_2908_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_721_plot.txt
2023-03-13 11:43:56,093 | INFO :   - results_files/8/output_NC_000962_3_2909___3111822___3112289___1___ID_4_2909_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_660_plot.txt
2023-03-13 11:43:57,213 | INFO :   - results_files/8/output_NC_000962_3_2910___3112465___3112722____1___ID_4_2910_partial_00_start_type_GTG_rbs_motif_AGGA_GGAG_GAGG_rbs_spacer_11_12bp_gc_cont_0_725_plot.txt
2023-03-13 11:43:58,361 | INFO :   - results_files/8/output_NC_000962_3_2911___3112840___3113271___1___ID_4_2911_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_674_plot.txt
2023-03-13 11:43:59,376 | INFO :   - results_files/8/output_NC_000962_3_2912___3113268___3113459___1___ID_4_2912_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_615_plot.txt

2023-03-13 11:44:53,951 | INFO :   - results_files/8/output_NC_000962_3_2950___3150231___3150716___1___ID_4_2950_partial_00_start_type_GTG_rbs_motif_AGxAGG_AGGxGG_rbs_spacer_5_10bp_gc_cont_0_706_plot.txt
2023-03-13 11:44:55,144 | INFO :   - results_files/8/output_NC_000962_3_2951___3150713___3151201___1___ID_4_2951_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_691_plot.txt
2023-03-13 11:44:56,317 | INFO :   - results_files/8/output_NC_000962_3_2952___3151202___3153082____1___ID_4_2952_partial_00_start_type_ATG_rbs_motif_AGxAGG_AGGxGG_rbs_spacer_5_10bp_gc_cont_0_656_plot.txt
2023-03-13 11:44:57,455 | INFO :   - results_files/8/output_NC_000962_3_2953___3153039___3154631____1___ID_4_2953_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_649_plot.txt
2023-03-13 11:44:58,461 | INFO :   - results_files/8/output_NC_000962_3_2954___3154654___3155871____1___ID_4_2954_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_con

2023-03-13 11:45:41,188 | INFO :   - results_files/8/output_NC_000962_3_2992___3192373___3193158____1___ID_4_2992_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_3_4bp_gc_cont_0_649_plot.txt
2023-03-13 11:45:45,324 | INFO :   - results_files/8/output_NC_000962_3_2993___3193450___3194151___1___ID_4_2993_partial_00_start_type_TTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_635_plot.txt
2023-03-13 11:45:46,354 | INFO :   - results_files/8/output_NC_000962_3_2994___3194166___3195545____1___ID_4_2994_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_654_plot.txt
2023-03-13 11:45:47,473 | INFO :   - results_files/8/output_NC_000962_3_2995___3195545___3196132____1___ID_4_2995_partial_00_start_type_ATG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_658_plot.txt
2023-03-13 11:45:48,493 | INFO :   - results_files/8/output_NC_000962_3_2996___3196431___3196850___1___ID_4_2996_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_667_plot.txt
2023-03-13 11:

2023-03-13 11:46:39,134 | INFO :   - results_files/8/output_NC_000962_3_3034___3238086___3238499____1___ID_4_3034_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_679_plot.txt
2023-03-13 11:46:40,216 | INFO :   - results_files/8/output_NC_000962_3_3035___3238601___3239470____1___ID_4_3035_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_672_plot.txt
2023-03-13 11:46:42,422 | INFO :   - results_files/8/output_NC_000962_3_3036___3239829___3240551____1___ID_4_3036_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_642_plot.txt
2023-03-13 11:46:45,390 | INFO :   - results_files/8/output_NC_000962_3_3037___3240548___3241135____1___ID_4_3037_partial_00_start_type_ATG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_658_plot.txt
2023-03-13 11:46:46,487 | INFO :   - results_files/8/output_NC_000962_3_3038___3241222___3241959____1___ID_4_3038_partial_00_start_type_GTG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_638_plot.txt
2023-03-13 11:46:4

2023-03-13 11:47:30,836 | INFO :   - results_files/8/output_NC_000962_3_3076___3318901___3319467____1___ID_4_3076_partial_00_start_type_TTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_698_plot.txt
2023-03-13 11:47:31,873 | INFO :   - results_files/8/output_NC_000962_3_3077___3319663___3323046____1___ID_4_3077_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_652_plot.txt
2023-03-13 11:47:32,905 | INFO :   - results_files/8/output_NC_000962_3_3078___3323071___3323658____1___ID_4_3078_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_619_plot.txt
2023-03-13 11:47:34,005 | INFO :   - results_files/8/output_NC_000962_3_3079___3323709___3324476____1___ID_4_3079_partial_00_start_type_GTG_rbs_motif_AGGA_GGAG_GAGG_rbs_spacer_11_12bp_gc_cont_0_632_plot.txt
2023-03-13 11:47:35,025 | INFO :   - results_files/8/output_NC_000962_3_3080___3324573___3325703____1___ID_4_3080_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_655_plot.

2023-03-13 11:48:15,352 | INFO :   - results_files/8/output_NC_000962_3_3118___3363693___3364478____1___ID_4_3118_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_650_plot.txt
2023-03-13 11:48:17,477 | INFO :   - results_files/8/output_NC_000962_3_3119___3364556___3365830___1___ID_4_3119_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_656_plot.txt
2023-03-13 11:48:18,575 | INFO :   - results_files/8/output_NC_000962_3_3120___3365836___3366414____1___ID_4_3120_partial_00_start_type_GTG_rbs_motif_3Base_5BMM_rbs_spacer_13_15bp_gc_cont_0_629_plot.txt
2023-03-13 11:48:19,589 | INFO :   - results_files/8/output_NC_000962_3_3121___3366698___3367267___1___ID_4_3121_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_630_plot.txt
2023-03-13 11:48:20,709 | INFO :   - results_files/8/output_NC_000962_3_3122___3367264___3368793____1___ID_4_3122_partial_00_start_type_ATG_rbs_motif_AGGA_rbs_spacer_5_10bp_gc_cont_0_652_plot.txt
20

2023-03-13 11:49:19,809 | INFO :   - results_files/8/output_NC_000962_3_3160___3405232___3406215___1___ID_4_3160_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_679_plot.txt
2023-03-13 11:49:20,809 | INFO :   - results_files/8/output_NC_000962_3_3161___3406285___3407325___1___ID_4_3161_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_646_plot.txt
2023-03-13 11:49:21,872 | INFO :   - results_files/8/output_NC_000962_3_3162___3407314___3407688____1___ID_4_3162_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_683_plot.txt
2023-03-13 11:49:22,866 | INFO :   - results_files/8/output_NC_000962_3_3163___3408022___3408306____1___ID_4_3163_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_11_12bp_gc_cont_0_642_plot.txt
2023-03-13 11:49:23,854 | INFO :   - results_files/8/output_NC_000962_3_3164___3408404___3409378____1___ID_4_3164_partial_00_start_type_GTG_rbs_motif_None_rbs_spacer_None_gc_cont_0_601_plot.

2023-03-13 11:50:26,976 | INFO :   - results_files/8/output_NC_000962_3_3202___3451781___3452887___1___ID_4_3202_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_668_plot.txt
2023-03-13 11:50:28,057 | INFO :   - results_files/8/output_NC_000962_3_3203___3452925___3454343___1___ID_4_3203_partial_00_start_type_ATG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_700_plot.txt
2023-03-13 11:50:29,192 | INFO :   - results_files/8/output_NC_000962_3_3204___3454340___3455764___1___ID_4_3204_partial_00_start_type_ATG_rbs_motif_AGGAG_rbs_spacer_5_10bp_gc_cont_0_659_plot.txt
2023-03-13 11:50:30,279 | INFO :   - results_files/8/output_NC_000962_3_3205___3455761___3457272___1___ID_4_3205_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_647_plot.txt
2023-03-13 11:50:31,426 | INFO :   - results_files/8/output_NC_000962_3_3206___3457769___3458071____1___ID_4_3206_partial_00_start_type_GTG_rbs_motif_GGAGG_rbs_spacer_5_10bp_gc_cont_0_647_plot.txt
202

2023-03-13 11:51:19,829 | INFO :   - results_files/8/output_NC_000962_3_3244___3492147___3493181___1___ID_4_3244_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_630_plot.txt
2023-03-13 11:51:20,929 | INFO :   - results_files/8/output_NC_000962_3_3245___3493168___3493518____1___ID_4_3245_partial_00_start_type_GTG_rbs_motif_GGAG_GAGG_rbs_spacer_5_10bp_gc_cont_0_644_plot.txt
2023-03-13 11:51:21,959 | INFO :   - results_files/8/output_NC_000962_3_3246___3493600___3494133____1___ID_4_3246_partial_00_start_type_ATG_rbs_motif_None_rbs_spacer_None_gc_cont_0_597_plot.txt
2023-03-13 11:51:22,994 | INFO :   - results_files/8/output_NC_000962_3_3247___3494678___3494992___1___ID_4_3247_partial_00_start_type_GTG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_648_plot.txt
2023-03-13 11:51:24,093 | INFO :   - results_files/8/output_NC_000962_3_3248___3494975___3496366____1___ID_4_3248_partial_00_start_type_ATG_rbs_motif_GGA_GAG_AGG_rbs_spacer_5_10bp_gc_cont_0_623_plot.tx

In [None]:
# Finalmente se unieron los 10 archivos de salida "prediction_results" en uno solo 
# #!/bin/bash 
#  x=1
#  while [ $x -le 10 ]
#  do
#  tail -n +3 $x/prediction_results.txt >> prediction_results_todos.txt
#  x=$(( $x + 1 ))
# done

In [123]:
# 7. Read the merged file with all SignalP 6 signal-peptide predictions.
# The path is built from the working directory (as done for the FASTA files
# earlier in the notebook) instead of a user-specific absolute /home/... path.
result_signalp6_df = pd.read_csv(
    Path.cwd() / "results_files" / "prediction_results_todos.txt",
    sep="\t",
    header=None,  # no header row is read; the columns are named in a later cell
)

In [122]:
#result_ignalp6_df[result_ignalp6_df["SP(Sec/SPI)" != 0]]
result_signalp6_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_000962.3_3683 # 3972552 # 3973592 # -1 # ID...,OTHER,1.000061,0.000000,0.000000,0.0,0.0,0.0,
1,NC_000962.3_3684 # 3973589 # 3974500 # -1 # ID...,OTHER,1.000057,0.000000,0.000000,0.0,0.0,0.0,
2,NC_000962.3_3685 # 3974511 # 3975296 # -1 # ID...,OTHER,1.000055,0.000002,0.000000,0.0,0.0,0.0,
3,NC_000962.3_3686 # 3975369 # 3977060 # 1 # ID=...,OTHER,0.999897,0.000083,0.000049,0.0,0.0,0.0,
4,NC_000962.3_3687 # 3977062 # 3977922 # 1 # ID=...,OTHER,1.000042,0.000005,0.000000,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...
4077,NC_000962.3_3678 # 3965884 # 3967038 # -1 # ID...,OTHER,1.000015,0.000053,0.000001,0.0,0.0,0.0,
4078,NC_000962.3_3679 # 3967038 # 3967820 # -1 # ID...,OTHER,1.000057,0.000000,0.000000,0.0,0.0,0.0,
4079,NC_000962.3_3680 # 3967817 # 3968944 # -1 # ID...,OTHER,1.000049,0.000000,0.000000,0.0,0.0,0.0,
4080,NC_000962.3_3681 # 3969343 # 3970563 # 1 # ID=...,OTHER,1.000033,0.000009,0.000000,0.0,0.0,0.0,


In [119]:
#info para ver los registros si hay nulos
result_signalp6_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4082 entries, 0 to 4081
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4082 non-null   object 
 1   1       4082 non-null   object 
 2   2       4082 non-null   float64
 3   3       4082 non-null   float64
 4   4       4082 non-null   float64
 5   5       4082 non-null   float64
 6   6       4082 non-null   float64
 7   7       4082 non-null   float64
 8   8       226 non-null    object 
dtypes: float64(6), object(3)
memory usage: 287.1+ KB


In [124]:
# Label the nine positional columns of the prediction table: sequence ID,
# predicted class, the six per-class likelihood columns and the cleavage-site text.
result_signalp6_df.columns =['ID','Prediction','OTHER','SP(Sec/SPI)','LIPO(Sec/SPII)','TAT(Tat/SPI)','TATLIPO(Tat/SPII)','PILIN(Sec/SPIII)','CS Position']

In [125]:
result_signalp6_df


Unnamed: 0,ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Tat/SPII),PILIN(Sec/SPIII),CS Position
0,NC_000962.3_3683 # 3972552 # 3973592 # -1 # ID...,OTHER,1.000061,0.000000,0.000000,0.0,0.0,0.0,
1,NC_000962.3_3684 # 3973589 # 3974500 # -1 # ID...,OTHER,1.000057,0.000000,0.000000,0.0,0.0,0.0,
2,NC_000962.3_3685 # 3974511 # 3975296 # -1 # ID...,OTHER,1.000055,0.000002,0.000000,0.0,0.0,0.0,
3,NC_000962.3_3686 # 3975369 # 3977060 # 1 # ID=...,OTHER,0.999897,0.000083,0.000049,0.0,0.0,0.0,
4,NC_000962.3_3687 # 3977062 # 3977922 # 1 # ID=...,OTHER,1.000042,0.000005,0.000000,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...
4077,NC_000962.3_3678 # 3965884 # 3967038 # -1 # ID...,OTHER,1.000015,0.000053,0.000001,0.0,0.0,0.0,
4078,NC_000962.3_3679 # 3967038 # 3967820 # -1 # ID...,OTHER,1.000057,0.000000,0.000000,0.0,0.0,0.0,
4079,NC_000962.3_3680 # 3967817 # 3968944 # -1 # ID...,OTHER,1.000049,0.000000,0.000000,0.0,0.0,0.0,
4080,NC_000962.3_3681 # 3969343 # 3970563 # 1 # ID=...,OTHER,1.000033,0.000009,0.000000,0.0,0.0,0.0,


In [126]:
# Drop the mostly-empty "CS Position" column (226 non-null of 4082 rows).
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
result_signalp6_df = result_signalp6_df.drop(columns=["CS Position"], errors="ignore")

In [127]:
result_signalp6_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4082 entries, 0 to 4081
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 4082 non-null   object 
 1   Prediction         4082 non-null   object 
 2   OTHER              4082 non-null   float64
 3   SP(Sec/SPI)        4082 non-null   float64
 4   LIPO(Sec/SPII)     4082 non-null   float64
 5   TAT(Tat/SPI)       4082 non-null   float64
 6   TATLIPO(Tat/SPII)  4082 non-null   float64
 7   PILIN(Sec/SPIII)   4082 non-null   float64
dtypes: float64(6), object(2)
memory usage: 255.2+ KB


In [128]:
# Split the "ID" text on " # " into five new columns (values stay as strings).
# NOTE(review): the fourth field holds 1 / -1 in the displayed rows, which looks
# like a strand sign rather than a phase — confirm before renaming; later cells
# and saved outputs use the name 'phase'.
result_signalp6_df[['seqid','start','end','phase','ID_nro']] = result_signalp6_df["ID"].str.split(" # ", expand = True)

In [129]:

#8. Filtrado de los datos de interes donde predijo Lipoproteinas Likelihood > 0.4


# Build result_LIPO from the full SignalP table so the cell runs on a fresh
# kernel. The previous version filtered result_LIPO by itself while its only
# definition was commented out, which raises NameError after a restart.
# Both criteria are applied: predicted class LIPO/TATLIPO (the commented-out
# filter, presumably what produced the 94 rows shown below) and a lipoprotein
# likelihood above 0.4.
es_lipo = result_signalp6_df['Prediction'].isin(['LIPO', 'TATLIPO'])
supera_umbral = (result_signalp6_df['LIPO(Sec/SPII)'] > 0.4) | (result_signalp6_df['TATLIPO(Tat/SPII)'] > 0.4)
# .copy() yields an independent frame, so the columns added to result_LIPO in
# later cells are not written into a slice of result_signalp6_df.
result_LIPO = result_signalp6_df[es_lipo & supera_umbral].copy()

In [130]:
result_LIPO

Unnamed: 0,ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Tat/SPII),PILIN(Sec/SPIII),seqid,start,end,phase,ID_nro
54,NC_000962.3_144 # 159700 # 160782 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000006,0.000008,0.999995,0.000000,NC_000962.3_144,159700,160782,-1,ID=4_144;partial=00;start_type=ATG;rbs_motif=N...
101,NC_000962.3_187 # 204065 # 205237 # 1 # ID=4_1...,LIPO,0.001248,0.008024,0.990632,0.000076,0.000023,0.000009,NC_000962.3_187,204065,205237,1,ID=4_187;partial=00;start_type=ATG;rbs_motif=A...
178,NC_000962.3_256 # 287186 # 288352 # 1 # ID=4_2...,LIPO,0.000000,0.000000,1.000020,0.000000,0.000000,0.000000,NC_000962.3_256,287186,288352,1,ID=4_256;partial=00;start_type=ATG;rbs_motif=N...
208,NC_000962.3_283 # 316511 # 317503 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000216,0.000050,0.999723,0.000000,NC_000962.3_283,316511,317503,-1,ID=4_283;partial=00;start_type=GTG;rbs_motif=N...
258,NC_000962.3_328 # 377931 # 378587 # 1 # ID=4_3...,LIPO,0.456774,0.049797,0.487810,0.000470,0.000284,0.004865,NC_000962.3_328,377931,378587,1,ID=4_328;partial=00;start_type=ATG;rbs_motif=N...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,NC_000962.3_3810 # 4105459 # 4107084 # -1 # ID...,LIPO,0.000000,0.000046,1.000015,0.000000,0.000000,0.000000,NC_000962.3_3810,4105459,4107084,-1,ID=4_3810;partial=00;start_type=ATG;rbs_motif=...
3912,NC_000962.3_3913 # 4209047 # 4209526 # 1 # ID=...,LIPO,0.000000,0.000000,1.000043,0.000000,0.000000,0.000000,NC_000962.3_3913,4209047,4209526,1,ID=4_3913;partial=00;start_type=GTG;rbs_motif=...
3947,NC_000962.3_3948 # 4249878 # 4251005 # 1 # ID=...,TATLIPO,0.000192,0.002493,0.006592,0.041162,0.949528,0.000022,NC_000962.3_3948,4249878,4251005,1,ID=4_3948;partial=00;start_type=ATG;rbs_motif=...
3961,NC_000962.3_3962 # 4273739 # 4274593 # 1 # ID=...,LIPO,0.041122,0.015608,0.763581,0.024236,0.155091,0.000356,NC_000962.3_3962,4273739,4274593,1,ID=4_3962;partial=00;start_type=GTG;rbs_motif=...


In [131]:
len(result_LIPO)

94

In [132]:
#/home/mebigi/WBDSLA_Camp/NC_000962.3.faa

# Candidate sequence identifiers as a plain list, with surrounding whitespace
# trimmed from each one; the trimmed list is displayed as the cell output.
lista_candidatos = result_LIPO['seqid'].tolist()
lista_candidatos_sin_espacios = [candidato.strip() for candidato in lista_candidatos]
lista_candidatos_sin_espacios

['NC_000962.3_144',
 'NC_000962.3_187',
 'NC_000962.3_256',
 'NC_000962.3_283',
 'NC_000962.3_328',
 'NC_000962.3_363',
 'NC_000962.3_401',
 'NC_000962.3_420',
 'NC_000962.3_432',
 'NC_000962.3_439',
 'NC_000962.3_440',
 'NC_000962.3_453',
 'NC_000962.3_506',
 'NC_000962.3_553',
 'NC_000962.3_612',
 'NC_000962.3_623',
 'NC_000962.3_635',
 'NC_000962.3_706',
 'NC_000962.3_714',
 'NC_000962.3_875',
 'NC_000962.3_1006',
 'NC_000962.3_1044',
 'NC_000962.3_1054',
 'NC_000962.3_1060',
 'NC_000962.3_1066',
 'NC_000962.3_1113',
 'NC_000962.3_1223',
 'NC_000962.3_1238',
 'NC_000962.3_886',
 'NC_000962.3_973',
 'NC_000962.3_977',
 'NC_000962.3_979',
 'NC_000962.3_1287',
 'NC_000962.3_1294',
 'NC_000962.3_1311',
 'NC_000962.3_1329',
 'NC_000962.3_1333',
 'NC_000962.3_1334',
 'NC_000962.3_1339',
 'NC_000962.3_1413',
 'NC_000962.3_1433',
 'NC_000962.3_1475',
 'NC_000962.3_1599',
 'NC_000962.3_1613',
 'NC_000962.3_1751',
 'NC_000962.3_1764',
 'NC_000962.3_1782',
 'NC_000962.3_1938',
 'NC_000962.3_19

In [133]:
#9. a partir de la lista de candidatos obtengo la secuencia de proteinas y posteriormente lo guardo en un archivo

# A set gives constant-time membership tests while scanning the protein FASTA.
ids_candidatos = set(lista_candidatos_sin_espacios)

# One ">id\nsequence\n" entry per candidate, in the order of the protein file.
entradas_fasta = [
    ">" + record.id + "\n" + str(record.seq) + "\n"
    for record in SeqIO.parse('NC_000962.3.faa', 'fasta')
    if record.id in ids_candidatos
]

# Remove every "*" once, on the final text, instead of re-scanning the whole
# accumulated string after each appended record.
query_str = "".join(entradas_fasta).replace("*", "")

In [134]:
# Write the candidate FASTA text. The context manager closes (and therefore
# flushes) the file even if the write raises, so no handle is left open.
with open("Lipos_candidatas.txt", "w") as candidatos:
    candidatos.write(query_str)

32324

In [135]:
# Close the Lipos_candidatas.txt handle so the written text is flushed to disk
# before blastp reads the file as its query.
candidatos.close()

In [136]:
# 10. Alineamiento con los genes de referencia identificados previamente; en tuberculosis a cada gen se le asigna un número Rv además del nombre del gen

In [137]:
#Creación de una base de datos tipo BLAST


# Instalación de BLAST local


# wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz

# tar -xzf ncbi-blast-2.13.0+-x64-linux.tar.gz
# mv ncbi-blast-2.13.0+ blast
# rm ncbi-blast-2.13.0+-x64-linux.tar.gz

# Add a folder for blast databases

# cd blast
# mkdir db
# cd ..

# you become administrator (you enter your password):

# sudo su

# You place the executables in /usr/share to make them usable by everyone:

# mv blast /usr/share
# chmod a+w+r /usr/share/blast/db
# export PATH="$PATH:/usr/share/blast/bin"
# export BLASTDB="/usr/share/blast/db/"
# exit


In [138]:
# Exploratory cell: print the help text of Biopython's qblast function.
# NOTE(review): NCBIWWW is not used by the other cells visible here (the search
# below runs a local blastp through subprocess) — consider removing this cell
# or moving the import to the notebook's import cell.
from Bio.Blast import NCBIWWW
help(NCBIWWW.qblast)

Help on function qblast in module Bio.Blast.NCBIWWW:

qblast(program, database, sequence, url_base='https://blast.ncbi.nlm.nih.gov/Blast.cgi', auto_format=None, composition_based_statistics=None, db_genetic_code=None, endpoints=None, entrez_query='(none)', expect=10.0, filter=None, gapcosts=None, genetic_code=None, hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None, matrix_name=None, nucl_penalty=None, nucl_reward=None, other_advanced=None, perc_ident=None, phi_pattern=None, query_file=None, query_believe_defline=None, query_from=None, query_to=None, searchsp_eff=None, service=None, threshold=None, ungapped_alignment=None, word_size=None, short_query=None, alignments=500, alignment_view=None, descriptions=500, entrez_links_new_window=None, expect_low=None, expect_high=None, format_entrez_query=None, format_object=None, format_type='XML', ncbi_gi=None, results_file=None, show_overview=None, megablast=None, template_type=None, template_length=None, username='blast', password=No

In [139]:
# Reference database: Mycobacterium_tuberculosis_H37Rv_proteins_v4.fasta,
# downloaded from https://mycobrowser.epfl.ch/releases#fasta_proteins4
blast_db_fasta = "Mycobacterium_tuberculosis_H37Rv_proteins_v4.fasta"

# Build the protein BLAST database from the reference FASTA.
makeblastdb_path = "/usr/share/blast/bin/makeblastdb"
makeblastdb_command = [makeblastdb_path, '-in', blast_db_fasta, '-dbtype', 'prot']
# check_call raises CalledProcessError on a non-zero exit status, so blastp is
# never run against a database that failed to build.
subprocess.check_call(makeblastdb_command)

# Search the candidate proteins against the reference database; tabular output
# (format 6) with the fields listed in blast_out_format.
blastp_path      = "/usr/share/blast/bin/blastp"
blast_out_format = "6 qseqid sseqid qlen slen qstart sstart qend send score evalue length positive"
blast_out_file   = "candidatas.blast.tsv"
blastp_command   = [blastp_path,
                    "-db",          blast_db_fasta,
                    "-query",       "Lipos_candidatas.txt",
                    "-evalue",      "1e-9",
                    "-out",         blast_out_file,
                    "-outfmt",      blast_out_format,
                    "-num_threads", "12"]
# Last expression of the cell: check_call returns 0 on success (shown as the
# cell output) and raises if blastp exits with an error.
subprocess.check_call(blastp_command)



Building a new DB, current time: 03/17/2023 19:15:04
New DB name:   /home/mebigi/WBDSLA_Camp/Mycobacterium_tuberculosis_H37Rv_proteins_v4.fasta
New DB title:  Mycobacterium_tuberculosis_H37Rv_proteins_v4.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /home/mebigi/WBDSLA_Camp/Mycobacterium_tuberculosis_H37Rv_proteins_v4.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 4031 sequences in 0.058928 seconds.




FASTA-Reader: Ignoring invalid residues at position(s): On line 18355: 59
FASTA-Reader: Ignoring invalid residues at position(s): On line 24436: 14
FASTA-Reader: Ignoring invalid residues at position(s): On line 28172: 7


0

In [192]:
# Column names are the fields of the blastp "-outfmt" string without its leading
# format code, so the header always matches the columns blastp was asked for.
blastp_column_names = blast_out_format.split(" ")[1:]
# Read the file blastp was told to write (blast_out_file) rather than repeating
# the file name as a second literal that could drift out of sync.
blastp_lipo = pd.read_csv(blast_out_file, sep="\t", names=blastp_column_names)

In [193]:
blastp_lipo

Unnamed: 0,qseqid,sseqid,qlen,slen,qstart,sstart,qend,send,score,evalue,length,positive
0,NC_000962.3_144,Rv0132c|fgd2|Putative,360,360,1,1,360,360,1883,0.000000e+00,360,360
1,NC_000962.3_144,Rv0407|fgd1|F420-dependent,360,336,40,2,360,333,501,2.180000e-61,332,176
2,NC_000962.3_144,Rv2951c|Rv2951c|Possible,360,381,92,74,272,257,181,1.110000e-15,190,89
3,NC_000962.3_144,Rv0953c|Rv0953c|Possible,360,282,45,4,254,227,170,1.320000e-14,231,99
4,NC_000962.3_144,Rv2893|Rv2893|Possible,360,325,57,27,240,209,172,1.420000e-14,191,86
...,...,...,...,...,...,...,...,...,...,...,...,...
228,NC_000962.3_3913,Rv3763|lpqH|19,159,159,1,1,159,159,775,1.740000e-108,159,159
229,NC_000962.3_3948,Rv3796|Rv3796|Conserved,375,375,1,1,375,375,1956,0.000000e+00,375,375
230,NC_000962.3_3948,Rv2407|Rv2407|Conserved,375,273,63,4,369,263,211,5.600000e-20,314,122
231,NC_000962.3_3962,Rv3810|pirG|Exported,284,284,1,1,284,284,1367,0.000000e+00,284,284


In [194]:
# Keep only the hits matched at 100%: rows whose `positive` count equals the
# query length `qlen`. The filtered table is displayed as the cell output.
coincide_completo = blastp_lipo['qlen'].eq(blastp_lipo['positive'])
blastp_lipo = blastp_lipo.loc[coincide_completo]
blastp_lipo

Unnamed: 0,qseqid,sseqid,qlen,slen,qstart,sstart,qend,send,score,evalue,length,positive
0,NC_000962.3_144,Rv0132c|fgd2|Putative,360,360,1,1,360,360,1883,0.000000e+00,360,360
7,NC_000962.3_187,Rv0173|lprK|Possible,390,390,1,1,390,390,2037,0.000000e+00,390,390
12,NC_000962.3_256,Rv0237|lpqI|Probable,388,388,1,1,388,388,1953,0.000000e+00,388,388
13,NC_000962.3_283,Rv0265c|Rv0265c|Probable,330,330,1,1,330,330,1696,0.000000e+00,330,330
14,NC_000962.3_328,Rv0309|Rv0309|Possible,218,218,1,1,218,218,1145,6.190000e-163,218,218
...,...,...,...,...,...,...,...,...,...,...,...,...
227,NC_000962.3_3810,Rv3666c|dppA|Probable,541,541,1,1,541,541,2838,0.000000e+00,541,541
228,NC_000962.3_3913,Rv3763|lpqH|19,159,159,1,1,159,159,775,1.740000e-108,159,159
229,NC_000962.3_3948,Rv3796|Rv3796|Conserved,375,375,1,1,375,375,1956,0.000000e+00,375,375
231,NC_000962.3_3962,Rv3810|pirG|Exported,284,284,1,1,284,284,1367,0.000000e+00,284,284


In [195]:
#11 Agrego los datos de predicción a los resultados
#blastp_lipo
#result_LIPO

def macheo_seq(un_resultado):
    """Return the reference hit (``sseqid``) for one candidate row.

    ``un_resultado`` is one row of ``result_LIPO``. Its ``seqid`` value is
    compared, as a whitespace-trimmed string, with the ``qseqid`` column of the
    notebook-level ``blastp_lipo`` table. The ``sseqid`` of the first matching
    BLAST row is returned; ``None`` is returned when no row matches.
    """
    buscado = str(un_resultado['seqid']).strip()
    consultas = blastp_lipo['qseqid'].astype(str).str.strip()
    aciertos = blastp_lipo.loc[consultas == buscado, 'sseqid']
    if aciertos.empty:
        return None
    return aciertos.iloc[0]


# Annotate every candidate row with its reference hit.
result_LIPO['RV'] = result_LIPO.apply(macheo_seq, axis="columns")



In [174]:
#Se identificaron todos los genes predichos por pyrodigal: 94 en total

In [196]:
# Split the reference hit text ("Rv|name|function") into its three fields.
# n=2 caps the split at three parts, so an extra "|" later in a value cannot
# produce a fourth column and break the three-column assignment.
result_LIPO[["Rv","name", "funtion"]]  = result_LIPO["RV"].str.split("|", n=2, expand = True)
result_LIPO

        

Unnamed: 0,ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Tat/SPII),PILIN(Sec/SPIII),seqid,start,end,phase,ID_nro,RV,Rv,name,funtion,Descriptions
54,NC_000962.3_144 # 159700 # 160782 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000006,0.000008,0.999995,0.000000,NC_000962.3_144,159700,160782,-1,ID=4_144;partial=00;start_type=ATG;rbs_motif=N...,Rv0132c|fgd2|Putative,Rv0132c,fgd2,Putative,
101,NC_000962.3_187 # 204065 # 205237 # 1 # ID=4_1...,LIPO,0.001248,0.008024,0.990632,0.000076,0.000023,0.000009,NC_000962.3_187,204065,205237,1,ID=4_187;partial=00;start_type=ATG;rbs_motif=A...,Rv0173|lprK|Possible,Rv0173,lprK,Possible,Possible Mce-family lipoprotein LprK (Mce-fami...
178,NC_000962.3_256 # 287186 # 288352 # 1 # ID=4_2...,LIPO,0.000000,0.000000,1.000020,0.000000,0.000000,0.000000,NC_000962.3_256,287186,288352,1,ID=4_256;partial=00;start_type=ATG;rbs_motif=N...,Rv0237|lpqI|Probable,Rv0237,lpqI,Probable,Probable conserved lipoprotein LpqI
208,NC_000962.3_283 # 316511 # 317503 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000216,0.000050,0.999723,0.000000,NC_000962.3_283,316511,317503,-1,ID=4_283;partial=00;start_type=GTG;rbs_motif=N...,Rv0265c|Rv0265c|Probable,Rv0265c,Rv0265c,Probable,Probable periplasmic iron-transport lipoprotein
258,NC_000962.3_328 # 377931 # 378587 # 1 # ID=4_3...,LIPO,0.456774,0.049797,0.487810,0.000470,0.000284,0.004865,NC_000962.3_328,377931,378587,1,ID=4_328;partial=00;start_type=ATG;rbs_motif=N...,Rv0309|Rv0309|Possible,Rv0309,Rv0309,Possible,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,NC_000962.3_3810 # 4105459 # 4107084 # -1 # ID...,LIPO,0.000000,0.000046,1.000015,0.000000,0.000000,0.000000,NC_000962.3_3810,4105459,4107084,-1,ID=4_3810;partial=00;start_type=ATG;rbs_motif=...,Rv3666c|dppA|Probable,Rv3666c,dppA,Probable,Probable periplasmic dipeptide-binding lipopro...
3912,NC_000962.3_3913 # 4209047 # 4209526 # 1 # ID=...,LIPO,0.000000,0.000000,1.000043,0.000000,0.000000,0.000000,NC_000962.3_3913,4209047,4209526,1,ID=4_3913;partial=00;start_type=GTG;rbs_motif=...,Rv3763|lpqH|19,Rv3763,lpqH,19,19 kDa lipoprotein antigen precursor LpqH
3947,NC_000962.3_3948 # 4249878 # 4251005 # 1 # ID=...,TATLIPO,0.000192,0.002493,0.006592,0.041162,0.949528,0.000022,NC_000962.3_3948,4249878,4251005,1,ID=4_3948;partial=00;start_type=ATG;rbs_motif=...,Rv3796|Rv3796|Conserved,Rv3796,Rv3796,Conserved,
3961,NC_000962.3_3962 # 4273739 # 4274593 # 1 # ID=...,LIPO,0.041122,0.015608,0.763581,0.024236,0.155091,0.000356,NC_000962.3_3962,4273739,4274593,1,ID=4_3962;partial=00;start_type=GTG;rbs_motif=...,Rv3810|pirG|Exported,Rv3810,pirG,Exported,


In [197]:
# Read the list of lipoproteins obtained by searching "Lipoprotein" in the
# Mycobrowser database (file: lipoproteinas_tuberculist.txt). The file is read
# without a header row and its four columns are then labelled.
tubeculist_lipo = pd.read_csv(
    "lipoproteinas_tuberculist.txt", sep="\t", header=None
).set_axis(['#', 'RV_ID', 'name', 'Descriptions'], axis="columns")
tubeculist_lipo

Unnamed: 0,#,RV_ID,name,Descriptions
0,1,Rv0237,lpqI,Probable conserved lipoprotein LpqI
1,2,Rv0265c,Rv0265c,Probable periplasmic iron-transport lipoprotein
2,3,Rv0173,lprK,Possible Mce-family lipoprotein LprK (Mce-fami...
3,4,Rv0179c,lprO,Possible lipoprotein LprO
4,5,Rv0399c,lpqK,Possible conserved lipoprotein LpqK
...,...,...,...,...
83,84,Rv3390,lpqD,Probable conserved lipoprotein LpqD
84,85,Rv3576,lppH,Possible conserved lipoprotein LppH
85,86,Rv3666c,dppA,Probable periplasmic dipeptide-binding lipopro...
86,87,Rv3759c,proX,Possible osmoprotectant (glycine betaine/carni...


In [198]:
def macheo_seq(un_resultado):
    """Return the reference-list description for one candidate row.

    ``un_resultado`` is one row of ``result_LIPO``. Its ``Rv`` value is
    compared, as a whitespace-trimmed string, with the ``RV_ID`` column of the
    notebook-level ``tubeculist_lipo`` table. The ``Descriptions`` value of the
    first matching row is returned; ``None`` is returned when the identifier is
    not in the reference list.

    NOTE: this redefines the ``macheo_seq`` used earlier for the BLAST match;
    after this cell runs, the name refers to this version.
    """
    rv_buscado = str(un_resultado['Rv']).strip()
    rv_referencia = tubeculist_lipo['RV_ID'].astype(str).str.strip()
    descripciones = tubeculist_lipo.loc[rv_referencia == rv_buscado, 'Descriptions']
    if descripciones.empty:
        return None
    return descripciones.iloc[0]


# The column is assigned directly from the lookup; the former empty-string
# initialisation was overwritten immediately and has been removed.
result_LIPO['Descriptions'] = result_LIPO.apply(macheo_seq, axis="columns")


In [199]:
result_LIPO

Unnamed: 0,ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Tat/SPII),PILIN(Sec/SPIII),seqid,start,end,phase,ID_nro,RV,Rv,name,funtion,Descriptions
54,NC_000962.3_144 # 159700 # 160782 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000006,0.000008,0.999995,0.000000,NC_000962.3_144,159700,160782,-1,ID=4_144;partial=00;start_type=ATG;rbs_motif=N...,Rv0132c|fgd2|Putative,Rv0132c,fgd2,Putative,
101,NC_000962.3_187 # 204065 # 205237 # 1 # ID=4_1...,LIPO,0.001248,0.008024,0.990632,0.000076,0.000023,0.000009,NC_000962.3_187,204065,205237,1,ID=4_187;partial=00;start_type=ATG;rbs_motif=A...,Rv0173|lprK|Possible,Rv0173,lprK,Possible,Possible Mce-family lipoprotein LprK (Mce-fami...
178,NC_000962.3_256 # 287186 # 288352 # 1 # ID=4_2...,LIPO,0.000000,0.000000,1.000020,0.000000,0.000000,0.000000,NC_000962.3_256,287186,288352,1,ID=4_256;partial=00;start_type=ATG;rbs_motif=N...,Rv0237|lpqI|Probable,Rv0237,lpqI,Probable,Probable conserved lipoprotein LpqI
208,NC_000962.3_283 # 316511 # 317503 # -1 # ID=4_...,TATLIPO,0.000000,0.000000,0.000216,0.000050,0.999723,0.000000,NC_000962.3_283,316511,317503,-1,ID=4_283;partial=00;start_type=GTG;rbs_motif=N...,Rv0265c|Rv0265c|Probable,Rv0265c,Rv0265c,Probable,Probable periplasmic iron-transport lipoprotein
258,NC_000962.3_328 # 377931 # 378587 # 1 # ID=4_3...,LIPO,0.456774,0.049797,0.487810,0.000470,0.000284,0.004865,NC_000962.3_328,377931,378587,1,ID=4_328;partial=00;start_type=ATG;rbs_motif=N...,Rv0309|Rv0309|Possible,Rv0309,Rv0309,Possible,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,NC_000962.3_3810 # 4105459 # 4107084 # -1 # ID...,LIPO,0.000000,0.000046,1.000015,0.000000,0.000000,0.000000,NC_000962.3_3810,4105459,4107084,-1,ID=4_3810;partial=00;start_type=ATG;rbs_motif=...,Rv3666c|dppA|Probable,Rv3666c,dppA,Probable,Probable periplasmic dipeptide-binding lipopro...
3912,NC_000962.3_3913 # 4209047 # 4209526 # 1 # ID=...,LIPO,0.000000,0.000000,1.000043,0.000000,0.000000,0.000000,NC_000962.3_3913,4209047,4209526,1,ID=4_3913;partial=00;start_type=GTG;rbs_motif=...,Rv3763|lpqH|19,Rv3763,lpqH,19,19 kDa lipoprotein antigen precursor LpqH
3947,NC_000962.3_3948 # 4249878 # 4251005 # 1 # ID=...,TATLIPO,0.000192,0.002493,0.006592,0.041162,0.949528,0.000022,NC_000962.3_3948,4249878,4251005,1,ID=4_3948;partial=00;start_type=ATG;rbs_motif=...,Rv3796|Rv3796|Conserved,Rv3796,Rv3796,Conserved,
3961,NC_000962.3_3962 # 4273739 # 4274593 # 1 # ID=...,LIPO,0.041122,0.015608,0.763581,0.024236,0.155091,0.000356,NC_000962.3_3962,4273739,4274593,1,ID=4_3962;partial=00;start_type=GTG;rbs_motif=...,Rv3810|pirG|Exported,Rv3810,pirG,Exported,


In [200]:

# Encontramos 29 nuevas posibles lipoproteínas predichas con SignalP6
result_LIPO[result_LIPO['Descriptions'].isnull()]


Unnamed: 0,ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Tat/SPII),PILIN(Sec/SPIII),seqid,start,end,phase,ID_nro,RV,Rv,name,funtion,Descriptions
54,NC_000962.3_144 # 159700 # 160782 # -1 # ID=4_...,TATLIPO,0.0,0.0,6e-06,8e-06,0.999995,0.0,NC_000962.3_144,159700,160782,-1,ID=4_144;partial=00;start_type=ATG;rbs_motif=N...,Rv0132c|fgd2|Putative,Rv0132c,fgd2,Putative,
258,NC_000962.3_328 # 377931 # 378587 # 1 # ID=4_3...,LIPO,0.456774,0.049797,0.48781,0.00047,0.000284,0.004865,NC_000962.3_328,377931,378587,1,ID=4_328;partial=00;start_type=ATG;rbs_motif=N...,Rv0309|Rv0309|Possible,Rv0309,Rv0309,Possible,
340,NC_000962.3_401 # 456915 # 457823 # -1 # ID=4_...,LIPO,0.0,0.0,1.000049,0.0,0.0,0.0,NC_000962.3_401,456915,457823,-1,ID=4_401;partial=00;start_type=GTG;rbs_motif=G...,Rv0381c|Rv0381c|Hypothetical,Rv0381c,Rv0381c,Hypothetical,
397,NC_000962.3_453 # 519600 # 520322 # 1 # ID=4_4...,LIPO,3.7e-05,0.002513,0.997407,3e-06,3.7e-05,2e-06,NC_000962.3_453,519600,520322,1,ID=4_453;partial=00;start_type=ATG;rbs_motif=G...,Rv0432|sodC|Periplasmic,Rv0432,sodC,Periplasmic,
552,NC_000962.3_553 # 616885 # 617496 # 1 # ID=4_5...,LIPO,0.0,0.0,1.000044,0.0,0.0,0.0,NC_000962.3_553,616885,617496,1,ID=4_553;partial=00;start_type=ATG;rbs_motif=A...,Rv0526|Rv0526|Possible,Rv0526,Rv0526,Possible,
713,NC_000962.3_714 # 779543 # 780040 # -1 # ID=4_...,LIPO,0.000139,0.004212,0.995642,2.7e-05,7e-06,2e-06,NC_000962.3_714,779543,780040,-1,ID=4_714;partial=00;start_type=GTG;rbs_motif=N...,Rv0679c|Rv0679c|Conserved,Rv0679c,Rv0679c,Conserved,
926,NC_000962.3_1044 # 1115767 # 1116525 # 1 # ID=...,LIPO,0.025968,0.474981,0.498771,2.9e-05,0.000244,1e-05,NC_000962.3_1044,1115767,1116525,1,ID=4_1044;partial=00;start_type=GTG;rbs_motif=...,Rv0999|Rv0999|Unknown,Rv0999,Rv0999,Unknown,
936,NC_000962.3_1054 # 1128091 # 1129179 # 1 # ID=...,LIPO,0.146166,0.220087,0.63242,0.000409,0.000273,0.000651,NC_000962.3_1054,1128091,1129179,1,ID=4_1054;partial=00;start_type=ATG;rbs_motif=...,Rv1009|rpfB|Probable,Rv1009,rpfB,Probable,
1120,NC_000962.3_1238 # 1313725 # 1315191 # 1 # ID=...,LIPO,0.381349,0.004451,0.613956,3e-05,4.2e-05,0.000167,NC_000962.3_1238,1313725,1315191,1,ID=4_1238;partial=00;start_type=GTG;rbs_motif=...,Rv1180|pks3|Probable,Rv1180,pks3,Probable,
1168,NC_000962.3_886 # 942680 # 944194 # -1 # ID=4_...,TATLIPO,1e-06,6e-06,0.000373,0.07249,0.92712,0.0,NC_000962.3_886,942680,944194,-1,ID=4_886;partial=00;start_type=ATG;rbs_motif=A...,Rv0846c|Rv0846c|Probable,Rv0846c,Rv0846c,Probable,


In [201]:
len(result_LIPO[result_LIPO['Descriptions'].isnull()])

29

In [186]:
#12. obtención de la tabla final 


# Final table: Rv identifier, gene name, reference description and the two
# lipoprotein likelihood columns, keeping the row index of result_LIPO.
tabla_final = result_LIPO[
    ["Rv", "name", "Descriptions", "LIPO(Sec/SPII)", "TATLIPO(Tat/SPII)"]
].rename(columns={"Rv": "Rv_id"})

In [190]:
# Save the final table as tabla_final_lipos.txt (relative path, so it lands in
# the working directory) and display it as the cell output.
tabla_final.to_csv('tabla_final_lipos.txt')
tabla_final

Unnamed: 0,Rv_id,name,Descriptions,LIPO(Sec/SPII),TATLIPO(Tat/SPII)
54,Rv0132c,fgd2,,0.000006,0.999995
101,Rv0173,lprK,Possible Mce-family lipoprotein LprK (Mce-fami...,0.990632,0.000023
178,Rv0237,lpqI,Probable conserved lipoprotein LpqI,1.000020,0.000000
208,Rv0265c,Rv0265c,Probable periplasmic iron-transport lipoprotein,0.000216,0.999723
258,Rv0309,Rv0309,,0.487810,0.000284
...,...,...,...,...,...
3809,Rv3666c,dppA,Probable periplasmic dipeptide-binding lipopro...,1.000015,0.000000
3912,Rv3763,lpqH,19 kDa lipoprotein antigen precursor LpqH,1.000043,0.000000
3947,Rv3796,Rv3796,,0.006592,0.949528
3961,Rv3810,pirG,,0.763581,0.155091


In [202]:
# Conclusión: se predijeron 94 lipoproteínas; como resultado preliminar se obtuvieron 29 lipoproteínas que se diferencian de la lista de las ya anotadas.
# Estos datos deben ser auditados manualmente a fin de corregir posibles diferencias en la anotación respecto a la base de datos de referencia.