In [3]:
from urllib.request import urlopen

from io import TextIOWrapper

from Bio import SeqIO 

import pandas as pd

import reframed

import re

In [4]:
possible_transporters = pd.read_excel("/Users/idunmariaburgos/Documents/Work/Project/chain elongation/Clostridia BL3 and BL4/possible_transporters_tcdb.xlsx",sheet_name="overview")
possible_transporters

Unnamed: 0,Acetate,Propionate,Butyrate,Valerate,Hexanoate,Iso-butyrate
0,3.A.1.120.5,1.A.46.2.1,2.A.1.13.1,2.A.73.1.1,1.B.25.1.8,
1,1.A.14.2.2,2.A.21.4.1,2.A.1.19.18,,,
2,2.A.1.13.1,2.A.21.5.3,2.A.21.4.1,,,
3,2.A.1.6.11,2.A.21.7.3,2.A.21.5.3,,,
4,2.A.102.4.13,2.A.8.1.13,2.A.21.5.4,,,
5,2.A.21.7.3,3.A.1.205.3,2.A.21.5.5,,,
6,2.A.23.1.10,,2.A.73.1.1,,,
7,2.A.6.2.9,,2.A.8.1.13,,,
8,2.A.96.1.11,,,,,
9,2.A.96.1.4,,,,,


- 3.A.1.120.5 - abc
- 1.A.14.2.2 - acetate uptake
- 2.A.23.1.10 - uptake
- 2.A.6.2.9 - abc
- 2.A.96.1.11 - uptake


- 2.A.1.13.1 - proton linked efflux/influx
- 2.A.1.6.11 - Acetate/haloacid transporter (no mechanism)
- 2.A.102.4.13 - acetate transporter
- 2.A.21.7.3 - Pyruvate/acetate/propionate: H+ symporter
- 2.A.96.1.4 - acetate permease

In [5]:
possible_transporters = ['2.A.1.13.1','2.A.1.6.11','2.A.102.4.13','2.A.21.7.3','2.A.96.1.4']

In [6]:
possible_transporters_regex = "|".join(possible_transporters)

In [7]:
possible_transporters_regex

'2.A.1.13.1|2.A.1.6.11|2.A.102.4.13|2.A.21.7.3|2.A.96.1.4'

**Filter data from tcdb**

In [8]:
handle = TextIOWrapper(urlopen("https://www.tcdb.org/public/tcdb"))
recs = list(SeqIO.parse(handle, 'fasta'))
handle.close()

In [9]:
ofile = open("transporters/transporters.faa","w")

for rec in recs:
    id_ = str(rec.id)
    
    if re.search(possible_transporters_regex, id_):
        
        
        sequence = str(rec.seq)
        
        ofile.write(">"+id_+"\n"+sequence+"\n")
ofile.close()        


In [10]:
import os


In [19]:
ACt2r_MAGs = []

ACt2r_MAGs_data = []
for filename in os.listdir("transporters/"):
    
    if filename.endswith(".tsv"):

        transport= pd.read_csv("transporters/"+filename,sep="\t",header=None)
        transport.columns = ["query acc.ver", "subject acc.ver", "% identity", "alignment length", "mismatches", "gap opens", "q. start", "q. end", "s. start", "s. end", "evalue", "bit score"]
        transport.reset_index(inplace=True)
        
        if transport.shape[0]>1:
            
            transport.sort_values(by="evalue",ascending=True,inplace=True)
            transport.reset_index(inplace=True)
            
            query = transport.loc[0,"query acc.ver"]
            gene = transport.loc[0,"subject acc.ver"]
            best_evalue = transport.loc[0,"evalue"]
            bit_score = transport.loc[0,"bit score"]
            
            if best_evalue<1e-5 and bit_score>20:
                if "2.A.1.13.1" in query:
                    ACt2r_MAGs.append(filename[:-4])
                    
                    ACt2r_MAGs_data.append((filename[:-4],gene,best_evalue,bit_score,"2.A.1.13.1"))

                elif "2.A.21.7.3" in query: 
                    ACt2r_MAGs.append(filename[:-4])
                    ACt2r_MAGs_data.append((filename[:-4],gene,best_evalue,bit_score,"2.A.21.7.3"))


In [14]:
len(ACt2r_MAGs)

55

In [20]:
pd.DataFrame(ACt2r_MAGs_data,columns=["MAG","gene","evalue","bit_score","TCDB_id"]).sort_values("evalue")

Unnamed: 0,MAG,gene,evalue,bit_score,TCDB_id
43,CH15-bin.17,gene545,4.9000000000000004e-157,456.0,2.A.21.7.3
12,CH13-bin.1,gene374,4.9100000000000004e-157,456.0,2.A.21.7.3
40,CH7-bin.16,gene2185,8.2e-156,452.0,2.A.21.7.3
2,CH7-bin.4,gene3341,4.48e-153,445.0,2.A.21.7.3
28,CH8-bin.25,gene3105,2.22e-151,441.0,2.A.21.7.3
41,CH1-bin.9,gene769,1.28e-150,439.0,2.A.21.7.3
13,CH15-bin.6,gene1704,1.4299999999999999e-142,420.0,2.A.21.7.3
11,CH1-bin.1,gene727,3.5899999999999997e-56,191.0,2.A.21.7.3
25,CH14-bin.2,gene1003,9.490000000000001e-23,99.0,2.A.21.7.3
49,CH9-bin.1,gene3888,1.19e-22,99.0,2.A.21.7.3
