In [161]:
import glob
import os
import os.path
import urllib
import urllib.request
import xml.etree.ElementTree as ET

import pandas as pd
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord

# Parse every record of ./All_Parts_iGEM.txt as FASTA and index the records by
# their FASTA identifier. When an identifier occurs more than once, the record
# parsed last is the one that remains in fastaDict.
fastaList = [record for record in SeqIO.parse("./All_Parts_iGEM.txt", "fasta")]
fastaDict = dict((record.id, record) for record in fastaList)

In [162]:
def _first_genbank_record(path):
    """Return the first record stored in the GenBank file at ``path``.

    Only the first record is read; any further records in the file are ignored.

    Args:
        path: Location of a GenBank-formatted file.

    Returns:
        The first record yielded by ``SeqIO.parse(..., 'gb')``.

    Raises:
        ValueError: If the file contains no GenBank records.
    """
    with open(path, 'r') as file_handle:
        # SeqIO.parse is lazy, so the first record must be pulled while the
        # handle is still open.
        first_record = next(SeqIO.parse(file_handle, 'gb'), None)
    if first_record is None:
        raise ValueError(f"no GenBank records found in {path}")
    return first_record


# Backbone vector records; the record-building cell further down prepends the
# matching one to each part sequence.
pSB1C3 = _first_genbank_record("./pSB1C3.gbk")
pSB1A2 = _first_genbank_record("./pSB1A2.gbk")

In [163]:
# Collect the part names listed in every plate-layout CSV.
#   plates: plate key -> part names exactly as read from the "  Part " column
#           (not stripped).
#   parts:  de-duplicated list of the whitespace-stripped part names across
#           all plates.
plates = {}
unique_parts = set()
for layout_path in glob.glob("./Plate_Layouts/*.csv"):
    # The plate key is the second-to-last whitespace-separated token of the
    # path; this presumably relies on the layout files' naming scheme
    # (TODO confirm against the actual file names).
    plate_key = layout_path.split()[-2]
    layout = pd.read_csv(layout_path, encoding="ISO-8859-1")
    raw_names = layout["  Part "].values.tolist()
    unique_parts.update(name.strip() for name in raw_names)
    plates[plate_key] = raw_names
parts = list(unique_parts)

In [164]:
# Strain metadata table, indexed by its first column, with surrounding
# whitespace removed from the accession values.
strains = pd.read_csv("igem2019_strain_metadata.csv", index_col=0)
strains['accession'] = strains['accession'].str.strip()

# Accessions that appear in the strain table, as stripped strings (a missing
# value is stringified too, so it shows up as "nan").
measuredParts = set(str(accession).strip() for accession in strains['accession'])
# Plate part names reduced to the text after the last "BBa_"; names without
# that marker are kept unchanged.
allParts = set(name.rpartition("BBa_")[2] for name in parts)

In [165]:
# Download the sequence of every part that is both measured (present in the
# strain table) and listed on a plate, using the iGEM registry XML endpoint.
#   allPartsSeqs:    part name -> Seq built from the part's <seq_data> text
#   partsMissingSeq: parts whose XML response had no usable <seq_data>
# NOTE(review): generic_dna comes from Bio.Alphabet, which newer Biopython
# releases no longer ship -- confirm the pinned Biopython version.
allPartsSeqs = {}
partsMissingSeq = []
for curPart in sorted(measuredParts & allParts):
    print(".", end="")  # one dot of progress per requested part
    igemURL = f'http://parts.igem.org/cgi/xml/part.cgi?part={curPart}'
    # Close the HTTP response as soon as the XML has been parsed.
    with urllib.request.urlopen(igemURL) as response:
        root = ET.parse(response).getroot()

    # Reset for every part so that a response without <seq_data> can never
    # inherit the sequence downloaded for the previous part.
    sequence = None
    for seq in root.iter('seq_data'):
        if not seq.text:
            continue  # empty element: nothing to build a sequence from
        # As before, the last <seq_data> element with text wins.
        sequence = Seq(seq.text.replace("\n", ""), generic_dna)

    if sequence is None:
        partsMissingSeq.append(curPart)
        continue
    allPartsSeqs[curPart] = sequence

...............................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [166]:
# Build one annotated plasmid record per strain: the strain's backbone record
# followed by the lower-cased part sequence, with a feature marking the insert.
#   sequenceRecords: records for strains whose vector is pSB1C3 or pSB1A2
#   notAnno:         DataFrame (index "accession", column "backbone") listing
#                    parts that were skipped because of an unsupported vector
backbones = {"pSB1C3": pSB1C3, "pSB1A2": pSB1A2}
skippedParts = []
sequenceRecords = []
for strain in strains.index:
    curPart = strains.loc[strain, 'accession']
    if curPart not in allPartsSeqs:
        continue  # no downloaded sequence for this strain's part

    # Use the vector recorded for *this* strain. Looking the vector up by
    # accession would always return the first matching row, which is wrong
    # when the same part is carried in more than one backbone.
    curBackbone = strains.loc[strain, 'vector']
    bb = backbones.get(curBackbone)
    if bb is None:
        skippedParts.append((curPart, curBackbone))
        continue

    insert = allPartsSeqs[curPart].lower()
    record = bb + insert
    # The insert occupies the span right after the backbone, strand +1.
    featLoc = FeatureLocation(len(bb), len(bb) + len(insert), 1)
    record.features.append(
        SeqFeature(featLoc, type='BioBrick (accession)', qualifiers={"label": curPart})
    )
    record.id = curPart
    record.name = strain  # later used as the output file name
    sequenceRecords.append(record)

notAnno = pd.DataFrame(skippedParts, columns=["accession", "backbone"]).set_index('accession')

In [167]:
# Write every assembled plasmid record to ./reference_files/<record name>.gbk
# in GenBank format. The output directory is created first so the export also
# works on a fresh checkout where it does not exist yet.
os.makedirs('./reference_files', exist_ok=True)
for curPlas in sequenceRecords:
    SeqIO.write(curPlas, f'./reference_files/{curPlas.name}.gbk', 'gb')