# Purpose

Now that we've gathered our raw XML data from UniProt, I need to label each protein I have as amyloid or not, and record which sections of each protein's sequence form amyloids.

To do this, I'll be looking at the amypro.net database. I'll first mine the fasta sequences present there.

In [23]:
import pandas as pd
import os
import re
from Bio import SeqIO
from tqdm import tqdm_notebook
import sqlite3

# Module-level SQLite connection; amyloidFastaParser appends its tables
# through this handle. sqlite3 creates the file if it does not exist yet.
conn = sqlite3.connect("human_protein.db")

In [2]:
# Materialize every record of the AmyPro FASTA dump into a list so it can be
# iterated (and its length shown by the progress bar) later on.
amyloid_recs = list(SeqIO.parse("data/amypro.fasta", "fasta"))

In [43]:
# Pattern for an AmyPro FASTA header line. Written as raw strings so the regex
# escapes (\d, \s, \(, ...) are not interpreted as invalid Python string
# escapes, and compiled once because it is applied to every record.
_DESC_RE = re.compile(
    r"AP\d{5}\s\((.*)\)\suniprot=(.*)\spdbs=(.*)\spmid=\{(.*)\}"
    r"\scategory=(.*)\sprion_domain=(yes|no)\smutations=\{(.*)\}"
    r"\sregions=\[(.*)\]"
)


def descriptionParser(description):
    """Extract the eight captured fields from an AmyPro FASTA description.

    Args:
        description: Header text of the form
            ``AP12345 (<text>) uniprot=<..> pdbs=<..> pmid={<..>}
            category=<..> prion_domain=<yes|no> mutations={<..>}
            regions=[<..>]``.

    Returns:
        A list of eight strings, in header order: the parenthesised text,
        uniprot, pdbs, pmid, category, prion_domain, mutations, regions.
        Only the first match in ``description`` is used.

    Raises:
        ValueError: If ``description`` does not match the expected layout.
    """
    match = _DESC_RE.search(description)
    if match is None:
        # Fail with the offending header instead of an opaque IndexError.
        raise ValueError(
            "Unrecognised AmyPro description: {!r}".format(description)
        )
    return list(match.groups())

def amyloidFastaParser(parse_obj):
    """Split one AmyPro FASTA record into rows and append them to SQLite.

    ``parse_obj`` must expose ``name``, ``description`` and ``seq``. The
    description is decoded with ``descriptionParser`` and the resulting rows
    are appended, through the module-level ``conn``, to three tables:

    * ``amyloid`` -- one row per record (name, sequence, species, uniprot id,
      pmid, category).
    * ``amyloid_pdf_refs`` -- one row per comma-separated ``pdbs`` entry
      longer than one character.
    * ``amyloid_prion`` -- one row per non-empty comma-separated ``regions``
      entry, split on ``-`` into ``begin`` / ``end`` (kept as strings).

    A table is skipped when this record yields no rows for it. Returns None.
    """
    entry_id = parse_obj.name
    (
        species,
        uniprot_id,
        pdb_field,
        pmid,
        category,
        _prion_domain,  # parsed from the header but not persisted
        _mutations,  # parsed from the header but not persisted
        region_field,
    ) = descriptionParser(parse_obj.description)

    # One [amy_name, pdb_ref] row per PDB identifier; entries of length <= 1
    # are dropped.
    pdb_rows = []
    for pdb_id in pdb_field.split(","):
        if len(pdb_id) > 1:
            pdb_rows.append([entry_id, pdb_id])

    # One [amy_name, begin, end] row per "begin-end" span in `regions`.
    region_rows = []
    for span in region_field.split(","):
        if span:
            region_rows.append([entry_id, *span.split("-")])

    sequence = str(parse_obj.seq)

    # Build every frame before touching the database, so a malformed record
    # raises before anything has been written for it.
    pending = [
        (
            "amyloid",
            pd.DataFrame(
                [[entry_id, sequence, species, uniprot_id, pmid, category]],
                columns=[
                    "amy_name",
                    "sequence",
                    "species",
                    "uniprot_id",
                    "pmid",
                    "category",
                ],
            ),
        ),
        (
            "amyloid_pdf_refs",
            pd.DataFrame(pdb_rows, columns=["amy_name", "pdb_ref"]),
        ),
        (
            # Despite the table name, these rows come from the `regions` field.
            "amyloid_prion",
            pd.DataFrame(region_rows, columns=["amy_name", "begin", "end"]),
        ),
    ]

    for table_name, frame in pending:
        if frame.empty:
            continue
        frame.to_sql(table_name, conn, if_exists="append", index=False)


In [44]:
# Parse every AmyPro record and append its rows to the SQLite tables, with a
# notebook progress bar over the records.
# NOTE: amyloidFastaParser writes with if_exists='append', so re-running this
# cell inserts the same rows again.
for amy in tqdm_notebook(amyloid_recs):
    amyloidFastaParser(amy)


HBox(children=(IntProgress(value=0, max=143), HTML(value='')))


