# Purpose

The purpose of this notebook is to format my training dataset for the machine learning task of predicting amyloid protein sequences.

In [11]:
import sqlite3
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm_notebook

conn = sqlite3.connect("human_protein.db")

In [12]:
#conn.close()

In [13]:
cur = conn.cursor()
tables = cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;").fetchall()
tables

[('amyloid',),
 ('amyloid_pdf_refs',),
 ('amyloid_prion',),
 ('protein',),
 ('protein_amino_acid_modifications',),
 ('protein_comments',),
 ('protein_diseases',),
 ('protein_secondary_structure',),
 ('protein_subcellular_localization',),
 ('protein_tissue_expression',)]

In [14]:
protein_df = pd.read_sql('SELECT * FROM protein',con=conn)
display(protein_df.head())

Unnamed: 0,protein,sequence,keywords
0,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,"3D-structure, Complete proteome, Metal-binding..."
1,PYRG1_HUMAN,MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,"3D-structure, Acetylation, Alternative splicin..."
2,RL6_HUMAN,MAGEKVEKPDTKEKKPEAKKVDAGGKVKKGNLKAKKPKKGKPHCSR...,"3D-structure, Acetylation, Complete proteome, ..."
3,RAB10_HUMAN,MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,"3D-structure, Acetylation, Cell projection, Co..."
4,RAB30_HUMAN,MSMEDYDFLFKIVLIGNAGVGKTCLVRRFTQGLFPPGQGATIGVDF...,"3D-structure, Alternative splicing, Complete p..."


In [15]:
# Parse keywords to columns
protein_df.loc[:,'keyword_split'] = protein_df['keywords'].str.split(',').apply(frozenset)
for keyword in tqdm_notebook(frozenset.union(*protein_df.keyword_split)):
    protein_df[keyword] = protein_df.apply(lambda x: int(keyword in x.keyword_split),axis=1)

HBox(children=(IntProgress(value=0, max=825), HTML(value='')))




Unnamed: 0,protein,sequence,keywords,keyword_split,cAMP,Copper transport,Menaquinone biosynthesis,Hermansky-Pudlak syndrome,Cytoskeleton,Amidation,...,Complement activation lectin pathway,Cholesterol metabolism,Protein transport,Aortic aneurysm,Glycogen storage disease,Amyotrophic lateral sclerosis,Methotrexate resistance,Biotin,Emery-Dreifuss muscular dystrophy,Aicardi-Goutieres syndrome
0,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,"3D-structure, Complete proteome, Metal-binding...","( Ribosomal protein, Metal-binding, Complete...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,PYRG1_HUMAN,MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,"3D-structure, Acetylation, Alternative splicin...","( Alternative splicing, Nucleotide-binding, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RL6_HUMAN,MAGEKVEKPDTKEKKPEAKKVDAGGKVKKGNLKAKKPKKGKPHCSR...,"3D-structure, Acetylation, Complete proteome, ...","( Isopeptide bond, Ribosomal protein, Acetyl...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RAB10_HUMAN,MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,"3D-structure, Acetylation, Cell projection, Co...","( Cytoplasmic vesicle, Endosome, Transport, ...",0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,RAB30_HUMAN,MSMEDYDFLFKIVLIGNAGVGKTCLVRRFTQGLFPPGQGATIGVDF...,"3D-structure, Alternative splicing, Complete p...","( Alternative splicing, Nucleotide-binding, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
protein_df_keywords = protein_df.drop(["keywords","keyword_split"],axis=1)
protein_df_keywords.head()

Unnamed: 0,protein,sequence,cAMP,Copper transport,Menaquinone biosynthesis,Hermansky-Pudlak syndrome,Cytoskeleton,Amidation,Complement pathway,Ubl conjugation pathway,...,Complement activation lectin pathway,Cholesterol metabolism,Protein transport,Aortic aneurysm,Glycogen storage disease,Amyotrophic lateral sclerosis,Methotrexate resistance,Biotin,Emery-Dreifuss muscular dystrophy,Aicardi-Goutieres syndrome
0,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,PYRG1_HUMAN,MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RL6_HUMAN,MAGEKVEKPDTKEKKPEAKKVDAGGKVKKGNLKAKKPKKGKPHCSR...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RAB10_HUMAN,MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,RAB30_HUMAN,MSMEDYDFLFKIVLIGNAGVGKTCLVRRFTQGLFPPGQGATIGVDF...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
protein_df_keywords.to_sql('protein_features', con=conn, if_exists='replace')

  dtype=dtype)


In [18]:
conn.close()

In [None]:
disease_df = pd.read_sql('SELECT * FROM protein_diseases',con=conn)
display(disease_df.head())
disease_df.groupby('disease').count().head()

Judging by how sparse disease is - I don't think it will contain enough information without some sort of parsing to be useful. Skip for now.