In [1]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import sklearn
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math

In [2]:
# Load a 1000-row preview of each feature table. The reads sit inside try/finally
# so the database handle is released even if one of the tables is missing.
conn = sqlite3.connect("protein_training.db")
try:
    proteins_df = pd.read_sql('SELECT * from protein_ngram LIMIT 1000',con=conn)
    keyword_df = pd.read_sql('SELECT * from protein_keywords LIMIT 1000',con=conn)
    seqfeat_df = pd.read_sql('SELECT * from protein_seq_features LIMIT 1000',con=conn)
    subloc_df = pd.read_sql('SELECT * from protein_subloc LIMIT 1000',con=conn)
    tissue_df = pd.read_sql('SELECT * from protein_tissue_exp_keywords LIMIT 1000',con=conn)
finally:
    conn.close()

In [3]:
# Show the first rows of four of the preview frames (keyword_df is not shown).
preview_frames = (proteins_df, seqfeat_df, subloc_df, tissue_df)
for frame in preview_frames:
    display(frame.head())

Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5,accession,amyloid_1,amyloid_2,amyloid_3,amyloid_4,amyloid_5
0,RL37A_HUMAN,1,M,A,K,R,T,"P61513,P12751,Q6FGF5",0,0,0,0,0
1,RL37A_HUMAN,2,A,K,R,T,K,"P61513,P12751,Q6FGF5",0,0,0,0,0
2,RL37A_HUMAN,3,K,R,T,K,K,"P61513,P12751,Q6FGF5",0,0,0,0,0
3,RL37A_HUMAN,4,R,T,K,K,V,"P61513,P12751,Q6FGF5",0,0,0,0,0
4,RL37A_HUMAN,5,T,K,K,V,G,"P61513,P12751,Q6FGF5",0,0,0,0,0


Unnamed: 0,protein,R_num,H_num,K_num,D_num,E_num,S_num,T_num,N_num,Q_num,...,flex_mean_6,flex_range_6,flex_mean_7,flex_range_7,flex_mean_8,flex_range_8,flex_mean_9,flex_range_9,flex_mean_10,flex_range_10
0,RL37A_HUMAN,0.076087,0.021739,0.163043,0.01087,0.021739,0.065217,0.097826,0.01087,0.021739,...,0.97007,0.056417,0.989251,0.062321,0.987525,0.040536,0.998399,0.04556,1.032863,0.016655
1,PYRG1_HUMAN,0.055838,0.030457,0.060914,0.05753,0.076142,0.069374,0.043993,0.028765,0.042301,...,1.002592,0.093726,0.991874,0.10006,1.007784,0.086429,1.003565,0.111845,1.007482,0.102762
2,RL6_HUMAN,0.076389,0.027778,0.180556,0.03125,0.045139,0.048611,0.055556,0.020833,0.03125,...,0.98418,0.073798,0.986485,0.075167,1.019604,0.084369,1.0271,0.083238,0.991721,0.091595
3,RAB10_HUMAN,0.045,0.015,0.105,0.07,0.065,0.055,0.075,0.05,0.02,...,1.01156,0.073774,1.024263,0.085345,1.000924,0.083452,1.022086,0.09881,1.011547,0.06181
4,RAB30_HUMAN,0.059113,0.0,0.049261,0.044335,0.093596,0.073892,0.059113,0.054187,0.049261,...,0.999652,0.084238,1.023271,0.062095,1.014127,0.086548,0.993532,0.079179,1.01006,0.117869


Unnamed: 0,protein,loc_cytoplasm,loc_cytosol,loc_rough_endoplasmic_reticulum,loc_cytoplasmic_vesicle_membrane,loc_golgi_apparatus_membrane,loc_golgi_apparatus,loc_trans-golgi_network_membrane,loc_endosome_membrane,loc_recycling_endosome_membrane,...,loc_nucleoid,loc_fimbrium,loc_cell_wall,loc_host_golgi_apparatus_membrane,loc_host_nucleus,loc_host_cytoplasm,loc_forespore_intermembrane_space,loc_spore_wall,loc_host_mitochondrion_inner_membrane,loc_host_cytosol
0,1433B_HUMAN,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1433E_HUMAN,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1433G_HUMAN,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1433S_HUMAN,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1433T_HUMAN,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,protein,exp_glands,exp_skin,exp_by,exp_the,exp_expressed,exp_concentrations,exp_conditions,exp_cone,exp_cones,...,exp_circular,exp_circulates,exp_circulating,exp_circumvallate,exp_cirrhotic,exp_cisplatin,exp_cla,exp_clara,exp_class,exp_classical
0,PYRG1_HUMAN,0.0,0.0,0.0,0.0,0.386972,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,RAB43_HUMAN,0.0,0.0,0.0,0.0,0.113534,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,RAB2B_HUMAN,0.0,0.0,0.0,0.0,0.083868,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,RAB36_HUMAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,RAB17_HUMAN,0.0,0.0,0.0,0.0,0.164799,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Let's build the train set and the sample set

The train set is all of the n-grams for the proteins with some amyloid.

The sample set is the proteins with no amyloid at all.

In [4]:
# List of proteins: one row per distinct protein name in the n-gram table.
conn = sqlite3.connect("protein_training.db")
try:
    proteins = pd.read_sql("SELECT DISTINCT protein FROM protein_ngram", con=conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [5]:
# Distinct protein names present in the subcellular-location and tissue-expression tables.
conn = sqlite3.connect("protein_training.db")
try:
    subloc_prots = pd.read_sql("SELECT DISTINCT protein FROM protein_subloc", con=conn)
    tissue_prots = pd.read_sql("SELECT DISTINCT protein FROM protein_tissue_exp_keywords", con=conn)
finally:
    # Release the database handle even when a query raises.
    conn.close()

In [6]:
# Proteins not in subloc keywords
# (negate the membership mask with ~ and count the True entries directly)
print((~proteins.protein.isin(subloc_prots.protein)).sum())
# Proteins not in the tissue keywords
print((~proteins.protein.isin(tissue_prots.protein)).sum())

3856
10748


In [10]:
# Manually delete table before running
# NOTE(review): this cell only reads table schemas; the reminder above presumably
# refers to the `train` table that protein_writer appends to — confirm.
conn = sqlite3.connect("protein_training.db")
dfs = ["protein_ngram", "protein_seq_features"]
all_cols = []        # every column name, in table order (names shared by tables repeat)
col_count_dict = {}  # column name -> number of source tables that contain it
try:
    for table in dfs:
        # PRAGMA table_info yields one row per column; index 1 is the column name.
        cols = [x[1] for x in conn.execute("PRAGMA table_info("+table+");") if x[1]!="accession"]
        for col in cols:
            col_count_dict[col] = col_count_dict.get(col, 0) + 1
        all_cols.extend(cols)
finally:
    # Release the database handle even if a PRAGMA call raises.
    conn.close()

In [11]:
# One row per column name; the single value column is how many source tables contain it.
col_df = pd.DataFrame.from_dict(col_count_dict,orient="index")
# Column names that occur in more than one table. Only the last expression of a cell
# is rendered automatically, so this one has to be displayed explicitly.
display(col_df[col_df.iloc[:,0]>1])
len(col_df)

82

In [46]:
# Put all the frames together
def chunks(l, n):
    """Lazily split ``l`` into consecutive tuples of at most ``n`` items each."""
    starts = range(0, len(l), n)
    yield from (tuple(l[start:start + n]) for start in starts)

def _read_protein_rows(table, names, conn, batch_size=900):
    """Return the rows of ``table`` whose ``protein`` is in ``names``, or None if there are none.

    Names are passed as bound parameters rather than pasted into the SQL text, so a
    single name or a name containing a quote cannot produce invalid SQL. The lookup is
    split into batches of ``batch_size`` names to stay below SQLite's limit on the
    number of bound variables per statement.
    """
    parts = []
    for start in range(0, len(names), batch_size):
        batch = names[start:start + batch_size]
        placeholders = ",".join("?" * len(batch))
        query = "SELECT * FROM " + table + " WHERE `protein` IN (" + placeholders + ")"
        part = pd.read_sql(query, con=conn, params=batch)
        if part.shape[0] > 0:
            parts.append(part)
    if not parts:
        return None
    if len(parts) == 1:
        return parts[0]
    return pd.concat(parts, ignore_index=True)


def merger(cols, proteins, conn):
    """Build the combined feature frame for a group of proteins.

    Rows for ``proteins`` are read from the ``protein_ngram`` and
    ``protein_seq_features`` tables and outer-merged on ``protein``. Nothing is
    written to the database; the caller decides what to do with the result.

    Parameters
    ----------
    cols : iterable of str
        Column names the result must contain; any that are absent after merging
        are added and filled with 0.
    proteins : sequence of str
        Protein names to look up (may be empty or hold a single name).
    conn : sqlite3.Connection
        Open connection to the database holding the source tables.

    Returns
    -------
    pandas.DataFrame
        Merged frame without an ``accession`` column and with missing values
        replaced by 0.
    """
    dfs = ["protein_ngram", "protein_seq_features"]
    final_frame = pd.DataFrame(list(proteins), columns=['protein'])
    # Query each distinct name once (order preserved), matching what a single IN (...) returns.
    names = list(dict.fromkeys(proteins))
    for table in dfs:
        sub_set = _read_protein_rows(table, names, conn)
        if sub_set is not None:
            final_frame = final_frame.merge(sub_set, on='protein', how='outer')
    # Backfill missing cols
    for col in cols:
        if col not in final_frame.columns:
            final_frame[col] = 0
    # Drop certain columns; "accession" is absent when no table contributed rows,
    # so a missing column must not raise.
    final_frame = final_frame.drop(columns=["accession"], errors="ignore")
    return final_frame.fillna(0)

def protein_writer(cols, proteins, chunk_size, conn):
    """Build feature frames for ``proteins`` chunk by chunk and append them to the ``train`` table.

    Parameters
    ----------
    cols : iterable of str
        Column names every written frame must contain (forwarded to ``merger``).
    proteins : sequence of str
        Protein names to process; must support ``len`` and slicing.
    chunk_size : int
        Number of proteins handled per iteration.
    conn : sqlite3.Connection
        Open connection used both for reading the source tables and for writing.

    Notes
    -----
    Frames are written with ``if_exists="append"``, so calling this again without
    removing the ``train`` table first adds the same rows a second time.
    """
    prot_chunks = chunks(proteins, chunk_size)
    # chunks() is a generator, so tqdm cannot infer how many iterations there are;
    # pass the count explicitly so the bar shows real progress.
    total = math.ceil(len(proteins) / chunk_size) if chunk_size > 0 else None
    for chunk in tqdm_notebook(prot_chunks, total=total):
        frame = merger(cols, chunk, conn)
        frame.to_sql("train",con=conn, if_exists="append", index=False)

In [47]:
# NOTE: protein_writer appends to the `train` table, so re-running this cell without
# dropping that table first duplicates its rows.
conn = sqlite3.connect("protein_training.db")
try:
    protein_writer(all_cols, proteins.protein.tolist(), 5000, conn)
finally:
    # Release the database handle even if a chunk fails part-way through.
    conn.close()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [44]:
# Seconds per iteration paired position-by-position with the chunk size it belongs to.
# NOTE(review): presumably read off the progress bar of earlier protein_writer runs — confirm.
c_size = [10, 50, 500, 1000, 5000, 10000]
s_its = [1.99, 2.66, 5.61, 8.46, 26.66, 49.14]
# Total number of proteins to process.
n = proteins.protein.shape[0]

sits_df = pd.DataFrame({"sec_per_it": s_its, "chunk_size": c_size})

In [45]:
# Iterations needed at each chunk size, then the projected total run time in seconds.
sits_df['total_its'] = n / sits_df['chunk_size']
sits_df['time_to_completion'] = sits_df.sec_per_it * sits_df.total_its
sits_df

Unnamed: 0,sec_per_it,chunk_size,total_its,time_to_completion
0,1.99,10,2045.4,4070.346
1,2.66,50,409.08,1088.1528
2,5.61,500,40.908,229.49388
3,8.46,1000,20.454,173.04084
4,26.66,5000,4.0908,109.060728
5,49.14,10000,2.0454,100.510956
