In [16]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import sklearn
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math
from functools import reduce

In [54]:
conn = sqlite3.connect("protein_training.db")

proteins_df = pd.read_sql("SELECT DISTINCT protein FROM protein_ngram", con=conn)
amy_df = pd.read_sql("SELECT DISTINCT protein FROM amyloid", con=conn)

# Sanity check that all proteins are ones I plan on using
print(proteins_df[proteins_df['protein'].isin(amy_df['protein'])==False].shape[0])

conn.close()
# Lets chunk this by protein name
chunk_size = 2000
chunk_list = list(range(0, proteins_df.shape[0],chunk_size))
chunk_pairs = [[chunk_list[i-1], chunk_list[i]] for i, x in enumerate(chunk_list) if i != 0]
chunk_pairs.append([chunk_pairs[-1][-1]+1, proteins_df.shape[0]])
print(chunk_pairs[:5],"...",chunk_pairs[-5:])

0
[[0, 2000], [2000, 4000], [4000, 6000], [6000, 8000], [8000, 10000]] ... [[12000, 14000], [14000, 16000], [16000, 18000], [18000, 20000], [20001, 20454]]


In [55]:
proteins_df.shape[0]

20454

In [18]:
# Iterant 1, join all

# conn = sqlite3.connect("protein_training.db")

# for chunk in tqdm_notebook(chunk_pairs[:2]):
#     start = chunk[0]
#     end = chunk[1]
    
#     protein_chunk = proteins_df.iloc[start:end,:]
#     prot_list = "("+", ".join(["'"+x+"'" for x in protein_chunk.protein.values.tolist()])+")"
    
#     train_chunk = pd.read_sql(
#         "SELECT * \
#         FROM \
#             protein_ngram as a \
#             INNER JOIN protein_seq_features as b ON b.protein = a.protein \
#             INNER JOIN protein_ngram_features as c ON c.protein = a.protein AND c.gram_num = a.gram_num \
#             INNER JOIN protein_subngram_features as d ON d.protein = a.protein AND d.gram_num = a.gram_num \
#         WHERE a.protein IN "+prot_list,
#         con=conn
#     )
    
#     display(train_chunk.sample(5))
    
# conn.close()

# Took 148 sec per iterant

In [47]:
# Iterant 2, select subset, then join

conn = sqlite3.connect("protein_training.db")

for chunk in tqdm_notebook(chunk_pairs):
    start = chunk[0]
    end = chunk[1]
    
    protein_chunk = proteins_df.iloc[start:end,:]
    prot_list = "("+", ".join(["'"+x+"'" for x in protein_chunk.protein.values.tolist()])+")"
    
    ngram_chunk = pd.read_sql(
        "SELECT * FROM protein_ngram WHERE protein IN "+prot_list, 
        con=conn
    )
    
    ngram_feat_chunk = pd.read_sql(
        "SELECT * FROM protein_ngram_features WHERE protein IN "+prot_list, 
        con=conn
    )
    
    ngram_sub_feat_chunk = pd.read_sql(
        "SELECT * FROM protein_subngram_features WHERE protein IN "+prot_list, 
        con=conn
    )
    
    frames = [ngram_chunk, ngram_feat_chunk, ngram_sub_feat_chunk]
    
    train_chunk = reduce(
        lambda left,right: pd.merge(
            left,right,on=['protein','gram_num'],
            how='outer'
        ),
        frames
    )
    
    protein_chunk = pd.read_sql(
        "SELECT * FROM protein_seq_features WHERE protein IN "+prot_list,
        con=conn
    )
    
    train_chunk = train_chunk.merge(protein_chunk, on=["protein"])
        
    train_chunk.to_sql("train_final", con=conn, index=False, if_exists="append")
    
conn.close()

# Takes 67 sec per iterant

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

In [28]:
# Sanity check
conn = sqlite3.connect("protein_training.db")

c = conn.cursor()

nrow_protein_ngram = [x[0] for x in c.execute("SELECT COUNT(*) FROM protein_ngram")][0]
nrow_protein_ngram_features = [x[0] for x in c.execute("SELECT COUNT(*) FROM protein_ngram_features")][0]
nrow_protein_subngram_features = [x[0] for x in c.execute("SELECT COUNT(*) FROM protein_subngram_features")][0]

conn.close()

nrow_protein_ngram == nrow_protein_ngram_features == nrow_protein_subngram_features

False

In [29]:
nrow_protein_ngram

11319554

In [30]:
nrow_protein_ngram_features

11320684

In [31]:
nrow_protein_subngram_features

11319552

In [48]:
# Check for null values in all the columns

conn = sqlite3.connect("protein_training.db")

c = conn.cursor()

train_columns = [x[1] for x in c.execute("PRAGMA table_info(train_final)")]

conn.close()

In [50]:
conn = sqlite3.connect("protein_training.db")

c = conn.cursor()

col_nulls = {}

for col in tqdm_notebook(train_columns):
    na_cells = [x[0] for x in c.execute(
        "SELECT COUNT(*) \
        FROM train_final \
            WHERE "+col+" IS NULL"
    )][0]
    
    col_nulls[col] = na_cells

conn.close()

HBox(children=(IntProgress(value=0, max=119), HTML(value='')))

In [51]:
col_nulls

{'protein': 0,
 'gram_num': 0,
 'gram_1': 0,
 'gram_2': 0,
 'gram_3': 0,
 'gram_4': 0,
 'gram_5': 0,
 'accession': 0,
 'amyloid_1': 0,
 'amyloid_2': 0,
 'amyloid_3': 0,
 'amyloid_4': 0,
 'amyloid_5': 0,
 'gram_frac': 0,
 'gram_mol_weight': 0,
 'gram_arom': 0,
 'gram_insta': 0,
 'gram_gravy': 0,
 'gram_isoel': 0,
 'gram_helix_perc': 0,
 'gram_turn_perc': 0,
 'gram_sheet_perc': 0,
 'gram_reduced_cys_num': 0,
 'gram_disulfide_num': 0,
 'gram_1_side_class': 1,
 'gram_1_side_polarity': 1,
 'gram_1_side_charge': 1,
 'gram_1_hydropathy_index': 1,
 'gram_1_mol_weight': 1,
 'gram_2_side_class': 1,
 'gram_2_side_polarity': 1,
 'gram_2_side_charge': 1,
 'gram_2_hydropathy_index': 1,
 'gram_2_mol_weight': 1,
 'gram_3_side_class': 1,
 'gram_3_side_polarity': 1,
 'gram_3_side_charge': 1,
 'gram_3_hydropathy_index': 1,
 'gram_3_mol_weight': 1,
 'gram_4_side_class': 1,
 'gram_4_side_polarity': 1,
 'gram_4_side_charge': 1,
 'gram_4_hydropathy_index': 1,
 'gram_4_mol_weight': 1,
 'gram_5_side_class': 1,