# Swerve

Building a model off of essentially the protein features didn't produce all that much that was interesting. Lets go back and engineer some more n-gram level features and bake those into a different prediction algorithm.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import sklearn
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math

In [2]:
conn = sqlite3.connect("protein_training.db")
ngrams_df = pd.read_sql('SELECT * from protein_ngram',con=conn)
protein_gram_num = pd.read_sql("SELECT DISTINCT protein, COUNT(gram_num) as gram_total FROM protein_ngram GROUP BY protein", con=conn)
conn.close()
ngrams_df.drop("accession",axis=1,inplace=True)

In [3]:
# Add full n-gram sequence to ngram df
ngrams_df.loc[:,'seq'] = ngrams_df['gram_1'] + ngrams_df['gram_2'] + ngrams_df['gram_3'] + ngrams_df['gram_4'] + ngrams_df['gram_5']

In [4]:
ngrams_df.sample(5)

Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5,amyloid_1,amyloid_2,amyloid_3,amyloid_4,amyloid_5,seq
8386027,ZC3HD_HUMAN,12,N,T,K,T,I,0,0,0,0,0,NTKTI
1738395,ATS13_HUMAN,1421,W,K,G,K,E,0,0,0,0,0,WKGKE
2574371,CE192_HUMAN,1361,W,D,S,G,V,0,0,0,0,0,WDSGV
6099764,F184B_HUMAN,285,D,L,K,K,Y,0,0,0,0,0,DLKKY
957929,SCND3_HUMAN,191,S,F,L,W,F,0,0,0,0,0,SFLWF


In [5]:
protein_gram_num.sample(5)

Unnamed: 0,protein,gram_total
547,AGAL_HUMAN,425
1321,AT5G3_HUMAN,138
20330,ZNF16_HUMAN,678
14943,RPF1_HUMAN,345
14611,RIOK1_HUMAN,564


In [6]:
def ngramFeatureArrayizer(protein, gram_num, seq):
    """Analyze the ngram sequence as a whole"""
    feat_array = []
    prot_total_grams = protein_gram_num[protein_gram_num['protein']==protein]["gram_total"].values[0]
    gram_frac = round(gram_num / prot_total_grams, 3)
    object_ = ProteinAnalysis(seq)
    object_no_u = ProteinAnalysis(seq.replace("U","C"))
    # Mol Weight
    mol_weight = object_.molecular_weight()
    # Aromaticiy
    arom = object_.aromaticity()
    # Instability index
    try:
        insta_in = object_.instability_index()
    except:
        insta_in = object_no_u.instability_index()
    # No flexibility because the size has to be 9 amino acids at the very least
    # Gravy
    try:
        gravy_ = object_.gravy()
    except:
        gravy_ = object_no_u.gravy()
    # Isoelectric point
    try:
        iso_el = object_.isoelectric_point()
    except:
        iso_el = object_no_u.isoelectric_point()
    # Helix percentage
    sec_stru = object_.secondary_structure_fraction()
    hel_perc = sec_stru[0]
    # Turn Percentage
    turn_perc = sec_stru[1]
    # Sheet Percentage
    sheet_perc = sec_stru[2]
    # Reduced Cys Num
    mol_extinc_coeff = object_.molar_extinction_coefficient()
    red_cys = mol_extinc_coeff[0]
    # Disulfide Bridge Num
    oxi_cys = mol_extinc_coeff[1]
    feat_array.extend([
        gram_frac, mol_weight, arom, insta_in, gravy_, iso_el, hel_perc, 
        sheet_perc, turn_perc, red_cys, oxi_cys]
    )
    return feat_array

In [7]:
ngram_feat_cols = [
    "gram_frac","gram_mol_weight","gram_arom","gram_insta",
    "gram_gravy","gram_isoel","gram_helix_perc","gram_turn_perc","gram_sheet_perc",
    "gram_reduced_cys_num","gram_disulfide_num"]

In [18]:
# Lets chunk this
chunk_size = 10000
chunk_list = list(range(0, ngrams_df.shape[0],chunk_size))
chunk_pairs = [[chunk_list[i-1], chunk_list[i]] for i, x in enumerate(chunk_list) if i != 0]
chunk_pairs.append([chunk_pairs[-1][-1]+1, ngrams_df.shape[0]-1])
print(chunk_pairs[:5],"...",chunk_pairs[-5:])

[[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000]] ... [[11270000, 11280000], [11280000, 11290000], [11290000, 11300000], [11300000, 11310000], [11310001, 11319553]]


In [22]:
conn = sqlite3.connect("protein_training.db")

for i in tqdm_notebook(chunk_pairs):
    
    start = i[0]
    end = i[1]
    
    ngram_slice = ngrams_df.loc[start:end,:]
    
    ngram_slice_vals = ngram_slice.apply(
        lambda row: ngramFeatureArrayizer(
            row['protein'], 
            row['gram_num'],
            row['seq'], 
        ), axis=1).values.tolist()
    
    ngram_slice_df = pd.DataFrame(ngram_slice_vals, columns=ngram_feat_cols)
    
    ngram_feat_full_df = pd.concat([ngram_slice[["protein","gram_num"]], ngram_slice_df], axis=1)

    if start == 0:
        ngram_slice.to_sql('protein_ngram_features', con=conn, index=False, if_exists='replace')
    else:
        ngram_slice.to_sql('protein_ngram_features', con=conn, index=False, if_exists='append')
        
conn.close()

#ngram_feat_array = ngrams_df.apply(lambda row: ngramFeatureArrayizer(row['protein'], row['gram_num'], row['seq']), axis=1)

HBox(children=(IntProgress(value=0, max=1132), HTML(value='')))

Now that we've got the ngram sequence analyzed and recorded as a whole, lets look into the amino acids and put some features in based around those.