# Amino acid characteristics

Let's add some features for each element of the n-gram (i.e., each amino acid).

1. Class (aliphatic, basic, amide, acid, sulfur-containing, aromatic, basic aromatic, cyclic, hydroxyl-containing)
2. Side chain polarity
3. Side chain charge
4. Hydropathy index
5. Molecular Weight

In [1]:
import math
import sqlite3
import time

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook

In [2]:
# Per-residue lookup table keyed by one-letter amino acid code.
#
# Every entry carries the same five keys, always in this order (ngramMapper relies
# on the insertion order when it flattens an entry into a feature list):
#   side_chain_class    - chemical class of the side chain
#   side_chain_polarity - nonpolar / polar / acidic polar / basic polar
#   side_chain_charge   - neutral / positive / negative
#   hydropathy_index    - numeric hydropathy value (values match the Kyte-Doolittle scale)
#   molecular_weight    - numeric molecular weight of the amino acid
amino_acid_dict = {
    "A":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":1.8,
        "molecular_weight":89.094
    },
    "R":{
        "side_chain_class":"basic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"positive",
        "hydropathy_index":-4.5,
        "molecular_weight":174.203
    },
    "N":{
        "side_chain_class":"amide",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.5,
        "molecular_weight":132.119
    },
    "D":{
        "side_chain_class":"acid",
        "side_chain_polarity":"acidic polar",
        "side_chain_charge":"negative",
        "hydropathy_index":-3.5,
        "molecular_weight":133.104
    },
    "C":{
        "side_chain_class":"sulfur-containing",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":2.5,
        "molecular_weight":121.154
    },
    # Glutamic acid is an acidic residue like aspartic acid ("D"): its side chain
    # is acidic polar and negatively charged, not nonpolar/neutral.
    "E":{
        "side_chain_class":"acid",
        "side_chain_polarity":"acidic polar",
        "side_chain_charge":"negative",
        "hydropathy_index":-3.5,
        "molecular_weight":147.131
    },
    "Q":{
        "side_chain_class":"amide",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.5,
        "molecular_weight":146.146
    },
    "G":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.4,
        "molecular_weight":75.067
    },
    "H":{
        "side_chain_class":"basic aromatic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.2,
        "molecular_weight":155.156
    },
    "I":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":4.5,
        "molecular_weight":131.175
    },
    "L":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":3.8,
        "molecular_weight":131.175
    },
    "K":{
        "side_chain_class":"basic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"positive",
        "hydropathy_index":-3.9,
        "molecular_weight":146.189
    },
    "M":{
        "side_chain_class":"sulfur-containing",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":1.9,
        "molecular_weight":149.208
    },
    "F":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":2.8,
        "molecular_weight":165.192
    },
    "P":{
        "side_chain_class":"cyclic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-1.6,
        "molecular_weight":115.132
    },
    "S":{
        "side_chain_class":"hydroxyl-containing",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.8,
        "molecular_weight":105.093
    },
    "T":{
        "side_chain_class":"hydroxyl-containing",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.7,
        "molecular_weight":119.119
    },
    "W":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.9,
        "molecular_weight":204.228
    },
    "Y":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-1.3,
        "molecular_weight":181.191
    },
    "V":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":4.2,
        "molecular_weight":117.148
    }
}

In [3]:
# Load every 5-gram (one row per protein / gram position) from the training database.
conn = sqlite3.connect("protein_training.db")
try:
    ngrams_df = pd.read_sql(
        'SELECT protein, gram_num, gram_1, gram_2, gram_3, gram_4, gram_5 from protein_ngram',
        con=conn
    )
finally:
    # Release the connection even when the query raises, so a failed cell run
    # does not leave an open handle behind in the kernel.
    conn.close()
ngrams_df.sample(5)

Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5
9983039,PLCL1_HUMAN,922,S,L,K,Q,C
9090249,DIP2B_HUMAN,949,Q,P,G,V,G
11203698,NU205_HUMAN,1593,E,T,D,P,Q
3146096,CAN15_HUMAN,228,R,V,P,P,F
4026919,GUC2F_HUMAN,204,R,V,A,S,A


In [4]:
# Distinct values found in any of the five gram columns: reshape the gram columns
# into one long `value` column, then de-duplicate it.
id_cols = ['protein','gram_num']
gram_cols = ['gram_1','gram_2','gram_3','gram_4','gram_5']

aa_vals = (
    ngrams_df[id_cols + gram_cols]
    .melt(id_vars=id_cols, value_vars=gram_cols)
    ['value']
    .unique()
)

In [5]:
# Print every residue code seen in the data that has no entry in amino_acid_dict.
# "U" is looked up as "C", matching the substitution ngramMapper applies.
for residue in aa_vals:
    lookup_code = "C" if residue == "U" else residue
    if lookup_code not in amino_acid_dict:
        print(lookup_code)

In [6]:
def ngramMapper(grams):
    """Flatten the amino_acid_dict features of each gram into one list.

    Parameters
    ----------
    grams : iterable of str
        One-letter residue codes, in n-gram order.

    Returns
    -------
    list
        The values of each residue's amino_acid_dict entry, concatenated in the
        order the grams were given (entry values keep the dict's insertion order).

    Raises
    ------
    KeyError
        If a code (after the "U" -> "C" substitution) is not in amino_acid_dict.
    """
    # "U" has no entry of its own, so it borrows the features of "C"
    # (presumably selenocysteine treated as cysteine -- confirm with the data owner).
    return [
        feature_value
        for code in grams
        for feature_value in list(amino_acid_dict["C" if code == "U" else code].values())
    ]

In [None]:
# Find which chunk_size is the fastest.
#
# For each candidate size, time one pass of the per-chunk pipeline over the first
# `test_chunk` rows: map the five grams of every row to their amino acid features,
# build the feature frame, and join it back onto the identifying columns.
#
# NOTE: an earlier version of this cell also ran a second step that referenced
# `ngram_slice`, `ngramFeatureArrayizer`, `ngram_feat_cols` and a `seq` column.
# None of those are defined in this notebook (and the query that builds
# `ngrams_df` selects no `seq` column), so that step could only raise NameError
# and has been dropped.
gram_cols = ['gram_1','gram_2','gram_3','gram_4','gram_5']

# One output column per (gram, feature) pair, in the order ngramMapper emits the
# values: every feature of gram_1 first, then gram_2, and so on.
gram_val_cols = [
    gram_col + "_" + feature_name
    for gram_col in gram_cols
    for feature_name in amino_acid_dict["A"]
]

measured_times = []
for test_chunk in tqdm_notebook([100, 500, 1000, 1500, 2000, 4000, 8000, 10000, 15000, 20000]):
    # perf_counter is a monotonic, high-resolution clock intended for timing intervals.
    start_time = time.perf_counter()

    start = 0
    end = test_chunk

    ngram_df_chunk = ngrams_df.iloc[start:end,:]

    chunk_vals = ngram_df_chunk.apply(
        lambda row: ngramMapper(row[gram_cols].values),
        axis=1
    ).values.tolist()

    # Reuse the chunk's own index so the concat below aligns row-for-row, even
    # when the table holds fewer than `test_chunk` rows.
    chunk_df = pd.DataFrame(chunk_vals, columns=gram_val_cols, index=ngram_df_chunk.index)

    # Built only so the join cost is included in the measurement.
    write_df = pd.concat(
        [ngram_df_chunk[['protein','gram_num']], chunk_df],
        axis=1,
        sort=False
    )

    spent_time = time.perf_counter() - start_time

    measured_times.append([test_chunk, spent_time])

In [None]:
# Extrapolate each measured chunk time to the whole table:
#   iterations_needed  = total rows / chunk size
#   time_to_completion = iterations * time per chunk, divided by 60 twice
#                        (hours, given time_spent is measured in seconds)
# Rows end up ordered from the slowest estimate to the fastest.
time_df = (
    pd.DataFrame(measured_times, columns=['chunk_size','time_spent'])
    .assign(iterations_needed=lambda frame: ngrams_df.shape[0] / frame['chunk_size'])
    .assign(
        time_to_completion=lambda frame: (frame['iterations_needed'] * frame['time_spent']) / 60 / 60
    )
    .sort_values('time_to_completion', ascending=False)
)
time_df.plot(kind='bar', x="chunk_size", y="time_to_completion")

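Looks like the ideal chunk size is X. (**TODO:** replace X with the chunk size that shows the lowest estimated time to completion in the plot above.)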