# Data Processing Exploration

I'd like to try a few things including:
 - Use different n-gram widths
 - Find performance difference for overlapping vs non-overlapping subsequences
 - Utilize more of the sparse columns

In [1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Helper functions
import sys
import os
# Add the parent directory to the module search path so `src.functions` can be
# imported (assumes the notebook runs from a subdirectory of the project root).
sys.path.append(os.pardir)
# NOTE(review): the wildcard import hides where names come from;
# `get_training_data` used below is presumably defined in src.functions — confirm.
from src.functions import *

# Display the relative parent-directory token that was appended to sys.path above.
os.pardir

In [2]:
# Load the training data (helper presumably comes from the wildcard import of src.functions).
# X is used later in this notebook as a DataFrame with a `sequence` column; y is not inspected here.
X, y = get_training_data()

I'll make it a little easier to switch between permutation sizes

In [47]:
def get_perms(n, repeat=False):
    """Return every length-``n`` subsequence that can be built from the bases ``CATGN``.

    Args:
        n (int): Length of each subsequence.
        repeat (bool): If False (default), each base may appear at most once per
            subsequence (``itertools.permutations``), so strings such as ``'AA'``
            are never produced and the result is empty for ``n > 5``.
            If True, bases may repeat (``itertools.product``), which yields all
            ``5 ** n`` possible n-grams.

    Returns:
        list: The subsequences as strings, in the order itertools generates them.
    """
    from itertools import permutations, product
    bases = 'CATGN'
    if repeat:
        # Cartesian product: every n-gram, including ones with repeated bases
        combos = product(bases, repeat=n)
    else:
        # Original behaviour: arrangements of distinct bases only
        combos = permutations(bases, n)
    return [''.join(combo) for combo in combos]

In [30]:
# Flatten the permutations for widths 1 through 5 into a single list
all_perms = [subseq for width_perms in map(get_perms, range(1, 6)) for subseq in width_perms]

Now I'd like to have a function that can produce overlapping substring counts

As shown below, the `str.count()` method does not do this: it only counts non-overlapping occurrences.


In [31]:
# Toy example: 'jakeandjake' starts at index 0 and again, overlapping the first match, at index 7
seq = 'jakeandjakeandjake'
subseq = 'jakeandjake'
# str.count() only counts non-overlapping matches, so this reports 1
seq.count(subseq)

1

In [37]:
# Starting the search at index 2 skips the match at index 0 and finds the overlapping one at index 7
seq.find(subseq, 2)

7

In [44]:
def find_overlapping(seq, subseq):
    """Count the occurrences of `subseq` in `seq`, including overlapping ones.

    Args:
        seq (str): The string to search.
        subseq (str): The substring to look for.

    Returns:
        int: Number of start positions in `seq` at which `subseq` is found.
    """
    hits = 0
    start = seq.find(subseq)
    while start != -1:
        hits += 1
        # Resume one character past the last hit so overlapping matches are seen
        start = seq.find(subseq, start + 1)
    return hits
    
# Unlike str.count(), both overlapping occurrences in the toy example are counted
find_overlapping(seq, subseq)

2

Awesome, I'll put this in the [functions file](../src/functions.py) and integrate it into the existing `get_ngram_features()` function

In [45]:
def get_ngram_features(data, subsequences, overlapping=False):
    """Generate counts for each subsequence.

    Args:
        data (DataFrame): The data you want to create features from. Must include a "sequence" column.
        subsequences (list): A list of subsequences to count. Each one is matched
            literally, even if it contains regex metacharacters.
        overlapping (bool): True if you want overlapping counts, False by default

    Returns:
        DataFrame: A DataFrame with one column for each subsequence, sharing the index of `data`.
    """
    import re

    sequences = data.sequence
    counts = {}

    for subseq in subsequences:
        if overlapping:
            counts[subseq] = sequences.apply(find_overlapping, args=(subseq, ))
        else:
            # Series.str.count() interprets its argument as a regular expression,
            # so escape it to count the literal subsequence
            counts[subseq] = sequences.str.count(re.escape(subseq))

    # Build the frame in one step; inserting hundreds of columns one at a time
    # fragments the DataFrame and triggers a pandas PerformanceWarning
    return pd.DataFrame(counts, index=data.index)


In [67]:
# Baseline: non-overlapping counts of every 4-base permutation for the first five rows
get_ngram_features(X[:5], get_perms(4), overlapping=False)

Unnamed: 0_level_0,CATG,CATN,CAGT,CAGN,CANT,CANG,CTAG,CTAN,CTGA,CTGN,...,NTAC,NTAG,NTGC,NTGA,NGCA,NGCT,NGAC,NGAT,NGTC,NGTA
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9ZIMC,28,0,25,0,0,0,13,0,44,0,...,0,0,0,0,0,0,0,0,0,0
5SAQC,2,0,3,0,0,0,1,0,6,0,...,0,0,0,0,0,0,0,0,0,0
E7QRO,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,1,0,0,0,0,0
CT5FP,3,0,3,0,0,0,6,0,8,0,...,0,0,0,0,0,0,0,0,0,0
7PTD8,7,0,4,0,0,1,2,0,4,0,...,0,0,0,0,1,0,0,0,0,0


In [68]:
# Same rows and subsequences, this time with overlapping counts, for comparison with the baseline
get_ngram_features(X[:5], get_perms(4), overlapping=True)

Unnamed: 0_level_0,CATG,CATN,CAGT,CAGN,CANT,CANG,CTAG,CTAN,CTGA,CTGN,...,NTAC,NTAG,NTGC,NTGA,NGCA,NGCT,NGAC,NGAT,NGTC,NGTA
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9ZIMC,28,0,25,0,0,0,13,0,44,0,...,0,0,0,0,0,0,0,0,0,0
5SAQC,2,0,3,0,0,0,1,0,6,0,...,0,0,0,0,0,0,0,0,0,0
E7QRO,0,0,0,0,0,0,0,0,2,2,...,0,0,0,0,1,0,0,0,0,0
CT5FP,3,0,3,0,0,0,6,0,8,0,...,0,0,0,0,0,0,0,0,0,0
7PTD8,7,0,4,0,0,1,2,0,4,0,...,0,0,0,0,1,0,0,0,0,0


In [66]:
# Peek at the raw sequences behind the five rows used in the feature tables
X.sequence[:5]

sequence_id
9ZIMC    CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...
5SAQC    GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...
E7QRO    NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...
CT5FP    GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...
7PTD8    CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...
Name: sequence, dtype: object

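I believe it worked; I will check in the morning. Note that the two tables above are identical, which is expected for this input: `get_perms()` is built on `itertools.permutations`, so no subsequence contains a repeated base, and a string made of distinct characters can never overlap with itself. Overlapping and non-overlapping counts can only differ for subsequences with repeated bases (e.g. `AAAA` or `CACA`), so this comparison does not yet exercise the overlapping code path.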