# Data Processing Exploration

I'd like to try a few things including:
 - Use different n-gram widths
 - Find performance difference for overlapping vs non-overlapping subsequences
 - Utilize more of the sparse columns

In [2]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Helper functions
import sys
import os
sys.path.append(os.pardir)
from src.functions import *

os.pardir

In [36]:
# Load the training features (X) and labels (y); get_training_data is
# presumably provided by the star-import from src.functions — TODO confirm
X, y = get_training_data()
# For each row, keep the label of the column holding the row maximum
# (presumably y is one-hot encoded with one column per lab — TODO confirm)
lab_ids = pd.DataFrame(y.idxmax(axis=1), columns=['lab_id'])

I'll make it a little easier to switch between permutation sizes

In [47]:
def get_perms(n, bases='CATGN', allow_repeats=False):
    """Return every length-n subsequence that can be built from `bases`.

    Args:
        n (int): Length of each subsequence.
        bases (str): Alphabet the subsequences are drawn from.
        allow_repeats (bool): If False (default), each base is used at most
            once per subsequence (true permutations), so e.g. 'AAT' is never
            produced and any n > len(bases) yields an empty list. If True,
            bases may repeat, giving all len(bases)**n possible n-grams.

    Returns:
        list: The subsequences as strings, in itertools generation order.
    """
    from itertools import permutations, product

    if allow_repeats:
        # Cartesian product covers n-grams such as 'AAA' that permutations skip
        combos = product(bases, repeat=n)
    else:
        combos = permutations(bases, n)
    return [''.join(combo) for combo in combos]

In [5]:
# Gather the permutations for every width from 1 through 5 into one flat list
all_perms = []
for width in range(1, 6):
    all_perms.extend(get_perms(width))

Now I'd like to have a function that can produce overlapping substring counts

As shown below, the `str.count()` method does not do this


In [6]:
# Toy example: 'jakeandjake' occurs twice in seq, but the two occurrences overlap
seq = 'jakeandjakeandjake'
subseq = 'jakeandjake'
# str.count only reports non-overlapping occurrences
seq.count(subseq)

1

In [7]:
# Restarting the search at index 2 reveals the second, overlapping occurrence
seq.find(subseq, 2)

7

In [8]:
def find_overlapping(seq, subseq):
    """Count the occurrences of `subseq` in `seq`, including overlapping ones.

    Args:
        seq (str): The string to search in.
        subseq (str): The substring to look for.

    Returns:
        int: Number of start positions in `seq` at which `subseq` occurs.
    """
    matches = 0
    start = seq.find(subseq)
    while start != -1:
        matches += 1
        # Resume one character after the previous hit so that matches
        # sharing characters with it are still found
        start = seq.find(subseq, start + 1)
    return matches
    
# Unlike str.count, both overlapping occurrences are counted
find_overlapping(seq, subseq)    

2

Awesome, I'll put this in the [functions file](../src/functions.py) and integrate it into the existing `get_ngram_features()` function

In [45]:
def get_ngram_features(data, subsequences, overlapping=False):
    """Generate counts for each subsequence.

    Args:
        data (DataFrame): The data you want to create features from. Must include a "sequence" column.
        subsequences (list): A list of subsequences to count. Each one is matched literally.
        overlapping (bool): True if you want overlapping counts (via find_overlapping),
            False by default (non-overlapping counts).

    Returns:
        DataFrame: A DataFrame with one column for each subsequence, indexed like `data`.
    """
    import re

    sequences = data.sequence
    counts = {}

    for subseq in subsequences:
        if overlapping:
            counts[subseq] = sequences.apply(find_overlapping, args=(subseq, ))
        else:
            # Series.str.count interprets its argument as a regular expression;
            # escaping keeps subsequences containing metacharacters literal
            counts[subseq] = sequences.str.count(re.escape(subseq))

    # Build the frame in one step: inserting columns one at a time fragments
    # the DataFrame and triggers pandas' PerformanceWarning on wide feature sets
    return pd.DataFrame(counts, index=data.index)


In [25]:
# Non-overlapping counts of every 3-base permutation for the first 50 rows
a = get_ngram_features(X[:50], get_perms(3), overlapping=False)
a.head()

Unnamed: 0_level_0,CAT,CAG,CAN,CTA,CTG,CTN,CGA,CGT,CGN,CNA,...,NCG,NAC,NAT,NAG,NTC,NTA,NTG,NGC,NGA,NGT
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9ZIMC,103,157,0,82,137,0,101,75,0,0,...,0,0,0,0,0,0,0,0,0,0
5SAQC,6,10,0,2,14,0,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
E7QRO,1,8,1,3,11,0,10,0,0,0,...,2,0,0,3,0,0,0,2,2,0
CT5FP,18,20,0,12,25,0,10,3,0,0,...,0,0,0,0,0,0,0,0,0,0
7PTD8,19,17,2,12,21,0,15,8,0,0,...,1,0,0,0,0,0,0,2,1,0


In [26]:
# Same rows and subsequences, this time with overlapping counts
b = get_ngram_features(X[:50], get_perms(3), overlapping=True)
b.head()

Unnamed: 0_level_0,CAT,CAG,CAN,CTA,CTG,CTN,CGA,CGT,CGN,CNA,...,NCG,NAC,NAT,NAG,NTC,NTA,NTG,NGC,NGA,NGT
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9ZIMC,103,157,0,82,137,0,101,75,0,0,...,0,0,0,0,0,0,0,0,0,0
5SAQC,6,10,0,2,14,0,3,2,0,0,...,0,0,0,0,0,0,0,0,0,0
E7QRO,1,8,1,3,11,0,10,0,0,0,...,2,0,0,3,0,0,0,2,2,0
CT5FP,18,20,0,12,25,0,10,3,0,0,...,0,0,0,0,0,0,0,0,0,0
7PTD8,19,17,2,12,21,0,15,8,0,0,...,1,0,0,0,0,0,0,2,1,0


In [27]:
# Check whether the two counting modes produce identical feature frames
assert a.equals(b)

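Okay, I am just now realizing that a permutation can never overlap with itself: each base appears at most once, so no prefix of the subsequence can equal one of its suffixes. The overlapping and non-overlapping counts are therefore identical here.
This function will still be useful when I start checking for common subsequences that are *not* permutations.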

### Submitting

For now I'll make a quick pipeline to produce submittable predictions. The `test_values.csv` file is in the data directory

In [37]:
# Swap the raw 'sequence' column for 3-base permutation counts and keep the remaining columns.
# NOTE(review): this rebinds X, so the cell cannot be re-run without reloading the data first;
# a second run would no longer find a 'sequence' column.
X = get_ngram_features(X, get_perms(3)).join(X.drop('sequence', axis=1))

For testing I'll use the Multinomial Naive-Bayes classifier because it is good at handling integer 'counted' features like I have here.

In [38]:
from sklearn.naive_bayes import MultinomialNB

# ravel() flattens the single-column lab_ids frame into the 1-D label array passed to fit
mnb = MultinomialNB()
mnb.fit(X, lab_ids.values.ravel())

MultinomialNB()

Now I'll load in the test values and preprocess in the same way

In [48]:
# Read the submission template and the test features, both indexed by sequence_id
submission_format = pd.read_csv('../data/submission_format_3TFRxH6.csv', index_col='sequence_id')
X_test = pd.read_csv('../data/test_values.csv', index_col='sequence_id')

In [50]:
# Apply the same featurization as for training: 3-base permutation counts plus the non-sequence columns.
# NOTE(review): this rebinds X_test, so re-running the cell requires re-reading test_values.csv first.
X_test = get_ngram_features(X_test, get_perms(3)).join(X_test.drop('sequence', axis=1))

Get my predictions

In [54]:
# Class probabilities for every test sequence; rows follow X_test's row order
# and columns follow mnb.classes_
probas = mnb.predict_proba(X_test)

# Sanity check
assert probas.shape == submission_format.shape, 'wrong shape'
assert (mnb.classes_ == submission_format.columns).all(), 'wrong columns'
# The probabilities are later labelled with submission_format's index, so the
# test features must list the sequences in exactly the same order
assert X_test.index.equals(submission_format.index), 'wrong row order'

Make predictions into a DataFrame and export into the `data` directory as `submission.csv`

In [61]:
# Label the probability matrix: one column per class, one row per sequence id
# taken from the submission template
submission = pd.DataFrame(data=probas, 
                          columns=mnb.classes_, 
                          index=submission_format.index)

# to_csv writes the index by default, so the sequence ids are included in the file
submission.to_csv('../data/submission.csv')