In [37]:
import re
import zlib
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.feather as feather
import seaborn as sns

# Apply the 'ggplot' style sheet to every figure created in this notebook.
plt.style.use('ggplot')

### Load in the raw protein sequences

In [38]:
# Read the raw protein sequences from the Feather file into a DataFrame.
protein_sequences = feather.read_feather(
    'train-ready_data/protein_sequences.feather'
)

# Leave the first five rows as the cell's last expression so they are displayed.
protein_sequences.head(n=5)

Unnamed: 0,sequence
P20536,MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIP...
O73864,MTEYRNFLLLFITSLSVIYPCTGISWLGLTINGSSVGWNQTHHCKL...
O95231,MRLSSSPPRGPQQLSSFGSVDWLSQSSCSGPTHTPRPADFSLGSLP...
A0A0B4J1F4,MGGEAGADGPRGRVKSLGLVFEDESKGCYSSGETVAGHVLLEAAEP...
P54366,MVETNSPPAGYTLKRSPSDLGEQQQPPRQISRSPGNTAAYHLTTAM...


In [39]:
# Work on an independent (deep) copy so the feature columns added below never
# modify the raw `protein_sequences` frame.
proteins_featurized = protein_sequences.copy(deep=True)

### Bag of words model

Inspired by Natural Language Processing paradigms, let's create an adapted version of the bag-of-words model. Instead of words, we have amino acids. We will count the number of times each amino acid appears in the sequence and create a vector with the counts.

In [40]:
# Collect the alphabet: every distinct character that occurs in at least one
# sequence, as a sorted list. Starting from an empty set keeps this well-defined
# even if there are no sequences at all.
train_alphabet = sorted(set().union(*protein_sequences['sequence']))

# Bag-of-words features: one column per character, holding the number of times
# that character occurs in each sequence.
for letter in train_alphabet:
    print(f'Processing {letter}...', end='\r')
    # Series.str.count interprets its argument as a regular expression. The
    # alphabet comes from the data, so escape each character to count it
    # literally; an unescaped '*' would raise and '.' would match everything.
    proteins_featurized[letter] = protein_sequences['sequence'].str.count(re.escape(letter))

# Add the number of distinct characters in each sequence as a feature
proteins_featurized['unique_letters'] = protein_sequences['sequence'].apply(lambda x: len(set(x)))

Processing Z...

### Length and compression features

We will also add the length of the sequence and the compression ratio of the sequence. The compression ratio is defined as the ratio between the length of the compressed sequence and the length of the original sequence, so lower values indicate more repetitive (more compressible) sequences. The compression is done with zlib, which implements the DEFLATE algorithm (LZ77 combined with Huffman coding).

In [41]:
# Add the length of each sequence (number of characters) as a feature
proteins_featurized['sequence_length'] = protein_sequences['sequence'].str.len()

# Add the size in bytes of the zlib-compressed, UTF-8 encoded sequence as a feature
# (zlib is imported in the notebook's top import cell)
proteins_featurized['compressed_sequence_length'] = protein_sequences['sequence'].apply(
    lambda x: len(zlib.compress(x.encode('utf-8')))
)

# Add the compression ratio as a feature: compressed size divided by original
# length, so lower values mean the sequence compresses better.
# NOTE(review): a zero-length sequence would divide by zero here — confirm the
# input never contains empty sequences.
proteins_featurized['compression_ratio'] = (
    proteins_featurized['compressed_sequence_length']
    / proteins_featurized['sequence_length']
)

### Save the post-processed data

My hope is that the data we save here is the final version of the training data we will use. Essentially, `labels_df` is a pure y vector of labels, and the feature table saved below (`proteins_featurized`, derived from the training sequences) is a pure X matrix of features.

In [42]:
# Drop the raw sequence text so that only the engineered feature columns are saved.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to the
# same name, so on a second run the column is already gone and a plain drop
# would raise KeyError.
proteins_featurized = proteins_featurized.drop(columns='sequence', errors='ignore')

# Persist the feature table next to the input data.
feather.write_feather(proteins_featurized, 'train-ready_data/proteins_featurized.feather')