# Joint Embedding of Protein Signals and Amino Acid Sequences

This notebook aims to create joint embeddings of protein signals and their corresponding amino acid sequences in the same vector space.

Try:
- Take the raw signals and match them to their corresponding peptides using a joint embedding space and contrastive learning

TODO:
- Dataset
    - Read in df
    - Split df into train/test split
        - Use the exact splits from the other models (CNN, RF) to make them directly comparable
- Model
    - Create an encoder that takes in a signal reading and returns an embedding
    - Create an encoder that takes in an amino acid sequence and returns an embedding
    - Train
    - Test
- Evaluation
    - Create a dummy classifier (not sure how)
    - Compare to the CNN and RF models
    - Try to do a 1:1 comparison between existing models and my embedding model

In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import seaborn as sns
# import os, time
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import random
# import math
# import warnings
# import scipy.signal as scisignal
# import sklearn.utils.class_weight as class_weight
# from sklearn.decomposition import PCA


## Understanding data

In [23]:
# Canonical amino-acid ordering; a residue's position in this string is its integer index.
AMINO_ACIDS = 'CSAGTVNQMILYWFPHRKDE'

# index -> residue. Kept defined (not commented out) because get_data() below
# references `index_to_aa` in its signature, which is evaluated at definition time.
index_to_aa = list(AMINO_ACIDS)
# residue -> index, the inverse of `index_to_aa`.
aa_to_index = {aa: i for i, aa in enumerate(index_to_aa)}

# Maps each run name (the keys used to index `run_df`) to the 5-residue peptide of that run.
run_to_peptide = {
    '20220824_run02_a': 'HDKER',
    '20220826_run01_a': 'GNQST',
    '20220826_run02_a': 'FYWCP',
    '20220826_run03_a': 'AVLIM',
    '20220907_run01_a': 'GNQST',
    '20221010_run02_a': 'GNQST',
    '20221011_run02_a': 'HDKER',
    '20221026_run01_a': 'VGDNY',
    '20221028_run01_a': 'TWAFH',
    '20221028_run02_a': 'PRMQE',
    '20221107_run01_a': 'TWAFH',
    '20221108_run01_a': 'TWAFH',
    '20221109_run01_a': 'VGDNY',
    '20221109_run02_a': 'PRMQE',
    '20221109_run03_a': 'KSILC',
    '20221109_run04_a': 'FYWCP',
    '20221110_run02_a': 'AVLIM',
    '20221121_run01_a': 'KSILC',
    '20221122_run01_a': 'KSILC',
    '20221122_run02_a': 'PRMQE',
    '20221122_run03_a': 'KSILC',
    '20221213_run02_a': 'FYWCP',
    '20221214_run01_a': 'FYWCP',
    '20221214_run04_a': 'FYWCP',
    '20221219_run01_a': 'VGDNY',
    '20221220_run01_a': 'FYWCP',
    '20221220_run04_a': 'AVLIM',
    '20221221_run01_a': 'TWAFH',
    '20221221_run02_a': 'KSILC',
}

# Load the per-run signal table from disk (path is relative to the notebook's working directory).
# NOTE(review): the captured output suggests the frame is indexed by run name with a `data` column — confirm.
run_df = pd.read_json(path_or_buf='./data/run_df.json')

In [29]:
# Collect the summary lines first, then emit them one per line.
summary_lines = [
    f"Run to Peptide Mapping: {run_to_peptide}",
    f"Amino Acids to Index Mapping: {aa_to_index}",
    f"Number of unique peptides: {len(set(run_to_peptide.values()))}",
    f"Number of runs: {len(set(run_to_peptide.keys()))}",
    # `.name` of the first row is that row's index label, used here as the run-name lookup key.
    f"Example Peptide for run_df row 1 ({run_df.iloc[0].name}): {run_to_peptide.get(run_df.iloc[0].name, 'Unknown')}",
]
for summary_line in summary_lines:
    print(summary_line)

Run to Peptide Mapping: {'20220824_run02_a': 'HDKER', '20220826_run01_a': 'GNQST', '20220826_run02_a': 'FYWCP', '20220826_run03_a': 'AVLIM', '20220907_run01_a': 'GNQST', '20221010_run02_a': 'GNQST', '20221011_run02_a': 'HDKER', '20221026_run01_a': 'VGDNY', '20221028_run01_a': 'TWAFH', '20221028_run02_a': 'PRMQE', '20221107_run01_a': 'TWAFH', '20221108_run01_a': 'TWAFH', '20221109_run01_a': 'VGDNY', '20221109_run02_a': 'PRMQE', '20221109_run03_a': 'KSILC', '20221109_run04_a': 'FYWCP', '20221110_run02_a': 'AVLIM', '20221121_run01_a': 'KSILC', '20221122_run01_a': 'KSILC', '20221122_run02_a': 'PRMQE', '20221122_run03_a': 'KSILC', '20221213_run02_a': 'FYWCP', '20221214_run01_a': 'FYWCP', '20221214_run04_a': 'FYWCP', '20221219_run01_a': 'VGDNY', '20221220_run01_a': 'FYWCP', '20221220_run04_a': 'AVLIM', '20221221_run01_a': 'TWAFH', '20221221_run02_a': 'KSILC'}
Amino Acids to Index Mapping: {'C': 0, 'S': 1, 'A': 2, 'G': 3, 'T': 4, 'V': 5, 'N': 6, 'Q': 7, 'M': 8, 'I': 9, 'L': 10, 'Y': 11, 'W': 

In [None]:
## Train/Validation/Test Split
### Held-out runs mirror the splits used by the other models (CNN, RF) so results stay directly comparable
chosen_runs = [
    '20220824_run02_a',
    '20220826_run02_a',
    '20220826_run03_a',
    '20220907_run01_a',
    '20221213_run02_a',
    '20221214_run01_a',
]

class MyDataset(Dataset):
    """Paired dataset: item ``idx`` is ``(input_data[idx], output_data[idx])``.

    Args:
        input_data: indexable, sized collection of model inputs.
        output_data: indexable, sized collection of targets, aligned with
            ``input_data`` position by position.

    Raises:
        ValueError: if the two collections differ in length. Without this
            check the mismatch would only surface later as an IndexError, or
            be silently hidden because ``__len__`` looks at the inputs only.
    """

    def __init__(self, input_data, output_data):
        if len(input_data) != len(output_data):
            raise ValueError(
                f"input_data and output_data must have the same length, "
                f"got {len(input_data)} and {len(output_data)}"
            )
        self.input_data = input_data
        self.output_data = output_data

    def __len__(self):
        """Return the number of (input, output) pairs."""
        return len(self.input_data)

    def __getitem__(self, idx):
        """Return the ``(input, output)`` pair stored at position ``idx``."""
        return self.input_data[idx], self.output_data[idx]

def seed_worker(worker_id):
    """Seed Python's ``random`` and NumPy's global RNG from torch's initial seed.

    Intended for use as a ``DataLoader`` ``worker_init_fn``. ``worker_id`` is
    accepted to match that callback signature but is not used.
    """
    # Reduce torch's seed modulo 2**32 before handing it to NumPy and `random`.
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# train / test split: the chosen runs are held out for testing and every
# remaining run is used for training (no separate validation subset is built here).
train = run_df.drop(index=chosen_runs)
test = run_df.loc[chosen_runs, :]

# Peek at the held-out signals (a notebook only renders this when it is the cell's last expression).
test['data']
    
def _frame_to_tensors(frame, segment_len, acid_to_index):
    """Convert a run-indexed frame into parallel lists of signal and peptide tensors.

    NOTE(review): assumes every ``frame['data']`` entry is a dict mapping a
    read id to a 1-D list of signal samples, as the displayed ``test['data']``
    output suggests — confirm against the JSON schema.
    """
    signals, peptides = [], []
    target_grid = np.linspace(0.0, 1.0, segment_len)
    for run_name, reads in frame['data'].items():
        peptide = run_to_peptide.get(run_name)
        # Skip runs with no known peptide or with residues outside the requested alphabet.
        if peptide is None or any(aa not in acid_to_index for aa in peptide):
            continue
        if not isinstance(reads, dict):
            continue
        label = torch.tensor([acid_to_index[aa] for aa in peptide], dtype=torch.long)
        for samples in reads.values():
            samples = np.asarray(samples, dtype=np.float32)
            if samples.ndim != 1 or samples.size == 0:
                continue
            # Linear resampling to a fixed length so reads can be batched together.
            # NOTE(review): stand-in for the `stretch` helper used by the earlier,
            # commented-out pipeline, which is not defined in this notebook.
            source_grid = np.linspace(0.0, 1.0, samples.size)
            resampled = np.interp(target_grid, source_grid, samples)
            signals.append(torch.tensor(resampled, dtype=torch.float32))
            peptides.append(label)
    return signals, peptides


def get_data(segment_len, batch_size, acids=None):
    """Build train and test DataLoaders of (signal, peptide-index) pairs.

    Each read from the module-level ``train`` / ``test`` frames is resampled
    to ``segment_len`` samples and paired with the peptide of its run (looked
    up in ``run_to_peptide``), encoded as a tensor of residue indices.

    Args:
        segment_len: number of samples every signal is resampled to (>= 1).
        batch_size: batch size used for both loaders.
        acids: iterable of residues defining the index encoding. Defaults to
            the keys of ``aa_to_index`` in their defined order. The default is
            resolved at call time; the previous ``acids=index_to_aa`` default
            raised NameError when the function was defined.

    Returns:
        ``(train_loader, test_loader)``.

    Raises:
        ValueError: if ``segment_len`` < 1, or if either split ends up with
            no usable reads.
    """
    if segment_len < 1:
        raise ValueError(f"segment_len must be >= 1, got {segment_len}")
    if acids is None:
        acids = list(aa_to_index)
    acid_to_index = {aa: i for i, aa in enumerate(acids)}

    # The earlier per-segment pipeline (tr_segments / te_segments / stretch) was
    # commented out, leaving these four names undefined; build them from the run frames.
    train_input, train_output = _frame_to_tensors(train, segment_len, acid_to_index)
    test_input, test_output = _frame_to_tensors(test, segment_len, acid_to_index)
    if not train_input or not test_input:
        raise ValueError(
            f"no usable reads found (train={len(train_input)}, test={len(test_input)}); "
            "check the run_df schema and the `acids` filter"
        )

    # Create instances of the dataset and dataloader
    train_dataset = MyDataset(train_input, train_output)
    test_dataset = MyDataset(test_input, test_output)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=seed_worker
    )
    # The test loader is not shuffled so that evaluation order is reproducible.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker
    )

    # TODO: validation loader and class weights are still not produced.
    return train_loader, test_loader


20220824_run02_a    {'1.0': [32.0503955078, 32.1850610352, 31.6463...
20220826_run02_a    {'22.0': [30.718359375, 30.5787304687, 31.9750...
20220826_run03_a    {'9.0': [28.5575097656, 27.4998242187, 28.2930...
20220907_run01_a    {'58.0': [35.0544482422, 35.6291113281, 35.054...
20221213_run02_a    {'146.0': [31.6711230469, 32.6185498047, 32.07...
20221214_run01_a    {'77.0': [30.9753662109, 30.271380615200002, 2...
Name: data, dtype: object

## Model

### Define Model

## Evaluation