In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt


In [5]:
def import_data(sel='../../data/csv/A3_Sup_pos_LynD_R1_001.csv',
                anti='../../data/csv/A5_Elu_pos_LynD_R1_001.csv'):
    """Load the selection and antiselection CSVs and curate them for modeling.

    Curation steps, applied in this order:
      1. drop rows whose ``seq`` repeats within a file;
      2. drop every ``seq`` that occurs in both files (treated as unreliable);
      3. add a ``var_seq`` column built from two slices of ``seq``.

    Parameters
    ----------
    sel : path of the selection CSV, handed to ``pd.read_csv``.
    anti : path of the antiselection CSV, handed to ``pd.read_csv``.
        Both files must provide a ``seq`` column.

    Returns
    -------
    tuple of (selection DataFrame, antiselection DataFrame), each with a
    fresh 0..n-1 index and the extra ``var_seq`` column.

    Side effects
    ------------
    Prints the number of rows left in each curated frame.
    """
    # One row per distinct sequence within each file.
    sel_df = pd.read_csv(sel).drop_duplicates(subset=['seq'])
    anti_df = pd.read_csv(anti).drop_duplicates(subset=['seq'])

    # Sequences present in both files are considered unreliable.
    shared = np.intersect1d(anti_df.seq.values, sel_df.seq.values)

    curated = []
    for frame in (sel_df, anti_df):
        kept = frame.loc[~frame.seq.isin(shared)].reset_index(drop=True)
        # Variable region: seq[22:25] joined with seq[26:29]; position 25 is skipped.
        kept['var_seq'] = kept.seq.str[22:25] + kept.seq.str[26:29]
        curated.append(kept)
    sel_df, anti_df = curated

    print('Curated dataset sizes:\nSelection:\t', sel_df.shape[0], '\nAntiselection:\t', anti_df.shape[0])
    return sel_df, anti_df

# Load both datasets from the default CSV paths, then show the resulting columns.
sel, anti = import_data()
print(sel.columns)

Curated dataset sizes:
Selection:	 90190 
Antiselection:	 94613
Index(['ORF', 'round', 'seq', 'count', 'pct', 'var_seq'], dtype='object')


In [6]:
def prep_dataset(sel, anti):
    """Add the ``var_seq`` column to both frames, modifying them in place.

    ``var_seq`` is ``seq[22:25]`` joined with ``seq[26:29]`` (position 25 is
    skipped). Any existing ``var_seq`` column is overwritten.

    Parameters
    ----------
    sel : DataFrame with a string ``seq`` column.
    anti : DataFrame with a string ``seq`` column.

    Returns
    -------
    None. No fold splitting is performed here.
    NOTE(review): the function name suggests further dataset preparation
    (e.g. cross-validation folds) was planned; confirm the intent.
    """
    for frame in (sel, anti):
        seqs = frame.seq.str
        frame['var_seq'] = seqs[22:25] + seqs[26:29]
    return None

# Stack the selection and antiselection frames row-wise into a single frame
# and report its (rows, columns) shape.
# NOTE(review): ignore_index is not set, so each frame's own index labels carry
# over into df — confirm later cells do not rely on unique index labels.
df = pd.concat([sel, anti], axis=0)
print(df.shape)

(184803, 6)
