In [1]:
import numpy as np
import os
import pandas as pd
import pickle as pkl
from utils import variants_to_onehot_seq, variants_to_seq
from Bio import SeqIO

In [2]:
# Input root: scanned below for one sub-folder per protein, each holding the
# "*_encoded.csv" variant files and the "<protein>.fasta" wild-type sequence.
data_dir = "../Data/"
# Output root: the pickled X / y arrays are written to "<output_data_dir>/<protein>/".
output_data_dir = "data"

In [3]:
# Map every protein folder name found in data_dir to the list of CSV files
# (the encoded variant tables) that the folder contains.
protein_variants_files = dict()
# os.listdir returns entries in arbitrary order, so both levels are sorted to
# make the processing order reproducible between runs and machines.
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    # Keep only sub-directories; plain files sitting in data_dir are ignored.
    if not os.path.isdir(folder_path):
        continue
    # Collect every file in the folder that has the .csv extension.
    protein_variants_files[folder] = sorted(
        file for file in os.listdir(folder_path) if file.endswith(".csv")
    )

In [4]:
# Bare expression so the notebook renders the protein -> CSV file list mapping.
protein_variants_files

{'avgfp': ['avgfp_encoded.csv'],
 'bg_strsq': ['BG_STRSQ_Abate2015_encoded.csv'],
 'blat_ecolx': ['BLAT_ECOLX_Ostermeier2014_encoded.csv',
  'BLAT_ECOLX_Palzkill2012_encoded.csv',
  'BLAT_ECOLX_Ranganathan2015_encoded.csv',
  'BLAT_ECOLX_Tenaillon2013-singles_encoded.csv'],
 'brca1_human': ['BRCA1_HUMAN_Fields2015_e3_encoded.csv',
  'BRCA1_HUMAN_Fields2015_y2h_encoded.csv'],
 'dlg4_rat': ['DLG4_RAT_Ranganathan2012_encoded.csv'],
 'gal4_yeast': ['GAL4_YEAST_Shendure2015_encoded.csv'],
 'hg_flu': ['HG_FLU_Bloom2016_encoded.csv'],
 'hsp82_yeast': ['HSP82_YEAST_Bolon2016_encoded.csv'],
 'kka2_klepn': ['KKA2_KLEPN_Mikkelsen2014_encoded.csv'],
 'mth3_haeaestabilized': ['MTH3_HAEAESTABILIZED_Tawfik2015_encoded.csv'],
 'pabp_yeast': ['PABP_YEAST_Fields2013-doubles_encoded.csv',
  'PABP_YEAST_Fields2013-singles_encoded.csv'],
 'polg_hcvjf': ['POLG_HCVJF_Sun2014_encoded.csv'],
 'rl401_yeast': ['RL401_YEAST_Bolon2013_encoded.csv',
  'RL401_YEAST_Bolon2014_encoded.csv'],
 'ube4b_mouse': ['UBE4B_MO

In [8]:
# For every protein, convert each variants CSV into a model input X and a
# fitness target y, and pickle both under "<output_data_dir>/<protein>/".
for protein, variants_files in protein_variants_files.items():
    if not variants_files:
        # No CSV to convert, so the wild-type fasta is not needed either.
        continue

    # Get the wild type. It depends only on the protein, so it is parsed once
    # per protein instead of once per variants file.
    wild_type_file = os.path.join(data_dir, protein, protein + ".fasta")
    wild_type = SeqIO.read(wild_type_file, "fasta")
    wild_type_seq = wild_type.seq
    print(wild_type.description)
    # The fasta description looks like "<name>, Start: 1, End: 235, ...";
    # take the integer that follows "start: " (matched case-insensitively).
    wild_type_startpos = int(wild_type.description.lower().split("start: ")[1].split(",")[0])

    # Create the protein output directory if it doesn't exist.
    protein_output_dir = os.path.join(output_data_dir, protein)
    os.makedirs(protein_output_dir, exist_ok=True)

    for variants_file in variants_files:
        # The encoded CSVs are ';'-separated; "variant" holds the mutation
        # description and "y" the measured fitness.
        df = pd.read_csv(os.path.join(data_dir, protein, variants_file), sep=";")
        fitness_df = df["y"]
        variants_df = df["variant"]

        # Convert variants to sequences.
        # NOTE(review): the recorded run of this cell stopped with
        # "AttributeError: 'Series' object has no attribute 'split'". The
        # traceback is not shown, so confirm that variants_to_seq accepts a
        # pandas Series, and whether variants_to_onehot_seq was intended here
        # (the previously written pickles load as (n, length, 20) arrays).
        X = variants_to_seq(variants_df, wild_type_seq, wild_type_startpos)
        y = fitness_df.to_numpy()

        # Save to pickle; the context managers guarantee the files are
        # flushed and closed even if dumping fails.
        output_prefix = os.path.join(protein_output_dir, variants_file.split("_encoded")[0])
        with open(output_prefix + "_X.pkl", "wb") as x_file:
            pkl.dump(X, x_file)
        with open(output_prefix + "_y.pkl", "wb") as y_file:
            pkl.dump(y, y_file)

 avgfp, Start: 1, End: 235, Length: 235, Offset: 1, Continuous: True


AttributeError: 'Series' object has no attribute 'split'

In [7]:
for protein, variants_files in protein_variants_files.items():
    for variants_file in variants_files:
        X = pkl.load(open(os.path.join(output_data_dir, protein, variants_file.split("_encoded")[0] + "_X.pkl"), "rb"))
        y = pkl.load(open(os.path.join(output_data_dir, protein, variants_file.split("_encoded")[0] + "_y.pkl"), "rb"))
        print(X.shape, y.shape)

(32610, 235, 20) (32610,)
(2598, 478, 20) (2598,)
(4799, 286, 20) (4799,)
(4922, 286, 20) (4922,)
(4921, 286, 20) (4921,)
(975, 286, 20) (975,)
(2846, 303, 20) (2846,)
(1278, 303, 20) (1278,)
(1388, 83, 20) (1388,)
(803, 64, 20) (803,)
(2198, 564, 20) (2198,)
(4065, 230, 20) (4065,)
(3827, 264, 20) (3827,)
(1611, 329, 20) (1611,)
(33771, 75, 20) (33771,)
(1142, 75, 20) (1142,)
(1613, 86, 20) (1613,)
(1154, 75, 20) (1154,)
(1282, 75, 20) (1282,)
(518, 102, 20) (518,)
(313, 34, 20) (313,)
