## Onehot All Sequences

In [1]:
with open("data/sequence/CT-20.txt") as sequences, open("data/sequence/CT-20.fasta", "w") as fasta_out:
    for idx, line in enumerate(sequences, 1):
        seq = line.split()[0]
        fasta_out.write(f">Sequence {idx}\n{seq}\n")
        print(f">Sequence {idx}\n{seq}\n")

>Sequence 1
ACACATCACTCCCTCCTCCTTTTCGCACNNNNNNN

>Sequence 2
CATGCATGCCCTCCTCACCTCNNNNNNNNNNNNNN

>Sequence 3
ACACACAATCACCCTCTTTGCCTTCNNNNNNNNNN

>Sequence 4
ACTGCAACATTGTCCCTTCCTGCCTTCNNNNNNNN

>Sequence 5
TCATGCATCCTTTCCCTCTCGCNNNNNNNNNNNNN

>Sequence 6
TTACATGCCATTCCCTCACCTCNNNNNNNNNNNNN

>Sequence 7
ATTACCATCACTGCCTTACCCTCNNNNNNNNNNNN

>Sequence 8
ATGATGTCTTTCCCCTCCCTCNNNNNNNNNNNNNN

>Sequence 9
ACATTGCACATTCTTCACCTCNNNNNNNNNNNNNN

>Sequence 10
CATGTAGCTCATTCCCTTCTGCCTCTCNNNNNNNN

>Sequence 11
ATTACACCTCCCTCTGCCTCATTCNNNNNNNNNNN

>Sequence 12
ATAACGCATACCCTTGCCTTCCCTCNNNNNNNNNN

>Sequence 13
ATAACGCCTCATCATCCTCCCTTCTCACCTCGCTG

>Sequence 14
TAACATGATTGCCCTTCCTCNNNNNNNNNNNNNNN

>Sequence 15
ATTCATCACATCACCTCTCTCTCNNNNNNNNNNNN

>Sequence 16
CATGCACACACACCTCNNNNNNNNNNNNNNNNNNN

>Sequence 17
ACATTCCCTTACCCTTCCGCTTNNNNNNNNNNNNN

>Sequence 18
ACATCACACATACTACTTCTTCTCACTCNNNNNNN

>Sequence 19
ACACAACGCCATCATACCACTGCCTCTCNNNNNNN

>Sequence 20
ACATCACAGCCCATCCTCATTTCGTCCCCTTTCGC

>Sequence

In [2]:
letter_to_array = {
    'A': [1.0,0.0,0.0,0.0],
    'T': [0.0,0.0,1.0,0.0],
    'C': [0.0,1.0,0.0,0.0],
    'G': [0.0,0.0,0.0,1.0],
    'N': [0.0,0.0,0.0,0.0]
}

In [3]:
CT_dna_sequences = []
with open("data/sequence/CT-20.fasta") as fasta_file:
    for line in fasta_file:
        if not line.startswith('>'):
            CT_dna_sequences.append(line.strip())

In [4]:
CT_onehot_sequences = []

for seq in CT_dna_sequences:
    onehot = []
    for letter in seq:
        onehot.append(letter_to_array[letter])
    
    CT_onehot_sequences.append(onehot)

In [5]:
len(CT_onehot_sequences[0])

35

In [6]:
import numpy as np
np.array(CT_onehot_sequences).shape

(19000, 35, 4)

## Create Shapes for Sequences

In [1]:
MgW = []
HelT = []
Roll = []
Prot = []

In [2]:
from collections import defaultdict

count = defaultdict(int)

In [3]:
with open("data/sequence/CT_Shapes/HelT_CT-20.txt.HelT") as txt:
    for line in txt:
        if line.startswith('>'):
            HelT.append([])
        else:
            nums = line.split(",")
            for num in nums:
                num = num.strip()
                if num == "NA":
                    HelT[-1].append(0.0)
                else:
                    HelT[-1].append(float(num))

In [4]:
for shape in HelT:
    count[len(shape)] += 1

count

defaultdict(int, {34: 19000})

In [5]:
with open("data/sequence/CT_Shapes/MGW_CT-20.txt.MGW") as txt:
    for line in txt:
        if line.startswith('>'):
            MgW.append([])
        else:
            nums = line.split(",")
            for num in nums:
                num = num.strip()
                if num == "NA":
                    MgW[-1].append(0.0)
                else:
                    MgW[-1].append(float(num))

In [6]:
for shape in MgW:
    count[len(shape)] += 1

count

defaultdict(int, {34: 19000, 35: 19000})

In [7]:
with open("data/sequence/CT_Shapes/ProT_CT-20.txt.ProT") as txt:
    for line in txt:
        if line.startswith('>'):
            Prot.append([])
        else:
            nums = line.split(",")
            for num in nums:
                num = num.strip()
                if num == "NA":
                    Prot[-1].append(0.0)
                else:
                    Prot[-1].append(float(num))

In [8]:
for shape in Prot:
    count[len(shape)] += 1

count

defaultdict(int, {34: 19000, 35: 38000})

In [9]:
with open("data/sequence/CT_Shapes/Roll_CT-20.txt.Roll") as txt:
    for line in txt:
        if line.startswith('>'):
            Roll.append([])
        else:
            nums = line.split(",")
            for num in nums:
                num = num.strip()
                if num == "NA":
                    Roll[-1].append(0.0)
                else:
                    Roll[-1].append(float(num))

In [10]:
for shape in Roll:
    count[len(shape)] += 1

count

defaultdict(int, {34: 38000, 35: 38000})

In [11]:
length_of_shape = 34 + 34 + 35 + 35
length_of_shape

138

In [12]:
# of zeros we append to shape, 1 on each side
140 - length_of_shape

2

In [13]:
shapes = []

for i in range(len(HelT)):
    shape = [0] + MgW[i] + Roll[i] + Prot[i] + HelT[i] + [0]
    shapes.append(shape)

In [20]:
shapes[0][-9:]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0]

In [20]:
import numpy as np

normalized_shapes = []

def zero_mean_normalize(arr):

    mean = np.mean(arr)
    std = np.std(arr)
    
    if std == 0:
        print("STD was 0")
        return np.zeros_like(arr)  # Avoid division by zero
    
    return (arr - mean) / std

In [21]:
for shape in shapes:
    normalized_shapes.append(zero_mean_normalize(shape))

In [22]:
np.array(normalized_shapes).shape

(19000, 140)

## Combine onehot and shapes

In [23]:
import pickle as pkl

In [24]:
data = {
    "onehot_sequences": CT_onehot_sequences,
    "shapes": normalized_shapes
}

In [25]:
with open("data/sequence/CT_Shapes/CT-20_data.pkl", "wb") as f:
    pkl.dump(data, f)