In [1]:
from Bio import SeqIO
import numpy as np

In [2]:
# Lookup table from a nucleotide symbol to its 4-element [A, C, G, T] weights.
# Keys are stored once, in upper case; oneHotEncodeDNA() upper-cases each
# symbol before the lookup, so lower-case input (including lower-case
# ambiguity codes) is covered as well.
# NOTE: three-way ambiguity codes use 0.33 per base, so those rows sum to 0.99.
_BASE_ENCODINGS = {
    'A': [1, 0, 0, 0],
    'C': [0, 1, 0, 0],
    'G': [0, 0, 1, 0],
    'T': [0, 0, 0, 1],
    '-': [0, 0, 0, 0],
    '?': [0.25, 0.25, 0.25, 0.25],
    '*': [0.25, 0.25, 0.25, 0.25],
    'B': [0, 0.33, 0.33, 0.33],
    'D': [0.33, 0, 0.33, 0.33],
    'H': [0.33, 0.33, 0, 0.33],
    'K': [0, 0, 0.5, 0.5],
    'M': [0.5, 0.5, 0, 0],
    'N': [0.25, 0.25, 0.25, 0.25],
    'R': [0.5, 0, 0.5, 0],
    'S': [0, 0.5, 0.5, 0],
    'V': [0.33, 0.33, 0.33, 0],
    'W': [0.5, 0, 0, 0.5],
    'Y': [0, 0.5, 0, 0.5],
}


def oneHotEncodeDNA(sequence):
    """Convert a DNA sequence into its one-hot (or fractional) encoding.

    Parameters
    ----------
    sequence : iterable of str
        Single-character nucleotide symbols, e.g. ['A', 'A', 'C'] or the
        string 'AAC'. Symbols are matched case-insensitively.

    Returns
    -------
    list of list
        One 4-element [A, C, G, T] row per input symbol, in input order.
        Every row is a fresh list, so rows can be modified independently.

    Raises
    ------
    KeyError
        If a symbol is not present in the encoding table.
    """
    encodedSequence = []
    for base in sequence:
        # Upper-case only real strings so that any other hashable value still
        # ends in the KeyError below rather than an AttributeError.
        key = base.upper() if isinstance(base, str) else base
        encoding = _BASE_ENCODINGS.get(key)
        if encoding is None:
            raise KeyError(
                "Unsupported base {!r}; expected one of {}".format(
                    base, ''.join(sorted(_BASE_ENCODINGS))
                )
            )
        # Copy the row: handing out the table's own list would make repeated
        # bases share one object (and would expose the shared table itself).
        encodedSequence.append(list(encoding))

    return encodedSequence

# Smoke test: encode a short five-base sequence and display the result.
oneHotEncodeDNA(list('ACGTT'))

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1]]

In [3]:
def FastaToMatrix(fastaFile):
    """One-hot encode every sequence found in a FASTA file.

    Parameters
    ----------
    fastaFile
        FASTA source, handed unchanged to ``SeqIO.parse(fastaFile, "fasta")``.

    Returns
    -------
    list
        One entry per FASTA record, in file order; each entry is the value
        returned by ``oneHotEncodeDNA`` for that record's sequence.
    """
    # Each record's sequence is split into single symbols before encoding.
    return [
        oneHotEncodeDNA(list(fastaRecord.seq))
        for fastaRecord in SeqIO.parse(fastaFile, "fasta")
    ]

# Test
#FastaToMatrix('./data/testData/CD2.fasta')