# Sequence example

To get started with sequences, we begin by importing some useful Python packages:


In [1]:
# useful imports
import numpy as np
import pandas as pd
from Bio import AlignIO, SeqIO, Seq 

#### Read the raw sequence:

In [6]:
# Open the raw nucleotide sequences; SeqIO.parse yields one record per FASTA entry
rna_itr = SeqIO.parse('./samples/sequence.fasta', format='fasta')

#### Translation (convert the RNA sequence to amino acids):

In [7]:
# Translate every nucleotide record into a protein record.
# NOTE: despite its name, `aa_itr` is a plain list; the name is kept because
# the cell that writes the FASTA file refers to it.
aa_itr = []

for record in rna_itr:
    # A codon is a triplet of bases, so a coding sequence is expected to have
    # a length that is a multiple of three. Warn, but still translate.
    if len(record.seq) % 3 != 0:
        print("The sequence %s 's length is not divisible by three" % record.id)
    # to_stop=True ends the translation at the first in-frame stop codon
    # (use to_stop=False to translate through stop codons).
    # id/name/description=True copy the metadata from the source record;
    # without them the new record keeps the "<unknown ...>" placeholders and
    # the placeholder description is written into the FASTA header.
    tmp_aa = record.translate(to_stop=True, id=True, name=True, description=True)
    aa_itr.append(tmp_aa)

print("The translation is done.")

The translation is done.


#### Save the result in a 'fasta' file:

In [8]:
# Save the translated records as FASTA; the value displayed by the cell is
# the number of records written
SeqIO.write(aa_itr, './samples/sequence_aa.fasta', format='fasta')

8

#### Read the multiple sequence alignment (MSA):

In [12]:
# Load the alignment stored in the FASTA-formatted MSA file
align = AlignIO.read('./samples/msa.fasta', format='fasta')

#### Convert alignment file from 'Fasta' to 'Phylip':

In [13]:
# Export the alignment in PHYLIP format; the value displayed by the cell is
# the number of alignments written.
# Format name: 'phylip' (strict, sequence names limited to 10 characters)
# or 'phylip-relaxed' (allows longer names).
AlignIO.write(align, './samples/msa.phy', format='phylip')

1

#### Get alignment length:

In [17]:
# Number of columns in the alignment, as reported by the alignment object
n_columns = align.get_alignment_length()
print("The total lenght of alignment is %s." % n_columns)

The total lenght of alignment is 100.


#### Convert the MSA to a NumPy matrix:

In [3]:
# Load the MSA from disk
align = AlignIO.read('./samples/msa.fasta', 'fasta')

# Matrix dimensions: one row per sequence, one column per alignment position
num_sequences = len(align)
alignment_length = align.get_alignment_length()

# Plain-string view of every aligned sequence
sequences = [str(rec.seq) for rec in align]

# Start from an all-gap ('-') character matrix ...
alignment_matrix = np.full((len(sequences), alignment_length), '-', dtype='U1')

# ... then copy each sequence into its row (each `row` is a view into the matrix)
for row, seq in zip(alignment_matrix, sequences):
    row[:len(seq)] = list(seq)

# Show the resulting matrix
print("Alignment Matrix:")
print(alignment_matrix)


Alignment Matrix:
[['-' 'C' 'C' 'C' 'T' 'C' 'A' '-' 'T' 'T' '-' 'C' 'C' '-' 'G' 'A' '-' '-'
  'C' 'G' 'C' 'G' 'C' 'T' 'C' '-' 'T' 'C' 'T' '-' 'G' 'G' 'A' '-' 'A' 'C'
  '-' 'G' 'A' 'T' 'T' 'G' 'G' '-' 'T' 'T' '-' 'C' 'T' 'G' 'T' 'C' '-' 'T'
  'A' 'C' '-' 'A' 'A' 'T' 'C' 'T' 'A' 'A' 'T' 'A' '-' 'A' 'T' 'T' 'T' 'T'
  'A' '-' 'C' 'C' 'A' 'T' 'G' '-' 'C' 'A' 'T' 'G' '-' 'C' 'G' '-' '-' 'T'
  '-' 'G' '-' 'C' 'A' 'A' 'T' '-' 'A' 'G']
 ['-' 'T' 'C' 'C' 'T' 'C' 'A' '-' 'C' 'T' '-' 'C' 'C' '-' 'G' 'A' '-' '-'
  'C' 'G' 'C' 'G' 'C' 'T' 'T' '-' 'T' 'C' 'T' '-' 'G' 'G' 'A' '-' 'A' 'C'
  '-' 'G' 'A' 'C' 'T' 'G' 'G' '-' 'T' '-' '-' 'C' 'T' 'G' 'T' 'C' '-' 'T'
  'A' 'C' '-' 'A' 'A' 'T' 'C' 'T' 'A' 'A' 'T' 'A' '-' 'A' 'T' 'T' 'T' 'T'
  'G' '-' 'C' 'C' 'A' 'T' 'G' '-' 'C' 'A' 'T' 'G' '-' 'C' 'G' '-' '-' 'T'
  '-' 'G' '-' 'C' 'A' 'A' 'T' '-' 'A' 'G']
 ['-' 'C' 'C' 'C' 'T' 'C' 'A' '-' 'T' 'C' '-' 'A' 'C' '-' 'G' 'A' '-' '-'
  'C' 'G' 'C' 'G' 'C' 'T' 'C' '-' 'T' 'C' 'G' '-' 'G' 'G' 'A' '-' 'A' 'C'
  '-' 'G

#### Map characters to digits in a NumPy matrix:

In [16]:
# Read the MSA file
align = AlignIO.read('./samples/msa.fasta', 'fasta')

# Computed here so this cell does not rely on a variable left over from
# another cell (it must also work on a fresh kernel / out-of-order run)
alignment_length = align.get_alignment_length()

# Define the MSA alphabet: 'dna' or 'aa' (amino acids)
alphabet_type = 'dna'

if alphabet_type == 'aa':
    # Amino acid letters ARNDCQEGHILKMFPSTWYV map to 1..20,
    # the ambiguity codes B and X map to 0, and the gap '-' maps to -1
    letter = {'A': 1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 
              'E':7, 'G':8, 'H':9, 'I':10, 'L':11, 'K':12, 
              'M':13, 'F':14,'P':15, 'S':16, 'T':17, 'W':18,
              'Y':19, 'V':20, 'B':0, 'X':0, '-':-1 }
elif alphabet_type == 'dna':
    # DNA letters ACGT map to 1..4, the ambiguity codes X, N and B map to 0,
    # and the gap '-' maps to -1
    letter = {'A':1, 'C':2, 'G':3, 'T':4, 'X': 0, 'N':0, 'B':0, '-':-1}
else:
    # Fail early instead of hitting a NameError on `letter` further down
    raise ValueError("Unsupported alphabet_type %r: expected 'aa' or 'dna'" % alphabet_type)

print("The letter and their corresponding codes:\n", letter)


# Extract sequences as a list of strings
sequences = [str(record.seq) for record in align]

# Create an integer NumPy matrix pre-filled with 0
alignment_matrix = np.full((len(sequences), alignment_length), 0, dtype=int)

# Fill the NumPy matrix with the mapped codes. The lookup is case-insensitive
# so lowercase (e.g. soft-masked) residues are not mistaken for gaps; any
# character missing from the table is still encoded as -1.
for i, seq in enumerate(sequences):
    alignment_matrix[i, :len(seq)] = [letter.get(char.upper(), -1) for char in seq]

# Print the NumPy matrix
print("\nAlignment Matrix:\n")
print(alignment_matrix)

The letter and their corresponding codes:
 {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'X': 0, 'N': 0, 'B': 0, '-': -1}

Alignment Matrix:

[[-1  2  2  2  4  2  1 -1  4  4 -1  2  2 -1  3  1 -1 -1  2  3  2  3  2  4
   2 -1  4  2  4 -1  3  3  1 -1  1  2 -1  3  1  4  4  3  3 -1  4  4 -1  2
   4  3  4  2 -1  4  1  2 -1  1  1  4  2  4  1  1  4  1 -1  1  4  4  4  4
   1 -1  2  2  1  4  3 -1  2  1  4  3 -1  2  3 -1 -1  4 -1  3 -1  2  1  1
   4 -1  1  3]
 [-1  4  2  2  4  2  1 -1  2  4 -1  2  2 -1  3  1 -1 -1  2  3  2  3  2  4
   4 -1  4  2  4 -1  3  3  1 -1  1  2 -1  3  1  2  4  3  3 -1  4 -1 -1  2
   4  3  4  2 -1  4  1  2 -1  1  1  4  2  4  1  1  4  1 -1  1  4  4  4  4
   3 -1  2  2  1  4  3 -1  2  1  4  3 -1  2  3 -1 -1  4 -1  3 -1  2  1  1
   4 -1  1  3]
 [-1  2  2  2  4  2  1 -1  4  2 -1  1  2 -1  3  1 -1 -1  2  3  2  3  2  4
   2 -1  4  2  3 -1  3  3  1 -1  1  2 -1  3  1  4  4  3  2 -1  4 -1 -1  2
   4  3  4  2 -1  2  1  2 -1  1  1  4  2  4  1  1  4  1 -1  1  4  4  4  4
   1 -1  2  2  1  4  3 -1  

-10
