In [2]:
import jinfo as j

#### Create simple DNA sequence objects and retrieve the sequence, label, length, molecular weight and melting temperature:


In [3]:
# Build a labelled DNA sequence object from a plain string of bases.
seq_1 = j.DNASeq("ATGAGGATAGATCCCTATTAA", label="simple_dna_sequence")

# Report, one per line: the sequence record, its length, its molecular
# weight and its melting temperature.
print(seq_1, seq_1.len, seq_1.MW(), seq_1.tm(), sep="\n")

simple_dna_sequence	ATGAGGATAGATCCCTATTAA
21
13006.49
43.87


#### You can transcribe a DNA sequence object to mRNA and probe its features:

In [4]:
# TODO: should transcribe()/translate() return RNASeq/AASeq objects directly?
# (print would presumably still work.)
seq_1_mRNA = j.RNASeq(seq_1.transcribe(), label="simple_rna_sequence")

# One per line: the RNA record, its reverse transcription, and its MW() value.
print(seq_1_mRNA, seq_1_mRNA.reverse_transcribe(), seq_1_mRNA.MW(), sep="\n")

simple_rna_sequence	AUGAGGAUAGAUCCCUAUUAA
ATGAGGATAGATCCCTATTAA
6448.090000000001


#### Translate the DNA or RNA sequences to get a protein:

In [5]:
# Wrap the translation of seq_1 in a labelled AASeq (protein) object.
seq_1_prot = j.AASeq(seq_1.translate(), label="simple_protein_sequence")

# One per line: the protein record and its MW() value.
print(seq_1_prot, seq_1_prot.MW(), sep="\n")

simple_protein_sequence	MRIDPY*
883


#### You can perform DNA or protein alignments:
(requires MUSCLE backend)

In [6]:
# Three labelled homologues of seq_1, created together.
seq_2, seq_3, seq_4 = (
    j.DNASeq("ATGAGGAACTTGATAGATCCCTA", label="simple_dna_homolog_1"),
    j.DNASeq("ATGAGGATAGATCCTTACCTCTA", label="simple_dna_homolog_2"),
    j.DNASeq("ATGAGGATAGAGGCCTCCCTA", label="simple_dna_homolog_3"),
)

# Pairwise alignment of seq_1 against its first homologue.
simple_alignment = seq_1.align(seq_2)
print(simple_alignment)

# The aligned entries keep the class of the input sequences
# (left as the final bare expression so the notebook displays it):
type(simple_alignment.seqs[0])

simple_dna_sequence	ATGAG------GATAGATCCCTATTAA
simple_dna_homolog_1	ATGAGGAACTTGATAGATCCCTA----



jinfo.sequence.DNASeq

In [7]:
# Align all four DNA sequences in one call; multialign takes them as a list.
# Alignments require the MUSCLE backend to be installed.
multiple_alignment = j.multialign([seq_1, seq_2, seq_3, seq_4])
print(multiple_alignment) 

simple_dna_homolog_2	ATGAG------GATAGA----TCCTTACCTCTA
simple_dna_homolog_3	ATGAG------GATAGAGGCCTCCCTA------
simple_dna_sequence	ATGAG------GATAGA----TCCCTA--TTAA
simple_dna_homolog_1	ATGAGGAACTTGATAGA----TCCCTA------



#### From alignment objects phylogenetic trees can be calculated:
(requires FastTree backend)

In [8]:
# Build a phylogenetic tree from the multiple alignment (requires the FastTree backend).
simple_tree = multiple_alignment.calc_tree()
print(simple_tree.tree) # the .tree attribute is the tree as a Newick-format string

(simple_dna_sequence:0.00054,simple_dna_homolog_1:0.00055,(simple_dna_homolog_3:0.00055,simple_dna_homolog_2:0.16226)0.177:0.00055);



#### For ML applications, one-hot encoding DNA is helpful:

In [9]:
# Prints a flat 0/1 vector with four entries per base
# (84 values for the 21-base seq_1, as the cell output shows).
print(seq_1.one_hot())

[1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1
 0 0 1 0 0 0 1 0 0 0]


#### You can read sequence objects and alignments from fasta files:

In [10]:
# Example real workflow using 10 feline coronavirus spike protein variants.
# Read the records of the FASTA file into a list of sequence objects;
# seq_obj=j.AASeq asks for AASeq (protein) objects.
# NOTE(review): the path is relative, so this presumably expects the notebook to be
# run from the repository root — confirm.
spike_homologs = j.seq_list_from_fasta("docs/sequence.fasta", seq_obj=j.AASeq)

# Print the first protein record (label and sequence) as a sanity check:
print(spike_homologs[0])

AEY70255.1 spike protein, partial [Feline coronavirus]	MALGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALGKVSNAIGTISDGFNTMASALTKIQSVVNQQGEALSQLTSQLQKNFQAICSSIAEIYNRLEKVXADAQVDRLITGRLAALNAYVSQTLTQQAEFKAQYALANGKYSRKPPSTPTKTLTLLIMT


In [15]:
# Align the homologues:
feline_spike_alignment = j.multialign(spike_homologs)

# Filter the alignment with an identity threshold of 95 and keep the result;
# show_id_array=True also prints the pairwise percentage identity array.
# NOTE(review): presumably sequences at or above 95% identity to another sequence are
# dropped (the filtered alignment yields a 4-leaf tree further down) — confirm the
# exact rule against the jinfo documentation.
low_id_alignment = feline_spike_alignment.identity_filter(95, show_id_array=True)

Calculated alignment identity array:
AEY70255.1 spike protein, partial [Feline coronavirus]	[100.0, 81.72, 77.42, 80.65, 77.96, 78.49, 77.96, 79.03, 79.03, 79.03]
AEY70253.1 spike protein, partial [Feline coronavirus]	[81.72, 100.0, 80.11, 84.95, 81.72, 82.8, 80.65, 82.26, 82.26, 90.32]
AEY70248.1 spike protein, partial [Feline coronavirus]	[77.42, 80.11, 100.0, 90.86, 95.7, 94.62, 95.7, 96.24, 94.62, 88.17]
AEY70254.1 spike protein, partial [Feline coronavirus]	[80.65, 84.95, 90.86, 100.0, 92.47, 94.09, 91.4, 93.55, 94.09, 87.1]
AEY70247.1 spike protein, partial [Feline coronavirus]	[77.96, 81.72, 95.7, 92.47, 100.0, 96.24, 95.16, 96.77, 97.31, 88.71]
AEY70251.1 spike protein, partial [Feline coronavirus]	[78.49, 82.8, 94.62, 94.09, 96.24, 100.0, 94.62, 96.77, 96.77, 90.32]
AEY70249.1 spike protein, partial [Feline coronavirus]	[77.96, 80.65, 95.7, 91.4, 95.16, 94.62, 100.0, 96.77, 95.16, 89.78]
AEY70250.1 spike protein, partial [Feline coronavirus]	[79.03, 82.26, 96.24, 93.55, 96.77,

In [16]:
# Calculate phylogenetic trees from the alignments (calc_tree requires the FastTree backend).
# Tree for the full alignment of all spike homologues, printed in Newick format:
tree = feline_spike_alignment.calc_tree()
print(tree.tree)

# Tree for the identity-filtered alignment, for comparison:
tree2 = low_id_alignment.calc_tree()
print(tree2.tree)

(AEY70247.1:0.01399,AEY70251.1:0.01237,((((AEY70255.1:0.19026,AEY70248.1:0.02555)0.380:0.00054,(AEY70246.1:0.00053,AEY70249.1:0.01535)0.838:0.00507)0.762:0.00504,AEY70250.1:0.00506)0.853:0.01016,((AEY70254.1:0.02125,AEY70252.1:0.00503)0.983:0.00446,AEY70253.1:0.02451)0.052:0.00056)0.741:0.00434);

(AEY70255.1:0.15826,AEY70246.1:0.00728,(AEY70253.1:0.02468,AEY70254.1:0.03120)0.752:0.01592);

