In [1]:
import jinfo as j

#### Create simple DNA sequence objects and retrieve their sequence, label, length, molecular weight and melting temperature:


In [2]:
# Build a labelled DNA sequence object from a raw base string.
seq_1 = j.DNASeq("ATGAGGATAGATCCCTATTAA", label="simple_dna_sequence")

# One value per output line: the label/sequence text, the length,
# the molecular weight and the melting temperature.
print(seq_1, seq_1.len, seq_1.MW(), seq_1.tm(), sep="\n")

simple_dna_sequence	ATGAGGATAGATCCCTATTAA
21
13006.49
43.87


#### You can get the mRNA transcript of a DNA sequence object and probe its features:

In [3]:
# Wrap the transcript of seq_1 in an RNASeq object with its own label.
# TODO: should transcribe()/translate() return an RNASeq/AASeq object directly?
#       (print would still work)
seq_1_mRNA = j.RNASeq(seq_1.transcribe(), label="simple_rna_sequence")

# One value per output line: the label/RNA sequence text, the reverse
# transcription back to DNA, and the molecular weight of the RNA.
print(seq_1_mRNA, seq_1_mRNA.reverse_transcribe(), seq_1_mRNA.MW(), sep="\n")

simple_rna_sequence	AUGAGGAUAGAUCCCUAUUAA
ATGAGGATAGATCCCTATTAA
6448.090000000001


#### Translate the DNA or RNA sequences to get a protein:

In [4]:
# Wrap the translation of seq_1 in an AASeq object with its own label.
seq_1_prot = j.AASeq(seq_1.translate(), label="simple_protein_sequence")

# One value per output line: the label/protein sequence text, then the
# molecular weight.
print(seq_1_prot, seq_1_prot.MW(), sep="\n")

simple_protein_sequence	MRIDPY*
883


#### You can perform DNA or protein alignments:
(requires the MUSCLE backend)

In [13]:
# Three short homologs of seq_1; each label shows up as the row name when an
# alignment containing the sequence is printed.
seq_2, seq_3, seq_4 = (
    j.DNASeq(bases, label=name)
    for name, bases in (
        ("simple_dna_homolog_1", "ATGAGGAACTTGATAGATCCCTA"),
        ("simple_dna_homolog_2", "ATGAGGATAGATCCTTACCTCTA"),
        ("simple_dna_homolog_3", "ATGAGGATAGAGGCCTCCCTA"),
    )
)

# Pairwise alignment of seq_1 against its first homolog.
simple_alignment = seq_1.align(seq_2)
print(simple_alignment)

# The aligned entries keep the type of the sequence objects that went in
# (left as the final bare expression so the notebook displays it):
type(simple_alignment.seqs[0])

simple_dna_sequence	ATGAG------GATAGATCCCTATTAA
simple_dna_homolog_1	ATGAGGAACTTGATAGATCCCTA----



jinfo.sequence.DNASeq

In [15]:
# Align all four DNA sequences together in a single multiple alignment.
# Note: the rows of the printed result are not in the order the sequences
# were passed in (presumably the order reported by the aligner backend).
multiple_alignment = j.multialign([seq_1, seq_2, seq_3, seq_4])
print(multiple_alignment)

simple_dna_homolog_2	ATGAG------GATAGA----TCCTTACCTCTA
simple_dna_homolog_3	ATGAG------GATAGAGGCCTCCCTA------
simple_dna_sequence	ATGAG------GATAGA----TCCCTA--TTAA
simple_dna_homolog_1	ATGAGGAACTTGATAGA----TCCCTA------



#### Phylogenetic trees can be calculated from alignment objects:
(requires the FastTree backend)

In [7]:
# Build a phylogenetic tree from the four-sequence DNA alignment.
simple_tree = multiple_alignment.calc_tree()
print(simple_tree.tree)  # the .tree attribute prints as Newick-format text

(simple_dna_sequence:0.00054,simple_dna_homolog_1:0.00055,(simple_dna_homolog_3:0.00055,simple_dna_homolog_2:0.16226)0.177:0.00055);



#### For ML applications, one-hot encoding DNA is helpful:

In [8]:
# One-hot encode seq_1. The printed result is a flat array with four 0/1
# values per base (84 values for the 21-base sequence), not a 2-D matrix.
print(seq_1.one_hot())

[1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1
 0 0 1 0 0 0 1 0 0 0]


#### You can read sequence objects and alignments from fasta files:

In [9]:
# Example real workflow using 10 feline coronavirus spike protein variants.
# Import the sequences from a FASTA file into a list of seq objects;
# seq_obj=j.AASeq selects the sequence class used for each record.
# NOTE(review): the path is relative, so this presumably expects the kernel
# to be started from the repository root — confirm.
spike_homologs = j.seq_list_from_fasta("docs/sequence.fasta", seq_obj=j.AASeq)

# Check out the first protein (prints its full FASTA header as the label,
# followed by the amino-acid sequence):
print(spike_homologs[0])

AEY70255.1 spike protein, partial [Feline coronavirus]	MALGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALGKVSNAIGTISDGFNTMASALTKIQSVVNQQGEALSQLTSQLQKNFQAICSSIAEIYNRLEKVXADAQVDRLITGRLAALNAYVSQTLTQQAEFKAQYALANGKYSRKPPSTPTKTLTLLIMT


In [10]:
# Align the homologues with the same multialign call used for the DNA example.
# Each printed row is a sequence label followed by its gapped sequence.
feline_spike_alignment = j.multialign(spike_homologs)
print(feline_spike_alignment)

AEY70255.1 spike protein, partial [Feline coronavirus]	---------MALGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALGKVSNAIGTISDGFNTMASALTKIQSVVNQQGEALSQLTSQLQKNFQAICSSIAEIYNRLEKVXADAQVDRLITGRLAALNAYVSQTLTQQAEFKAQYALANGKYSRKPPSTPTKTLTLLIMT
AEY70253.1 spike protein, partial [Feline coronavirus]	---------MAMGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALGKVSDAITTISDGFNSMASALTKIQSVVNQQGEALSQLTSQLQKNFQAISSSIAEIYNRLENVEADAEVDRLITGRLAALNAYVSQTLTQYAEVKASRQLAMEK-------------------
AEY70248.1 spike protein, partial [Feline coronavirus]	MYTASLIGGMALGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALDKVSNAVTTISEGFYTMASALTKIQSVVNQQGEALSQLTSQLQKNFQAISSSIAEIYNRLEKVEADAHVDRLITGRLAALNAYVSQTLTQYAEVKASRQLAMEKVNECVKSQSDRYGFCGTRG
AEY70254.1 spike protein, partial [Feline coronavirus]	----IFIGGMALGSITSAVAVPFAMQVQARLNYVALQTDVLQENQKILANAFNNAIGNITLALGKVSNVITTISDGFNSMASALTKIQSVVNQQGEALSQLTSQLQKNFQAISSSIAEIYNRLEKVEADAQVDRLITGRLAALNAYVSQTLTQYAEVKASRQMAMEKVNECVKSQSDRYGFCGN--
AEY70247.1 spike protein, partia

In [18]:
# Calculate a phylogenetic tree from the protein alignment and print it.
# In the printed tree the tips are named by accession only (e.g. AEY70247.1),
# not by the full FASTA header used as the sequence label.
tree = feline_spike_alignment.calc_tree()
print(tree.tree)

(AEY70247.1:0.01399,AEY70251.1:0.01237,((((AEY70255.1:0.19026,AEY70248.1:0.02555)0.380:0.00054,(AEY70246.1:0.00053,AEY70249.1:0.01535)0.838:0.00507)0.762:0.00504,AEY70250.1:0.00506)0.853:0.01016,((AEY70254.1:0.02125,AEY70252.1:0.00503)0.983:0.00446,AEY70253.1:0.02451)0.052:0.00056)0.741:0.00434);

