# Examples of module operation

## Custom Random Forest

In [1]:
import numpy as np
from custom_random_forest import RandomForestClassifierCustom
from sklearn.datasets import make_classification


In [2]:
# Seed the synthetic dataset as well as the forest, so that a fresh
# "Restart & Run All" works on exactly the same data every time.
X, y = make_classification(n_samples=1_000_000, random_state=42)
random_forest = RandomForestClassifierCustom(
    max_depth=30, n_estimators=30, max_features=2, random_state=42
)

In [3]:
%%time

# Baseline fit: no n_jobs argument is passed, so the estimator's default is used.
random_forest.fit(X, y)

CPU times: user 277 ms, sys: 550 ms, total: 827 ms
Wall time: 3min 18s


In [4]:
%%time

random_forest.fit(X, y, n_jobs = 2)

CPU times: user 563 ms, sys: 1.14 s, total: 1.7 s
Wall time: 1min 53s


In [5]:
%%time

# Predict without an n_jobs argument; the result is kept under its own name
# so it can be compared with the n_jobs=2 predictions.
predict_one_proc = random_forest.predict(X)

CPU times: user 678 ms, sys: 1.33 s, total: 2 s
Wall time: 6.02 s


In [6]:
%%time

predict_two_proc = random_forest.predict(X, n_jobs = 2)

CPU times: user 847 ms, sys: 1.94 s, total: 2.79 s
Wall time: 4.85 s


In [7]:
# Sanity check: the n_jobs=2 predictions must be identical to those of the default call.
np.array_equal(predict_one_proc, predict_two_proc)

True

## Custom Genscan API

In [8]:
from bioinformatics_toolkit import run_genscan

In [9]:
# Call run_genscan on the example sequence file with an exon cutoff of 0.5.
genscan_result = run_genscan(
    sequence_file='./data/sequence.txt',
    exon_cutoff=0.5,
)

In [10]:
# Show the exon_list attribute of the result: in the recorded output each entry
# maps a running index to a dict with 'Type', 'Begin' and 'End'.
# NOTE(review): 'Intr' is presumably Genscan's "internal exon" type, not "intron" - confirm.
genscan_result.exon_list

[{1: {'Type': 'Intr', 'Begin': 696, 'End': 779}},
 {2: {'Type': 'Intr', 'Begin': 3570, 'End': 3818}},
 {3: {'Type': 'Intr', 'Begin': 10735, 'End': 10775}},
 {4: {'Type': 'Intr', 'Begin': 10992, 'End': 11106}},
 {5: {'Type': 'Intr', 'Begin': 13936, 'End': 13985}},
 {6: {'Type': 'Intr', 'Begin': 15412, 'End': 15523}},
 {7: {'Type': 'Intr', 'Begin': 16765, 'End': 17880}},
 {8: {'Type': 'Intr', 'Begin': 19361, 'End': 19450}},
 {9: {'Type': 'Intr', 'Begin': 20758, 'End': 25689}},
 {10: {'Type': 'Intr', 'Begin': 31320, 'End': 31389}},
 {11: {'Type': 'Intr', 'Begin': 39354, 'End': 39781}},
 {12: {'Type': 'Intr', 'Begin': 40921, 'End': 41102}},
 {13: {'Type': 'Intr', 'Begin': 42235, 'End': 42422}},
 {14: {'Type': 'Intr', 'Begin': 47672, 'End': 48026}},
 {15: {'Type': 'Intr', 'Begin': 54895, 'End': 55050}},
 {16: {'Type': 'Intr', 'Begin': 55449, 'End': 55593}},
 {17: {'Type': 'Intr', 'Begin': 61163, 'End': 61284}},
 {18: {'Type': 'Intr', 'Begin': 63810, 'End': 64008}},
 {19: {'Type': 'Intr', 'B

In [11]:
# Show the intron_list attribute of the result: in the recorded output each entry
# maps a running index to a dict with 'Begin' and 'End'.
genscan_result.intron_list

[{1: {'Begin': 780, 'End': 3569}},
 {2: {'Begin': 3819, 'End': 10734}},
 {3: {'Begin': 10776, 'End': 10991}},
 {4: {'Begin': 11107, 'End': 13935}},
 {5: {'Begin': 13986, 'End': 15411}},
 {6: {'Begin': 15524, 'End': 16764}},
 {7: {'Begin': 17881, 'End': 19360}},
 {8: {'Begin': 19451, 'End': 20757}},
 {9: {'Begin': 25690, 'End': 31319}},
 {10: {'Begin': 31390, 'End': 39353}},
 {11: {'Begin': 39782, 'End': 40920}},
 {12: {'Begin': 41103, 'End': 42234}},
 {13: {'Begin': 42423, 'End': 47671}},
 {14: {'Begin': 48027, 'End': 54894}},
 {15: {'Begin': 55051, 'End': 55448}},
 {16: {'Begin': 55594, 'End': 61162}},
 {17: {'Begin': 61285, 'End': 63809}},
 {18: {'Begin': 64009, 'End': 64242}},
 {19: {'Begin': 64407, 'End': 64499}},
 {20: {'Begin': 64639, 'End': 70522}},
 {21: {'Begin': 70545, 'End': 79208}},
 {22: {'Begin': 79427, 'End': 81390}}]

In [12]:
# Show the cds_list attribute of the result; its entries are strings that are
# later passed to AminoAcidSequence, so they appear to be protein sequences.
genscan_result.cds_list

['XALATFRVLNVASGTGLDSTAVKCSHPHNLGPISLNWFEELSSEAPPYNSEPAEESEHKNNNYEPNLFKTPQRKPSYNQLASTPIIFKEQGLTLPLYQSPVKELDKFKLDLVVCGSLFHTPKFVKGRQTPKHISESLGAEVDPDMSWSSSLATPPTLSSTVLIVRNEEASETVFPHDTTANVKSYFSNHDESLKKNDRFIASVTDSENTNQREAASHGFGKTSGNSFKVNSCKDHIGKSMPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSVHLATKLYVYKVILSPYEKTLVNSSVNSRVGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFKNSTMVLYGDTGDKQATQVSIKKDLVYVLAEENKNSVKQHIKMTLGQDLKSDISLNIDKIPEKNNDYMNKWAGLLGPISNHSFGGSFRTASNK

## OpenFasta context manager

In [13]:
from bio_files_processor import OpenFasta

In [14]:
fasta_file_path = './data/example_fasta.fasta'

# Read and print five consecutive records from the example FASTA file.
with OpenFasta(fasta_file_path) as fasta_file:
    for record_index in range(5):
        print(fasta_file.read_record())


id = GTD323452
description = 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
sequence = ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG
id = GTD678345
description = 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
sequence = TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT
id = GTD174893
description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
sequence = TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT
id = GTD906783
description = 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
sequence = TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTG

## Biological Sequence API

In [15]:
from bioinformatics_toolkit import DNASequence, RNASequence, AminoAcidSequence

### DNASequence

In [16]:
# Re-open the example FASTA file and keep the sequence of its first record
# as the input for the sequence-class examples.
with OpenFasta(fasta_file_path) as fasta_file:
    first_record = fasta_file.read_record()
    my_DNA_seq = first_record.seq

In [17]:
DNA_seq = DNASequence(my_DNA_seq)

# Transcribe once and reuse the result, instead of recomputing it for the
# string conversion, the type display and the printed value.
transcribed_seq = DNA_seq.transcribe()
my_RNA_seq = str(transcribed_seq)
print(f'{type(transcribed_seq)}: {transcribed_seq}')

<class 'bioinformatics_toolkit.RNASequence'>: UGCCGGUAUCCUGAAACUUUCGUGGCGUAGGGCAGGCUAGACGCUUCAAUUGGUUCUACGGCGGACCAAUCAUGGUACCACCCCCUGGUGUACCCUUAGGGACCACGACAC


### RNASequence

In [18]:
# Wrap the transcript string in RNASequence and print its complement.
# In the recorded output this matches the first FASTA record with U in place of T.
RNA_seq = RNASequence(my_RNA_seq)
print(RNA_seq.complement())

ACGGCCAUAGGACUUUGAAAGCACCGCAUCCCGUCCGAUCUGCGAAGUUAACCAAGAUGCCGCCUGGUUAGUACCAUGGUGGGGGACCACAUGGGAAUCCCUGGUGCUGUG


### AminoAcidSequence

In [19]:
# Build an AminoAcidSequence from a Genscan CDS entry and check its alphabet.
# NOTE(review): index 1 selects the second entry of cds_list, not the first - confirm this is intended.
amino_acid_seq = AminoAcidSequence(genscan_result.cds_list[1])
amino_acid_seq.check_alphabet()

True

In [20]:
# Molecular weight of the sequence built in the previous cell
# (units are not shown here - TODO confirm against the method's documentation).
amino_acid_seq.count_molecular_weight()

43076