# RandomForestClassifierCustom

In [1]:
from custom_random_forest import RandomForestClassifierCustom
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
# Seed both the synthetic data and the split so the timings and the prediction
# comparison below are reproducible after Restart Kernel -> Run All.
# 42 matches the random_state passed to the forest itself.
X, y = make_classification(n_samples=100000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
# Forest instance reused by every fit/predict timing cell below.
parallel_tree = RandomForestClassifierCustom(
    n_estimators=10,
    max_depth=30,
    max_features=2,
    random_state=42,
)

**Один процесс**

In [4]:
%%time
# Baseline: fit with n_jobs=1 (single process); %%time records the cost of this cell.
parallel_tree.fit(X_train, y_train, n_jobs=1)

CPU times: user 48.5 ms, sys: 77.4 ms, total: 126 ms
Wall time: 2.22 s


In [5]:
%%time
# Baseline predictions with n_jobs=1; kept for comparison with the two-process run.
results_1 = parallel_tree.predict(X_test, n_jobs=1)

CPU times: user 67.3 ms, sys: 55.2 ms, total: 123 ms
Wall time: 191 ms


**Два процесса**

In [6]:
%%time
# Refit the same forest object on the same data, this time with n_jobs=2.
parallel_tree.fit(X_train, y_train, n_jobs=2)

CPU times: user 34.6 ms, sys: 92 ms, total: 127 ms
Wall time: 1.28 s


In [7]:
%%time
# Predictions from the forest refitted with n_jobs=2, predicted with n_jobs=2.
results_2 = parallel_tree.predict(X_test, n_jobs=2)

CPU times: user 66.5 ms, sys: 63 ms, total: 130 ms
Wall time: 178 ms


**Сравниваем предсказания**

In [8]:
# True only if every prediction of the one-process and two-process runs matches
# (assumes predict() returns array-like objects supporting element-wise == — TODO confirm).
(results_1 == results_2).all()

True

# OpenFasta

In [9]:
from bio_files_processor import OpenFasta

In [10]:
# Relative path to the example FASTA file opened by the OpenFasta demo below.
path_to_file = 'data/example_fasta.fasta'

In [11]:
# Collect every record yielded by OpenFasta, echoing each one as it is read.
fasta_list = []

with OpenFasta(path_to_file) as fasta_handle:
    for record in fasta_handle:
        print(record)
        fasta_list.append(record)

<class FastaRecord>, id='GTD323452', seq='ACGGC...'. =(
<class FastaRecord>, id='GTD678345', seq='TTGGC...'. =(
<class FastaRecord>, id='GTD174893', seq='TTGAA...'. =(
<class FastaRecord>, id='GTD906783', seq='TTGAA...'. =(
<class FastaRecord>, id='GTD129563', seq='CGGAC...'. =(


In [12]:
first_record = fasta_list[0]

# Printed form of the record first, then its description attribute.
print(first_record)
print(first_record.description)

<class FastaRecord>, id='GTD323452', seq='ACGGC...'. =(
5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)


In [13]:
# Current value of the wish_beauty flag on the first record.
print(fasta_list[0].wish_beauty)

False


In [14]:
# Turn the flag on and display the record again; judging by the output below,
# this changes how the record is rendered.
# NOTE(review): this mutates fasta_list[0] in place, so re-running earlier cells
# that print this record will show the new rendering.
fasta_list[0].wish_beauty = True
fasta_list[0]

<class FastaRecord>, id='GTD323452', seq='ACGGC...'
             __
        _   /  |
       | \  \/_/
       \_\| / __              
          \/_/__\           .--=/~\
   ____,__/__,_____,______)/  /{~}}}
   -,-----,--\--,-----,---,\  \{{{~}
           __/\_            --=.\}/
          /_/ |\\
               \/

# Genscan

In [15]:
import requests

from general import run_genscan

## Пример 1

Проверяем отправку файла

[Сиквенс для примера](https://plants.ensembl.org/Zea_mays/Gene/Summary?db=otherfeatures;g=Zm00001d027233;r=1:94551-130553;t=Zm00001d027233_T001)

In [16]:
# Submit the sequence as a FASTA file only (no inline sequence string).
result_1 = run_genscan(
    sequence=None,
    sequence_file='data/maize.fasta',
    organism='Maize',
    exon_cutoff=0.05,
)

In [17]:
# Status stored on the result (presumably the HTTP status code of the request — confirm in general.py).
result_1.status

200

In [18]:
# First five entries of exon_list; each looks like (label, start, end) — confirm field meaning in general.py.
result_1.exon_list[:5]

[('1.01', 614, 890),
 ('1.02', 972, 1134),
 ('1.03', 4126, 4273),
 ('1.04', 4498, 4503),
 ('2.00', 5189, 5228)]

In [19]:
# First five entries of intron_list, shown in the same three-field tuple layout as the exons.
result_1.intron_list[:5]

[('1.01', 891, 971),
 ('1.02', 1135, 4125),
 ('2.00', 5229, 7813),
 ('3.00', 9648, 12435),
 ('4.00', 15281, 17268)]

In [20]:
# First entry of cds_list: a (FASTA-style header, nucleotide sequence) pair, judging by the output.
result_1.cds_list[0]

('>/tmp/04_30_24-06:01:11.fasta|GENSCAN_predicted_CDS_1|588_bp',
 'atggaggacccaacatatacccctgaggttgtacaccctacagcggatgcaacggaacctgatggatcctcagttaccgcctgcgactgggttatccctgaattcggttccatgccaaggcgctaccttgccaacaaatgggagtgcaagcataaccgaacaaggaagtggggtggagcatacacaagcgaagccaagggtgatgatgatgatgaggttgtcatattcagagaagataatgacgacgacgacgagggatacatattcgctgaccaatatgacgagaccgacgaggacatcgagatcgatggtactcaagatgaatctactgccactgatgtgcctgacccgtacgacaaggtgtacagtaacctccctgaagaaacacatatgctgaagcttgttcctgactgcggttattgcaccgcgaagaagtttgaagcgacaacccgatcgaacataaagatccttgtcatcccagtcgtcgatggaaagaagaggtcgaggaagggtgtaagaaagaaccccacaatagattgtgggacatacaccaagaacatcgtctacaaggaggtcctaataaactag')

## Пример 2

Проверяем отправку строкой. Загрузим сиквенс с NCBI.

[Сиквенс для примера](https://www.ncbi.nlm.nih.gov/nuccore/NC_003074.8?report=fasta&from=314693&to=317431&strand=true)

In [21]:
# Download the example region from NCBI in FASTA form.
url = 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi'

query_string_params = {
    'id': '240255695',
    'db': 'nuccore',
    'report': 'fasta',
    'from': '314693',
    'to': '317431',
    'strand': 'on',
    'retmode': 'html',
}

# requests has no default timeout; without one a stalled connection would hang the notebook.
response = requests.get(url, params=query_string_params, timeout=30)
# Fail here with a clear HTTP error instead of an IndexError while parsing an error page.
response.raise_for_status()

# Keep the text that follows the word 'sequence' (presumably the end of the FASTA
# header line in this response — TODO confirm) and join the lines into one string.
sequence_ara = ''.join(response.text.split('sequence')[1].split('\n'))

In [22]:
# HTTP status code of the NCBI request made in the previous cell.
response.status_code

200

In [23]:
# Peek at the first 100 characters to check that only nucleotides were extracted.
sequence_ara[:100]

'TGTTTGGGCTCAATTTGGGCTCTGTCTGTTATCCACAAATTAAGAGACGCAAACCTAAAACGACGCCGTTCCATAGTTTTCCTTTGAAGAAGAGAGAGAC'

In [24]:
# A warning is expected when both a sequence and a file are given.
# The website gives priority to the pasted sequence, and this tool does the same:
# the file is ignored. The path matches the file used in the first example.
result_2 = run_genscan(
    sequence=sequence_ara,
    sequence_file='data/maize.fasta',
    organism='Arabidopsis',
    exon_cutoff=0.05,
)



In [25]:
# Status stored on the result (presumably the HTTP status code of the request — confirm in general.py).
result_2.status

200

In [26]:
# First five entries of exon_list for the Arabidopsis sequence.
result_2.exon_list[:5]

[('1.01', 77, 232),
 ('1.02', 523, 585),
 ('1.03', 797, 878),
 ('1.04', 947, 1065),
 ('1.05', 1149, 1244)]

In [27]:
# First five entries of intron_list for the Arabidopsis sequence.
result_2.intron_list[:5]

[('1.01', 233, 522),
 ('1.02', 586, 796),
 ('1.03', 879, 946),
 ('1.04', 1066, 1148),
 ('1.05', 1245, 1367)]

In [28]:
# First entry of cds_list: a (FASTA-style header, nucleotide sequence) pair, judging by the output.
result_2.cds_list[0]

('>/tmp/04_30_24-06:01:22.fasta|GENSCAN_predicted_CDS_1|1008_bp',
 'ttttcctttgaagaagagagagacgtcacaaaggaaagcactcaatcgtcatttcctcggtatctcctcctcgaaagaagaatgcctggaattagaggtccttcggaatactcgcaggaaccacctcgtcacccttctctcaaggtcaacgccaagctactccgtcacccttactggattgatccagaacccgagaaagctctttatcaaagacatcagtgtgccggtaacagaaggactgccatgagcaaagttaggaatgttagaggtgttggatgggatgtttctgctattggcaacgctgtctggggtggggcgaaactggccgatgttcttgagcttgtggggataccaaagctgactgcttctaccaatttaggagccagacatgttgagttcgttagtgttgatcgctgtaaggaggaaaatgggggcccttataaggcgtcaatcactctaagtcaagccacaaatcctgaagcggatgttctactcgcttatgagatgaatggagagaccctgaacagggatcacggatttccgttaagggtggttgtccctggtgtgattggtgctcgttcggtcaaatggcttgattccatcaatgtcatcgctgaagaaagccagagtgcaatctgctctgtggaggatgtgcaaatggtgaagcctggaaaggtaagtatcaaaggatatgcggtttcaggaggtggacgcgggatagaaagagtggacatatccctggatggaggcaaaaactgggtggaagcttctagaacgcaggaaccaggaaagcagtacatctcagaacacagctccagtgacaaatgggcatgggtgttgtttgaagccaccattgatgtttcacagactacagaggtcatcgccaaagcggttgattcggcggcgaatgttcaaccggaaaatgtggagtc

## Пример 3

Проверим какой-нибудь смешной сиквенс без экзонов

In [29]:
# Short 24-nt toy sequence used to show the result when no exons are predicted.
no_ex_seq = 'agagagagagagagaagagaggaa'

In [30]:
# Inline sequence only; sequence_file is left at its default.
result_3 = run_genscan(
    sequence=no_ex_seq,
    organism='Vertebrate',
    exon_cutoff=0.05,
)

In [31]:
# Display the whole result object; the exon, CDS and intron lists are None in the output below.
result_3

GenscanOutput(status=200, exon_list=None, cds_list=None, intron_list=None)

## Пример 4

Если ошиблись в параметрах

In [32]:
# The error itself is the demonstration: exon_cutoff=5 is not an accepted value,
# so run_genscan raises ValueError. Catch and print it so that
# Restart Kernel -> Run All still reaches the cells below.
try:
    result_4 = run_genscan(sequence=sequence_ara, organism='Ara', exon_cutoff=5)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Incorrect input of "exon_cutoff": 5! Should be: 1.00, 0.50, 0.25, 0.05, 0.02 or 0.01

# BioSeqs Classes

In [33]:
from general import DNASequence, RNASequence, AminoAcidSequence

In [34]:
# The trailing 'U' makes this string an invalid DNA sequence;
# check_sequence() returns False for it (see the output below).
wrong_dna_test = DNASequence('AAAACGTacgttttU')

wrong_dna_test.check_sequence()

False

In [35]:
# Same sequence as above but with a final 't', so it contains only A/C/G/T in mixed case.
dna_test = DNASequence('AAAACGTacgttttt')

# gc_content is read as an attribute, not called.
print(dna_test.gc_content)

# transcribe() returns a new object; its type is printed below.
transcribed_seq = dna_test.transcribe()

print(transcribed_seq)

print(type(transcribed_seq))

26.7
AAAACGUacguuuuu
<class 'general.RNASequence'>


In [36]:
# Build an RNASequence from the raw string held in the transcribed object's .sequence attribute.
rna_test = RNASequence(transcribed_seq.sequence)

# The complement keeps the letter case of the input (see the output below).
print(rna_test.complement())

UUUUGCAugcaaaaa


In [37]:
# A 20-residue peptide with 20 distinct one-letter codes.
protein_test = AminoAcidSequence('LKMFPSTWYVARNDCQEGHI')

# gravy is read as an attribute (presumably the GRAVY hydropathy score — confirm in general.py).
protein_test.gravy

-0.49

# Test

In [38]:
# Run the project's pytest suite in a subprocess from the notebook's working directory.
! python3 -m pytest

platform linux -- Python 3.10.12, pytest-7.4.2, pluggy-1.3.0
rootdir: /home/holydiver/Main/IB/2_Python/18/DAYWWYD_HW18
plugins: anyio-4.0.0
collected 8 items

test_general.py ........                                                 [100%]

general.py:18
  Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
  (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
  but was not found to be installed on your system.
  If this would cause problems for you,
  please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
          
    import pandas as pd

