In [None]:
# Biopython experiments: list the fields reported by EInfo for the 'taxonomy' database
from Bio import Entrez

Entrez.email = "alakhrasyunes@gmail.com"

# Template applied to every entry of DbInfo/FieldList (each entry is used as a mapping).
field_line = '%(Name)s, %(FullName)s, %(Description)s'

handle = Entrez.einfo(db='taxonomy')
result = Entrez.read(handle)
taxonomy_fields = result['DbInfo']['FieldList']
for field in taxonomy_fields:
    print(field_line % field)

In [None]:
# Tests of code
import xml.etree.ElementTree as ET
from Bio import Entrez
from Bio.SeqIO import FastaIO
import pandas as pd
import re

Entrez.email = 'alakhrayunes@gmail.com'
request = 'Ybey RNAse'
dbase = 'protfam'

# Search `dbase` for the request, asking for a single hit (retmax = 1).
handle = Entrez.esearch(db = dbase, term = request, retmax = 1)
result = Entrez.read(handle)
handle.close()

# Replace the search result by the summary of the returned record.
# The loop variable is `uid`, not `id`: in a notebook a top-level `id` would
# shadow the builtin id() for every cell executed afterwards.
for uid in result['IdList']:
    handle = Entrez.esummary(db = dbase, id = uid, retmode = 'xml')
    result = Entrez.read(handle)
    handle.close()

# When the search returned no ids this still prints the search result itself.
print(result)
# print(ET.fromstring(result)[])

In [None]:
# Get Data
from Bio import Entrez
from Bio.SeqIO import FastaIO
import pandas as pd
import re

def search_ybey_nuccore(species):
    """Run an ESearch for Ybey records of `species` in the 'nuccore' database.

    The query looks for "Ybey" in the title, prot or gene field and appends
    `species` verbatim after AND.

    Returns the 'RetMax' entry of the parsed ESearch reply converted to int.
    NOTE(review): with retmax = 1 this is presumably meant as a 0/1 presence
    flag -- confirm against the ESearch response format.
    """
    # TODO: check the nuccore record itself (does it exist, is it complete, etc.) before the Ybey check
    query = f'(Ybey[title] OR Ybey[prot] OR Ybey[gene]) AND {species}'
    search_reply = Entrez.read(Entrez.esearch(db = 'nuccore', term = query, retmax = 1))
    return int(search_reply['RetMax'])

def search_ybey_protein(species):
    """Run an ESearch for Ybey-titled records of `species` in the 'protein' database.

    `species` is appended verbatim to the query after AND.

    Returns the 'RetMax' entry of the parsed ESearch reply converted to int.
    NOTE(review): with retmax = 1 this is presumably meant as a 0/1 presence
    flag -- confirm against the ESearch response format.
    """
    query = f'Ybey[title] AND {species}'
    search_reply = Entrez.read(Entrez.esearch(db = 'protein', term = query, retmax = 1))
    return int(search_reply['RetMax'])

def us11_analysis(sequence):
    """Classify `sequence` by the two letters that follow a 'd' and six further characters.

    The endings are tried in the order ng, dg, gg, np, lg, fp and the first one
    that occurs anywhere in `sequence` decides the result:

    * 1 for 'ng' or 'dg',
    * 0 for 'gg', 'np', 'lg' or 'fp',
    * the string 'Not frequent motif or abscent' when none of them occurs.
    """
    endings = ['ng', 'dg', 'gg', 'np', 'lg', 'fp']
    hits_per_ending = (re.findall('d.{6}%s' % ending, sequence) for ending in endings)
    first_hits = next((hits for hits in hits_per_ending if hits != []), None)
    if first_hits is None:
        return 'Not frequent motif or abscent'
    # A hit is 9 characters long, so [7:] is the two-letter ending that matched.
    if first_hits[0][7:] in ['ng', 'dg']:
        return 1  # ng and dg groups
    return 0  # other groups
# Maybe it would be better to use Ybey instead of S11
# The retry below needs these exception classes in scope. They were never
# imported, so the `except HTTPException` clause itself failed with NameError
# as soon as a fetch raised anything.
from http.client import HTTPException
from urllib.error import HTTPError

Entrez.email = 'alakhrayunes@gmail.com'
requests = ['S11[Title] AND ribosomal[Title] AND protein[Title] AND bacteria[filter]',
           'S11[Title] AND ribosomal[Title] AND protein[Title] AND mitochondrial[Title] AND Animals[Filter]',
           'S11[Title] AND ribosomal[Title] AND protein[Title] AND (mitochondrial[Title] OR chloroplast[Title]) AND Plants[Filter]',
           'S11[Title] AND ribosomal[Title] AND mitochondrial[Title] AND Fungi[Filter]',
            'S11[Title] AND ribosomal[Title] AND protein[Title] AND (mitochondrial[Title] OR chloroplast[Title]) AND Protists[Filter]',
          ] # For fungi, 'S30' and 'S37' can be added to [filter], as well as 'chloroplastic'


def _fetch_protein_record(protein_id):
    """Fetch one 'protein' record (rettype 'gp', XML) and return the first parsed entry."""
    handle = Entrez.efetch(db = 'protein', id = protein_id, rettype = 'gp', retmode = 'xml')
    try:
        return Entrez.read(handle)[0]
    finally:
        handle.close()


def _short_species_name(organism):
    """Keep the first two words of `organism`, or everything up to the word after 'sp.'."""
    words = organism.split(' ')
    if 'sp.' in words:
        words = words[:words.index('sp.') + 2]
    else:
        words = words[:2]
    return ' '.join(words)


all_species = []        # species in first-seen order
seen_species = set()    # same content as all_species, for constant-time membership tests
info_dict = {'species': [], 'ybey':[],'taxon': [], 's11_sequence': [], 's11_group': [], 'id': []}
for request in requests:
    handle = Entrez.esearch(db = 'protein', term = request, retmax = 3000)
    results = Entrez.read(handle)
    handle.close()
    for result in results["IdList"]:
        try:
            respond = _fetch_protein_record(result)
        except (HTTPException, HTTPError):
            # One retry only; a second failure is allowed to propagate.
            print ("Second (and final) attempt...")
            respond = _fetch_protein_record(result)
        species = _short_species_name(respond['GBSeq_organism'])
        if species in seen_species:
            # Only the first record seen for a species is recorded.
            continue
        seen_species.add(species)
        ybey_presence = search_ybey_nuccore(species)
        all_species.append(species)
        info_dict['species'].append(species)
        info_dict['ybey'].append(ybey_presence)
        info_dict['taxon'].append(respond['GBSeq_taxonomy'])
        info_dict['s11_sequence'].append(respond['GBSeq_sequence'])
        info_dict['s11_group'].append(us11_analysis(respond['GBSeq_sequence']))
        info_dict['id'].append(result)
info_table = pd.DataFrame.from_dict(info_dict)
# Begin of the testing segment
# info_table.drop('')
# End of the testing segment
info_table.to_csv('info_table.csv', index = False)

In [None]:
# pandas tests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

info_table = pd.read_csv('info_table.csv')
# Rows whose motif could not be classified carry this text instead of 0/1; drop them
# so that the column can be converted to int.
info_table = info_table.loc[info_table['s11_group'] != 'Not frequent motif or abscent']
info_table = info_table.astype({'s11_group':int, 'ybey': int})
for domain in ['Eukaryota', 'Bacteria']:
    # regex = False: the lineage is matched as plain text; na = False: rows without a
    # lineage are left out instead of producing a NaN mask that .loc rejects.
    ddf = info_table.loc[info_table['taxon'].str.contains(domain, regex = False, na = False)]
    ddf = ddf.drop(columns = 'id')
    plot = ddf.plot.hexbin(x = 's11_group', y = 'ybey', gridsize = (2, 1), title = f'distribution for {domain}', sharex = False, xticks = (0, 1), yticks = (0, 1) )
    # Correlate only the two numeric columns: corr() over the whole frame raises on
    # pandas >= 2.0 because the remaining columns hold text.
    corr = ddf[['ybey', 's11_group']].corr()
    print(plot)
    print(corr)
    step = 0
    # sorted(): a bare set is iterated in a different order on every kernel start,
    # which made the ten plotted lineages change from run to run.
    for genus in sorted(set(ddf['taxon'])):
        ranks = genus.split('; ')
        if len(ranks) < 2:
            # Too short a lineage to have a parent rank (ranks[-2] would raise IndexError).
            continue
        family = ranks[:-2]
        genus_title = ranks[-2]
        # Literal prefix match; lineage names can contain regex metacharacters such as '(' or '.'.
        gdf = ddf.loc[ddf['taxon'].str.contains('; '.join(family), regex = False)]
        if len(gdf) < 30:
            continue
        plot = gdf.plot.hexbin(x = 's11_group', y = 'ybey', gridsize = (1,2), title = f'distribution for {genus_title}', sharex = False, xticks = (0, 1), yticks = (0, 1),extent = (0.7,1,0,1) )
        print(plot)
        step += 1
        if step == 10:
            # Show at most ten lineage plots per domain.
            break

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

info_table = pd.read_csv('info_table2.csv')
# Rows whose motif could not be classified carry this text instead of 0/1; drop them
# so that the column can be converted to int.
info_table = info_table.loc[info_table['s11_group'] != 'Not frequent motif or abscent']
info_table = info_table.astype({'s11_group':int,})
for domain in ['Eukaryota', 'Bacteria']:
    # regex = False: the lineage is matched as plain text; na = False: rows without a
    # lineage are left out instead of producing a NaN mask that .loc rejects.
    ddf = info_table.loc[info_table['taxon'].str.contains(domain, regex = False, na = False)]
    ddf = ddf.drop(columns = 'id')
    # numeric_only = True: mean() over text columns raises on pandas >= 2.0.
    print(f'mean for {domain}' ,ddf.mean(numeric_only = True), sep = '\n')
    step = 0
    # sorted(): a bare set is iterated in a different order on every kernel start.
    for genus in sorted(set(ddf['taxon'])):
        ranks = genus.split('; ')
        if len(ranks) < 2:
            # Explicit guard instead of a bare `except:` around ranks[-2].
            continue
        family = ranks[:-2]
        genus_title = ranks[-2]
        # Literal prefix match; lineage names can contain regex metacharacters.
        gdf = ddf.loc[ddf['taxon'].str.contains('; '.join(family), regex = False)]
        if len(gdf) < 30:
            continue
        print(f'mean for {genus_title}', gdf.mean(numeric_only = True), sep = '\n')
        step += 1
        if step == 10:
            # Report at most ten lineages per domain.
            break

In [None]:
import re
# Scratch check of indexing into a regex hit: every hit is an 'a' plus the four
# characters after it, so index 4 is the last character of the hit.
a = re.findall('a.{4}', 'abbbbaffffaqqqq')
a[0][4]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

non_ybey = pd.read_csv('non_ybey.csv')
with_ybey = pd.read_csv('info_table.csv')

# Each chart is drawn on its own figure so the two bar charts cannot end up on
# the same axes and each saved PNG holds exactly one of them.
# NOTE(review): the titles were attached to the opposite tables (the non_ybey data
# was labelled "possessing Ybey"); they now follow the variable and file names --
# confirm this is the intended labelling.
non_ybey_fig, non_ybey_ax = plt.subplots()
non_ybey_plot = non_ybey.groupby(['s11_group']).size().plot.bar(ax = non_ybey_ax, title = 'uS11m distribution in taxons without Ybey')
non_ybey_plot.get_figure().savefig('non_ybey_plot.png')

with_ybey_fig, with_ybey_ax = plt.subplots()
with_ybey_plot = with_ybey.groupby(['s11_group']).size().plot.bar(ax = with_ybey_ax, title = 'uS11m distribution in taxons possessing Ybey')
with_ybey_plot.get_figure().savefig('with_ybey_plot.png')

In [None]:
from Bio import Entrez

def species_treater(answer):
    """Return the organism name of the first document summary without parenthesised notes.

    `answer` is indexed as answer['DocumentSummarySet']['DocumentSummary'][0]['Organism'];
    that string is split on single spaces and every word that belongs to a
    parenthesised part is dropped, including the words strictly inside a note
    of three or more words (e.g. "(high GC Gram+)"), which used to leak
    through as "GC". The remaining words are joined with single spaces.
    """
    organism = answer['DocumentSummarySet']['DocumentSummary'][0]['Organism']
    kept = []
    depth = 0  # number of parentheses currently open
    for word in organism.split(' '):
        opened = word.count('(')
        closed = word.count(')')
        # Keep a word only when it is outside every parenthesis and carries none itself.
        if depth == 0 and opened == 0 and closed == 0:
            kept.append(word)
        # A stray ')' must not push the depth below zero.
        depth = max(depth + opened - closed, 0)
    return ' '.join(kept)

Entrez.email = 'alakhrasyunes@gmail.com'

requests = ['Bacteria[organism] AND latest[filter] AND "complete genome"[filter] NOT anomalous[filter]',
            'Eukaryota[organism] AND latest[filter] AND "complete genome"[filter] NOT anomalous[filter]'
]
# Distinct organism names in first-seen order.
species_list = []
for request in requests:
    handle = Entrez.esearch(db = 'assembly', term = request, retmax = 1000)
    response = Entrez.read(handle)
    handle.close()
    # `uid` instead of `id`: a top-level `id` would shadow the builtin id() for
    # every cell executed afterwards in this kernel.
    for uid in response['IdList']:
        handle = Entrez.esummary(db = 'assembly', id = uid, retmode = 'xml')
        answer = Entrez.read(handle, validate = False)
        handle.close()
        result = species_treater(answer)
        if result not in species_list:
            species_list.append(result)

In [None]:
# Scratch check: rebinding the loop variable inside the body does not change
# which item the next iteration receives.
for species in ('a', 'b', 't'):
    print(species, 'q', sep = '\n')
    species = 'q'

In [None]:
def us11_analysis(sequence):
    """Classify `sequence` by the last 'd' + 8 characters window in its final 30 characters.

    Returns
    -------
    'NotRepresentative'
        when `sequence` is shorter than 100 characters or its last 30
        characters contain no 'd' followed by eight more characters.
    1
        when the last such window (overlapping windows are considered) ends in
        'ng' or 'dg'.
    0
        for any other ending.

    NOTE(review): this redefines the us11_analysis of an earlier cell with
    different return strings -- whichever cell ran last wins in the kernel.
    """
    unclassified = 'NotRepresentative'
    if len(sequence) < 100:
        return unclassified
    # The lookahead makes findall report overlapping 9-character windows.
    windows = re.findall('(?=(d.{8}))', sequence[-30:])
    if windows == []:
        return unclassified
    ending = windows[-1][7:]
    if ending in ['ng', 'dg']:
        return 1  # ng and dg groups
    return 0  # other groups

us11_analysis('dddddvtpiphngvrprkrrrv')

In [None]:
# Scratch check: print the str() form of a list of ints.
print(str([123,1322]))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

ybey_info = pd.read_csv('info_table_real1.csv')
# One figure with two side-by-side panels: left for ybey_group == 1, right for ybey_group == 0.
fig, axes = plt.subplots(1,2)
print(axes)
pos_ybey = ybey_info.loc[ybey_info['ybey_group'] == 1]
neg_ybey = ybey_info.loc[ybey_info['ybey_group'] == 0]
# The original line ended in `title = )`, a syntax error that kept the whole cell
# from running. Each call draws one Series on the axes passed via `ax`, so
# `subplots = True` is not needed and every panel gets its own title.
pos_ybey_plot = pos_ybey.groupby('s11_group').size().plot.bar(ax = axes[0], title = 'ybey_group = 1')
neg_ybey_plot = neg_ybey.groupby('s11_group').size().plot.bar(ax = axes[1], title = 'ybey_group = 0')
print(fig)


In [None]:
from Bio import Entrez

Entrez.email = 'alakh@gmail.com'

# Probe what ESearch returns for a made-up organism name.
handle = Entrez.esearch(term = 'afsaffsasfsafasf[Organism] AND Ybey', db = 'protein', retmax = 1)
# The handle is read exactly once; the former `for i in range(1)` wrapper added nothing.
result = Entrez.read(handle)
handle.close()

print(result)

In [None]:
import pandas as pd
import re

# True when the text contains "bacterium" or "sp" as a whole word; "bacteriumales"
# alone does not count because of the \b word boundaries.
# `is not None` is the identity test for a missing match (instead of `!= None`).
is_generic_name = re.search(r'(\bbacterium\b)|(\bsp\b)', ' bacteriumales sp.') is not None
is_generic_name


In [None]:
import pandas as pd

table = pd.read_csv('uS11_tabs/uS11m.csv')
# table = table.loc[table['scientific_name'] == 'Escherichia coli']
def search_ybey_nuccore(species):
    """Return 1 when `table` has a row whose 'scientific_name' equals `species`, else 0.

    Reads the module-level `table` loaded above; the comparison is an exact,
    case-sensitive equality.

    NOTE(review): this reuses the name of the Entrez-based search_ybey_nuccore
    from an earlier cell and replaces it in the kernel once this cell has run.
    """
    name_matches = table['scientific_name'] == species
    return int(name_matches.any())

search_ybey_nuccore('Homo Sapiens')

In [142]:
import pandas as pd
import numpy as np
from Bio import SeqIO

fast = SeqIO.parse('s11.fasta', 'fasta')
# table = pd.read_html('s11.xml')
# print(len(table))
# table = table.drop_duplicates(ignore_index = True)
# table.to_csv('info_table.csv')
# table

# One row per FASTA record: its description, the lower-cased sequence and its id.
fast_rows = [(record.description, str(record.seq).lower(), record.id) for record in fast]
fast_dict = pd.DataFrame(fast_rows, columns = ['description', 'sequence', 'id'])
fast_dict.to_csv('s11.csv', index = False)

# Keep only the records whose description mentions the species after "OS=".
species = "Escherichia coli"
species_rows = fast_dict['description'].str.contains(f'OS={species}')
fast_dict = fast_dict.loc[species_rows].reset_index(drop = True)
# fast_dict = fast_dict.loc[fast_dict['sequence'].str.contains('ng')]
# for index in fast_dict['sequence']:
    # print(index)
# domain = 'bacter'
# plot = tabls.plot.hexbin(x = 's11_group', y = 'ybey_group', gridsize = (2, 1), title = f'distribution for {domain}', sharex = False, xticks = (0, 1), yticks = (0, 1) )


In [160]:
import pandas as pd

# taxon = pd.read_csv('taxid.tab', sep = '\t')
# taxon
# taxon = taxon.loc[taxon['Scientific name'].str.contains('Bacillus velezensis')].reset_index(drop = True)
# taxon = taxon['Lineage'].reset_index(drop = True)
# taxon.iloc[0]["Taxon"]
# pd.index(taxon[0])
# taxon[('Reviewed','Lineage')]
# taxon
# print(taxon.iloc[0])
ybey = pd.read_csv('Ybey.tab', sep = '\t')
Y2 = ybey.loc[ybey['Organism'] == 'Escherichia coli']
len(Y2)

def search_ybey(species):
    """Return 1 when any 'Organism' entry of `ybey` contains `species`, else 0.

    `species` is matched as plain text (regex = False), so names containing
    characters such as '[', '(' or '.' are compared literally instead of being
    interpreted as a regular expression. Rows with a missing organism count as
    "no match" (na = False) rather than producing a NaN mask.
    """
    organism_hits = ybey['Organism'].str.contains(species, regex = False, na = False)
    return 1 if organism_hits.any() else 0
search_ybey('Escherichia coli') 

1

In [140]:
import re

q = 'candidatus'
# Remove square brackets from the name. The pattern is a raw string so that
# \[ and \] reach the regex engine as written; in a normal string literal they
# are invalid escape sequences and trigger a warning.
q = re.sub(r'\[|\]','', q)
q

'candidatus'

In [4]:
# Scratch check: range(2, 4) yields 2 and 3 (the stop value is excluded).
# `a` is assigned but not used in this cell.
a = 0
for i in range(2,4):
    print(i)
    

2
3


In [2]:
import pandas as pd

# Load the taxonomy XML file into a DataFrame and write it back out as CSV
# without the index column.
species = pd.read_xml('../DataTreatment/taxonomy_result.xml')
species.to_csv('taxonid.csv', index = False)