In [52]:
import pandas as pd
import numpy as np
from Bio import Entrez
from Bio import SeqIO
import requests
import xml.etree.ElementTree as ET
import subprocess
import re

In [None]:
########################################
# NOMS ET TXID DES BACTERIES D'INTERET #
########################################
# importation du fichier contenant les txid des bactéries d'intérêt (fichier obtenu via https://www.ncbi.nlm.nih.gov/Taxonomy/TaxIdentifier/tax_identifier.cgi)
recup_txid = pd.read_csv('../sources/bacteria_of_interest/bacteria_of_interest_with_txids.txt', sep = '\t')
recup_txid['taxid']

# replacement des taxid non identifiés par np.nan
recup_txid['taxid'] = recup_txid['taxid'].replace({" " : np.nan})

In [54]:
# suppression des taxid non identifiés et des doublons
recup_txid = recup_txid.dropna()
recup_txid.drop_duplicates(subset = ['taxid'], keep = 'first', inplace = True)
recup_txid.info()

df_bacteria_of_interest = pd.DataFrame(data = {'name' : recup_txid['name'],
                                               'taxid' : recup_txid['taxid']})

df_bacteria_of_interest.reset_index(drop = True, inplace = True)


<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 0 to 705
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   code            597 non-null    int64 
 1   |               597 non-null    object
 2   name            597 non-null    object
 3   |.1             597 non-null    object
 4   preferred name  597 non-null    object
 5   |.2             597 non-null    object
 6   taxid           597 non-null    object
dtypes: int64(1), object(6)
memory usage: 37.3+ KB


In [None]:
#exportation des txids dans un fichier qui servira à importer des séquences 16S correspondantes depuis NCBI
txids = df_bacteria_of_interest['taxid'].to_list()

with open("../sources/bacteria_of_interest/txids.txt", "w") as txid_file:
    for txid in txids:
        txid_file.write(txid + '\n')
txid_file.close()

In [None]:
#####################################################
# SEQUENCES REPRÉSENTATIVES DES BACTÉRIES D'INTERÊT #
#####################################################
# Importation de séquences pour les bactéries d'intérêt depuis NCBI, à partir de leur txid

Entrez.email = "marthe.leoz@gmail.com"

for txid in txids[:]:

    # Recherche des séquences d'ARN 16S
    handle = Entrez.esearch(db="nucleotide", term=f"txid{txid}[All Fields] AND 16S[All Fields] AND rRNA[All Fields] AND partial[All Fields] NOT whole[All Fields] AND (\"1300\"[SLEN] : \"1500\"[SLEN]) ", retmax=10)
    record = Entrez.read(handle)
    handle.close()

    # Obtenir les identifiants des séquences
    id_list = record["IdList"]

    # Récupérer les séquences et les enregistrer dans un fichier de type fasta
    handle = Entrez.efetch(db="nucleotide", id=id_list, rettype="fasta", retmode="text")
    sequences = SeqIO.parse(handle, "fasta")

    # Sauvegarder dans un fichier
    filepath = "../sources/bacteria_of_interest/bacteria_of_interest_sequences_max10.txt" 
    with open(filepath, "a") as output_file:
        # on inscrit chaque txid avant la série de séquences correspondantes pour pouvoir les reporter plus tard dans le dataframe
        # le fichier est donc sauvegardé au format txt et non fasta
        output_file.write('TXID : ' + str(txid) + '\n')     
        SeqIO.write(sequences, output_file, "fasta")



Nowadays, the FASTA file format is usually understood not to have any such comments, and most software packages do not allow them. Therefore, the use of comments at the beginning of a FASTA file is now deprecated in Biopython.


(1) Modify your FASTA file to remove such comments at the beginning of the file.

(2) Use SeqIO.parse with the 'fasta-pearson' format instead of 'fasta'. This format is consistent with the FASTA format defined by William Pearson's FASTA aligner software. Thie format allows for comments before the first sequence; lines starting with the ';' character anywhere in the file are also regarded as comment lines and are ignored.

(3) Use the 'fasta-blast' format. This format regards any lines starting with '!', '#', or ';' as comment lines. The 'fasta-blast' format may be safer than the 'fasta-pearson' format, as it explicitly indicates which lines are comments. 


In [None]:
# création d'un df avec identifiants NCBI, txid, et séquences
# lecture du fichier contenant les séquences regroupées par txids
fichier = open('../sources/bacteria_of_interest/bacteria_of_interest_sequences_max10.txt', 'r')
data = fichier.readlines()

#initialisation du dataframe
df_bacteria_of_interest_with_seq = pd.DataFrame(columns = ['txid', 'seq_id', 'dna_seq'])

#initialisation d'une donnée (identifiant et séquence)
txid = ''
dna_seq = ''
seq_id = ''

#récupération d'un identifiant (caractérisé par le caractère '>' en début de ligne)
#et de la séquence correspondante en concaténant les chaînes séparées par des retours à la ligne
n = 0
for ligne in data[:]:
    if ligne[:7] == 'TXID : ':
        new_txid = ligne[7:-1]
    else:
        if ligne[0] == '>' :
            df_bacteria_of_interest_with_seq.loc[n] = [txid, seq_id, dna_seq]
            txid = new_txid
            seq_id = ligne[:-1]
            dna_seq = ''
            n += 1
        else:
            dna_seq = dna_seq + ligne[:-1]
    

#suppression de la première ligne (vide) du dataframe 
df_bacteria_of_interest_with_seq = df_bacteria_of_interest_with_seq[1:].reset_index(drop = True)
# suppression des séquences identiques
df_bacteria_of_interest_with_seq = df_bacteria_of_interest_with_seq.drop_duplicates(subset = 'dna_seq')

#affichage de contrôle
display(df_bacteria_of_interest_with_seq.head())

#exportation en .csv
df_bacteria_of_interest_with_seq.to_csv('../sources/bacteria_of_interest/bacteria_of_interest_sequences_max10.csv', sep = ',')

Unnamed: 0,txid,seq_id,dna_seq
0,202789,>MH645801.1 Actinobaculum massiliense strain F...,CCTTTTTTTGGTTGGGTNCTCGAGTGGCGAACGGTGAGTATTACGT...
1,202789,>MH279688.1 Actinobaculum massiliense strain F...,TTTTTGGTTGGGTGCTNGAGTGGCGAACGGGTGAGTATTACGTGAG...
2,202789,>LT558806.1 Actinobaculum massiliense partial ...,AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACA...
3,202789,>FJ711191.1 Actinobaculum massiliense strain N...,ACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGCCCT...
4,33007,>PQ788148.1 Winkia neuii strain som 201 16S ri...,CGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGGATCCA...


In [None]:
#####################################
# TAXONOMIE DES BACTÉRIES D'INTERÊT #
#####################################
# exportation des txids dupliqués pour récupérer les taxonomies via NCBI
txids = df_bacteria_of_interest_with_seq['txid'].to_list()

with open("../sources/bacteria_of_interest/txids_max10.txt", "w") as txid_file:
    for txid in txids:
        txid_file.write(txid + '\n')
txid_file.close()

In [59]:
df_bacteria_of_interest_with_seq = df_bacteria_of_interest_with_seq.reset_index(drop = True)
df_bacteria_of_interest_with_seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4290 entries, 0 to 4289
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   txid     4290 non-null   object
 1   seq_id   4290 non-null   object
 2   dna_seq  4290 non-null   object
dtypes: object(3)
memory usage: 100.7+ KB


In [60]:
taxonomy_levels = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Fonction de récupération des données de chaque espèce via NCBI
def get_data_from_ncbi(txid):
    Entrez.email = "marthe.leoz@gmail.com"

    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id={txid}&retmode=xml"
    response = requests.get(url)
        
    if response.status_code == 200:
        return response.text
    else:
        print(f"Erreur pour le txid {txid}: {response.status_code}")
    
# Fonction d'extraction des différents niveaux de taxonomie dans les données récupérées
def get_taxo_dict_from_data(data):
    try:
        tree = ET.ElementTree(ET.fromstring(data))
        root = tree.getroot()

        # instanciation du dictionnaire
        taxo_dict = {}
        for rank in taxonomy_levels:
            taxo_dict[rank] = 'undefined'    
        # Extraction de la taxonomie
        for taxon in root.iter('Taxon'):
            name = taxon.find('ScientificName').text
            rank = taxon.find('Rank').text
            taxo_dict[rank] = name
        return taxo_dict
            
    except Exception as e:
        print(f"Erreur lors de l'analyse du XML: {e}")

# Fonction d'ajout des niveaux de taxonomie dans le df
def insert_taxo_in_df(df, txid, taxo_dict):
    # index des lignes concernées
    list_ids = df.loc[df['txid'] == txid].index
    # ajout dans le df
    for k in list_ids:
        for rank in taxonomy_levels:
            df.loc[k, rank] = taxo_dict[rank]

# Récupération et ajout des niveaux de taxonomie pour chaque séquences du df, à partir de leur txid

######### WARNING #####
##### LA RECUPERATION DES TAXO A TENDANCE À PLANTER EN COURS DE ROUTE... 
##### AJOUTER UN TRY OU QQE CHOSE POUR QU'IL PUISSE REPRENDRE OÙ IL EN ÉTAIT?
##### ON PEUT RÉCUPÉRER L'INDEX À PARTIR DUQUEL REPRENDRE AVEC list(txids).index('txid_renvoyé_comme_erreur'),
##### IL FAUT ENSUITE LE PLACER COMME BORNE DE DÉPART DANS 'for k in range (...)'
#######################
txids = df_bacteria_of_interest_with_seq['txid'].unique()
for k in range(len(txids)):
    txid = txids[k]
    data = get_data_from_ncbi(txid)
    taxo_dict = get_taxo_dict_from_data(data)
    insert_taxo_in_df(df_bacteria_of_interest_with_seq, txid, taxo_dict)

In [None]:
# Vérification de la complétude du df
df_bacteria_of_interest_with_seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4290 entries, 0 to 4289
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   txid     4290 non-null   object
 1   seq_id   4290 non-null   object
 2   dna_seq  4290 non-null   object
 3   domain   4290 non-null   object
 4   phylum   4290 non-null   object
 5   class    4290 non-null   object
 6   order    4290 non-null   object
 7   family   4290 non-null   object
 8   genus    4290 non-null   object
 9   species  4290 non-null   object
dtypes: object(10)
memory usage: 335.3+ KB


In [None]:
##################################################
# EXTRACTION DES REGIONS V3V4 DE CHAQUE SÉQUENCE #
##################################################
# le fichier .txt contenant les séquences contient également des txids. On supprime ces txids pour obtenir
# un fichier au format fasta.

with open('../sources/bacteria_of_interest/bacteria_of_interest_max10.fasta', 'a') as fasta_file:
    for seq_id, dna_seq in zip(df_bacteria_of_interest_with_seq['seq_id'], df_bacteria_of_interest_with_seq['dna_seq']):
        fasta_file.write(seq_id + '\n' + dna_seq + '\n')
fasta_file.close()

In [None]:
# Extraction des régions V3V4 des séquences à l'aide de l'outil extract regions 16S
# les résultats sont inscrits dans un fichier .fasta

script = '/home/marthe/Documents/DS/projet/extract_regions_16s/extract_regions'
input_file = '../sources/bacteria_of_interest/bacteria_of_interest_max10.fasta'
output_file = '../sources/bacteria_of_interest/bacteria_of_interest_V3V4_max10.fasta'

subprocess.call([script, '-i', input_file, '-f', '341F', '-r', '806R', '-o', output_file, '-t', '4'])

0

In [None]:
# L'extracteur de régions du gène 16S remplace les bases T par des bases U, ce qui rend les fichiers
# incompatibles avec qiime. On remet donc à nouveau des bases T à la place des bases U

def replace_U_with_T(file_path):
    with open(file_path, "r+") as file:
        #read the file contents
        file_contents = file.read()
        text_pattern = re.compile("U")
        file_contents = text_pattern.sub("T", file_contents)
        file.seek(0)
        file.truncate()
        file.write(file_contents)

replace_U_with_T('../sources/bacteria_of_interest/bacteria_of_interest_V3V4_max10.fasta')

In [None]:
# intégration des restrictions aux régions V3V4 de chaque séquence dans le dataframe
with open('../sources/bacteria_of_interest/bacteria_of_interest_V3V4_max10.fasta', 'r') as V3V4_file:
    lines = V3V4_file.readlines()
    for k in range(df_bacteria_of_interest_with_seq.shape[0]):
        extract_seq = lines[2*k+1]
        if len(extract_seq) > 250:          # on ne conserve que les séquences extraites d'une longueur suffisante
            df_bacteria_of_interest_with_seq.loc[k, 'V3V4'] = lines[2*k+1][:-1]
        else:
            df_bacteria_of_interest_with_seq.loc[k, 'V3V4'] = np.nan

df_bacteria_of_interest_with_seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4290 entries, 0 to 4289
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   txid     4290 non-null   object
 1   seq_id   4290 non-null   object
 2   dna_seq  4290 non-null   object
 3   domain   4290 non-null   object
 4   phylum   4290 non-null   object
 5   class    4290 non-null   object
 6   order    4290 non-null   object
 7   family   4290 non-null   object
 8   genus    4290 non-null   object
 9   species  4290 non-null   object
 10  V3V4     4256 non-null   object
dtypes: object(11)
memory usage: 368.8+ KB


In [None]:
# Suppression des lignes vides (séquences pour lesquelles l'extraction n'a retourné aucun résultat)

def sup_empty_lines(file_path):
    results = ""
    with open(file_path, 'r+') as file:
        lines = file.readlines()
        i = 0
        while i < len(lines):
            if (lines[i] != '\n') & (lines[i+1] != '\n'):   #on teste la ligne de l'identifiant et la ligne de la séquence
                results = results + lines[i] + lines[i+1]
            i += 2
    file.close()
    with open(file_path, 'w') as file:
        file.write(results)
    file.close()



sup_empty_lines('../sources/bacteria_of_interest/bacteria_of_interest_V3V4_max10.fasta')

In [67]:
# suppression des lignes correspondantes dans le df
df_bacteria_of_interest_with_seq.dropna(subset = ['V3V4'], axis = 0, inplace = True)
df_bacteria_of_interest_with_seq.reset_index(drop = True, inplace = True)
df_bacteria_of_interest_with_seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4256 entries, 0 to 4255
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   txid     4256 non-null   object
 1   seq_id   4256 non-null   object
 2   dna_seq  4256 non-null   object
 3   domain   4256 non-null   object
 4   phylum   4256 non-null   object
 5   class    4256 non-null   object
 6   order    4256 non-null   object
 7   family   4256 non-null   object
 8   genus    4256 non-null   object
 9   species  4256 non-null   object
 10  V3V4     4256 non-null   object
dtypes: object(11)
memory usage: 365.9+ KB


In [None]:
# suppression des virgules dans seq_id
pattern = re.compile(r'>[^,]*')
df_bacteria_of_interest_with_seq['seq_id'] = df_bacteria_of_interest_with_seq['seq_id'].apply(lambda x : pattern.search(x).group())

# suppression des doublons
df_bacteria_of_interest_with_seq.drop_duplicates(['dna_seq'], inplace = True)
df_bacteria_of_interest_with_seq.reset_index(drop = True, inplace = True)
df_bacteria_of_interest_with_seq.head()

df_bacteria_of_interest_with_seq.to_csv("../datasets/train_sets/urinary_max10.csv", index = False)


In [69]:
df_bacteria_of_interest_with_seq = pd.read_csv('../datasets/train_sets/urinary_max10.csv')

df_bacteria_of_interest_with_seq.drop_duplicates(['dna_seq'], inplace = True)
df_bacteria_of_interest_with_seq.reset_index(drop = True, inplace = True)
df_bacteria_of_interest_with_seq.info()

# Exportation du dataframe en csv
df_bacteria_of_interest_with_seq.to_csv("../datasets/train_sets/urinary_max10.csv", index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4256 entries, 0 to 4255
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   txid     4256 non-null   int64 
 1   seq_id   4256 non-null   object
 2   dna_seq  4256 non-null   object
 3   domain   4256 non-null   object
 4   phylum   4256 non-null   object
 5   class    4256 non-null   object
 6   order    4256 non-null   object
 7   family   4256 non-null   object
 8   genus    4256 non-null   object
 9   species  4256 non-null   object
 10  V3V4     4256 non-null   object
dtypes: int64(1), object(10)
memory usage: 365.9+ KB


In [70]:
df_bacteria_of_interest_with_seq = pd.read_csv('../datasets/train_sets/urinary_max10.csv')

In [None]:
#################################################
# création de fichiers de format adapté à qiime #
#################################################
# Fichier contenant les taxonomies
urinary_taxo_for_qiime = df_bacteria_of_interest_with_seq[['seq_id']]


feature_id_pattern = re.compile(r'>([\S]*)')      # Pour récupérer les identifiants au format '>xxxxxxxxx.x'
for i in range(urinary_taxo_for_qiime.shape[0]):
    feature_id = feature_id_pattern.findall(urinary_taxo_for_qiime.loc[i, 'seq_id'])
    urinary_taxo_for_qiime.loc[i, 'Feature ID'] = feature_id[0]

    taxons = [df_bacteria_of_interest_with_seq.loc[i, rank] for rank in taxonomy_levels]
    taxon = 'D_0__' + taxons[0] + ';D_1__' + taxons[1] + ';D_2__' + taxons[2] + ';D_3__' + taxons[3] + ';D_4__' + taxons[4] + ';D_5__' + taxons[5] + ';D_6__' + taxons[6] 
    urinary_taxo_for_qiime.loc[i, 'Taxon'] = taxon


urinary_taxo_for_qiime.drop('seq_id', axis = 1, inplace = True)

display(urinary_taxo_for_qiime.head())
urinary_taxo_for_qiime.to_csv('../sources/bacteria_of_interest/qiime_urinary_taxo_max10.tsv', sep = '\t', index = False)


# Fichier contenant les séquences complètes
with open('../sources/bacteria_of_interest/qiime_urinary_seq_max10.fasta', 'a') as fasta_file:
    for i in range(urinary_taxo_for_qiime.shape[0]):
        fasta_file.write('>' + urinary_taxo_for_qiime.loc[i, 'Feature ID'] + '\n' + df_bacteria_of_interest_with_seq.loc[i, 'dna_seq'] + '\n')
fasta_file.close()

# Fichier contenant les séquences restreintes aux régions V3V4
with open('../sources/bacteria_of_interest/qiime_urinary_seq_V3V4_max10.fasta', 'a') as fasta_file:
    for i in range(urinary_taxo_for_qiime.shape[0]):
        fasta_file.write('>' + urinary_taxo_for_qiime.loc[i, 'Feature ID'] + '\n' + df_bacteria_of_interest_with_seq.loc[i, 'V3V4'] + '\n')
fasta_file.close()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urinary_taxo_for_qiime.loc[i, 'Feature ID'] = feature_id[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urinary_taxo_for_qiime.loc[i, 'Taxon'] = taxon
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urinary_taxo_for_qiime.drop('seq_id', axis = 1, inplace = True)


Unnamed: 0,Feature ID,Taxon
0,MH645801.1,D_0__Bacteria;D_1__Actinomycetota;D_2__Actinom...
1,MH279688.1,D_0__Bacteria;D_1__Actinomycetota;D_2__Actinom...
2,LT558806.1,D_0__Bacteria;D_1__Actinomycetota;D_2__Actinom...
3,FJ711191.1,D_0__Bacteria;D_1__Actinomycetota;D_2__Actinom...
4,PQ788148.1,D_0__Bacteria;D_1__Actinomycetota;D_2__Actinom...


In [72]:
df_bacteria_of_interest_with_seq.loc[df_bacteria_of_interest_with_seq['txid'].duplicated() == True]


Unnamed: 0,txid,seq_id,dna_seq,domain,phylum,class,order,family,genus,species,V3V4
1,202789,>MH279688.1 Actinobaculum massiliense strain F...,TTTTTGGTTGGGTGCTNGAGTGGCGAACGGGTGAGTATTACGTGAG...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Actinobaculum,Actinobaculum massiliense,CNTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGGCGCAAGCC...
2,202789,>LT558806.1 Actinobaculum massiliense partial ...,AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACA...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Actinobaculum,Actinobaculum massiliense,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGGCGCAAGCC...
3,202789,>FJ711191.1 Actinobaculum massiliense strain N...,ACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGCCCT...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Actinobaculum,Actinobaculum massiliense,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGGCGCAAGCC...
5,33007,>OR999579.1 Winkia neuii strain CNSY1 16S ribo...,GGCCTGCGGCGTGCTTACCATGCAAGTCGAACGGGATCCATTAGCG...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Winkia,Winkia neuii,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGACGAAAGTC...
6,33007,>OR260435.1 Winkia neuii strain 19 16S ribosom...,AACGGGTGAGTAACACGTGAGTAACCTGCCCTTTTCTTTGGGATAA...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Winkia,Winkia neuii,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGACGNAAGTC...
...,...,...,...,...,...,...,...,...,...,...,...
4251,1886637,>PV739705.1 Delftia sp. strain APQ6-11 16S rib...,CGGTGAACAGGATCTTCGGACGCTGACGAGTGGCGAACGGGTGAGT...,Bacteria,Pseudomonadota,Betaproteobacteria,Burkholderiales,Comamonadaceae,Delftia,Delftia sp.,CCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGAAAGCC...
4252,1886637,>PV579005.1 Delftia sp. strain KRMS06 16S ribo...,GGTAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCT...,Bacteria,Pseudomonadota,Betaproteobacteria,Burkholderiales,Comamonadaceae,Delftia,Delftia sp.,CCTACGGGAGGCGCAGTGGGGAATTTTGGACAATGGGCGAAAGCCT...
4253,1886637,>PV450672.1 Delftia sp. strain KDMSBB08 16S ri...,GAAGCACCAGTACTTACGCGACGCTGGACTGAGGGGCGAACGGGTG...,Bacteria,Pseudomonadota,Betaproteobacteria,Burkholderiales,Comamonadaceae,Delftia,Delftia sp.,CCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATTGGGCGAAAGC...
4254,1886637,>PV382267.1 Delftia sp. strain YC2177 16S ribo...,CGTTGCCGGCATGCTTACACATGCAGTCGAACGGTAACAGGTCTTC...,Bacteria,Pseudomonadota,Betaproteobacteria,Burkholderiales,Comamonadaceae,Delftia,Delftia sp.,CCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGAAAGCC...


In [73]:
# suppression des classes sous représentées
test = df_bacteria_of_interest_with_seq.groupby('txid').count().reset_index()
minors = test.loc[test['seq_id'] < 2]['txid'].values
#print(minors)

filtered_df = df_bacteria_of_interest_with_seq.copy()
filtered_df = filtered_df.loc[filtered_df['txid'].apply(lambda x : x not in minors)]
print('nombre de txid différents : ', len(filtered_df['txid'].unique()))

nombre de txid différents :  527


In [None]:
filtered_df.to_csv('../datasets/train_sets/urinary_max10_min2.csv', index = False)