In [3]:
import pandas as pd

In [5]:
from Bio import SeqIO
import requests
from io import StringIO

email = "your@email.com"
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Nonaktifkan verifikasi SSL
requests.packages.urllib3.disable_warnings()

def get_sequence_info(dna_accession):
    url = f"{base_url}efetch.fcgi?db=nucleotide&id={dna_accession}&rettype=gb"
    response = requests.get(url, verify=False)
    return response.text

def get_sequence(dna_accession):
    url = f"{base_url}efetch.fcgi?db=nucleotide&id={dna_accession}&rettype=fasta"
    response = requests.get(url, verify=False)
    fasta_data = response.text
    fasta_file = StringIO(fasta_data)
    sequences = SeqIO.parse(fasta_file, "fasta")
    for sequence in sequences:
        sequence_info = sequence.seq
        return sequence_info

sequence_info = get_sequence_info('NG_068190')
print(sequence_info)

sequence = get_sequence('NG_068190')
print(sequence)


LOCUS       NG_068190               1194 bp    DNA     linear   CON 21-APR-2020
DEFINITION  Pseudomonas aeruginosa 1967426 blaPDC gene for class C
            beta-lactamase PDC-402, complete CDS.
ACCESSION   NG_068190
VERSION     NG_068190.1
DBLINK      BioProject: PRJNA313047
KEYWORDS    RefSeq.
SOURCE      Pseudomonas aeruginosa
  ORGANISM  Pseudomonas aeruginosa
            Bacteria; Pseudomonadota; Gammaproteobacteria; Pseudomonadales;
            Pseudomonadaceae; Pseudomonas.
REFERENCE   1  (bases 1 to 1194)
  AUTHORS   Le Bras,C., Billet,M., Young,K., Motyl,M., Hawser,S., Charrier,C.
            and Morrissey,I.
  TITLE     Novel AmpC-type beta lactamase
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 1194)
  CONSRTM   NCBI Refseq Project
  TITLE     Direct Submission
  JOURNAL   Submitted (14-APR-2020) National Center for Biotechnology
            Information, NIH, Bethesda, MD 20894, USA
COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
         

In [None]:

# def get_sequence_info(dna_accession):
#     handle = Entrez.efetch(db="nucleotide", id=dna_accession, rettype="gb", retmode="text")
#     record = SeqIO.read(handle, "genbank")
#     handle.close()

#     # Mendapatkan informasi tentang entri
#     sequence_id = record.id
#     description = record.description
#     sequence = record.seq

#     return sequence_id, description, sequence

# # Daftar kode DNA Accession yang ingin Anda cari
# dna_accessions = [
#     "NG_068190.1",
#     "NG_062286.1",
#     "AJ011291.1",
#     "KP096411.1",
#     "FJ971899.1",
#     "FR772051.1",
#     "AB039845.1",
#     "DQ303920.1",
#     "JN836269.1",
#     "L09756.1"
# ]

# for dna_accession in dna_accessions:
#     sequence_id, description, sequence = get_sequence_info(dna_accession)
#     print("Sequence ID:", sequence_id)
#     print("Description:", description)
#     print("Sequence:", sequence)
#     print()

In [11]:
def count_value_frequency(df, column):
    value_counts = df[column].value_counts()
    return value_counts

In [13]:
import pandas as pd
from sklearn.utils import resample

def balance_dataset(df, target_column):
    # Menghitung frekuensi nilai pada kolom target
    value_counts = count_value_frequency(df, target_column)

    # Menentukan jumlah minimum frekuensi nilai
    min_frequency = value_counts.min()

    # Melakukan resampling untuk setiap nilai dalam kolom target
    balanced_data = pd.DataFrame()
    for value in value_counts.index:
        # Mengambil subset data dengan nilai tertentu
        subset = df[df[target_column] == value]
        
        # Melakukan resampling dengan jumlah minimum frekuensi
        subset_resampled = resample(subset, replace=True, n_samples=min_frequency, random_state=42)
        
        # Menggabungkan subset resampled ke dalam dataset seimbang
        balanced_data = pd.concat([balanced_data, subset_resampled])
    
    return balanced_data

In [9]:
import pandas as pd

df = pd.read_csv('aro_categories_index.tsv', delimiter='\t')

# Memisahkan nilai 'Resistance Mechanism' yang lebih dari satu
df['Resistance Mechanism'] = df['Resistance Mechanism'].str.split(';')

# Membuat salinan baris untuk setiap nilai 'Resistance Mechanism' yang terpisah
df = df.explode('Resistance Mechanism')

# Membersihkan spasi di sekitar nilai 'Resistance Mechanism'
df['Resistance Mechanism'] = df['Resistance Mechanism'].str.strip()

# Menampilkan hasil
df


Unnamed: 0,Protein Accession,DNA Accession,AMR Gene Family,Drug Class,Resistance Mechanism
0,AAB60941.1,AF002716.1,Erm 23S ribosomal RNA methyltransferase,lincosamide antibiotic;macrolide antibiotic;st...,antibiotic target alteration
1,AAA03550.1,L06156.2,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation
2,AAC44793.1,U41471.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation
3,CCP42991.1,AL123456.3,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation
4,AAB41701.1,U72743.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation
...,...,...,...,...,...
5097,CAB13166.1,AL009126.1,small multidrug resistance (SMR) antibiotic ef...,aminoglycoside antibiotic;phenicol antibiotic;...,antibiotic efflux
5098,CAB13167.1,AL009126.1,small multidrug resistance (SMR) antibiotic ef...,aminoglycoside antibiotic;phenicol antibiotic;...,antibiotic efflux
5099,AAC75271.1,U00096.3,ATP-binding cassette (ABC) antibiotic efflux pump,peptide antibiotic,antibiotic efflux
5100,ABA70720.1,DQ185144.1,YRC Beta-lactamase,cephalosporin;cephamycin;penam,antibiotic inactivation


In [44]:
df['DNA Accession']

3890    NG_068190.1
4405    NG_062286.1
974      AJ011291.1
1605     KP096411.1
1250     FJ971899.1
           ...     
5008     FR772051.1
4833     AB039845.1
4130     DQ303920.1
4189     JN836269.1
4843       L09756.1
Name: DNA Accession, Length: 696, dtype: object

In [24]:
count_value_frequency(df, 'Resistance Mechanism')

Resistance Mechanism
antibiotic inactivation                              3979
antibiotic target alteration                          552
antibiotic efflux                                     322
antibiotic target protection                          174
antibiotic target replacement                          78
reduced permeability to antibiotic                     28
resistance by absence                                  14
resistance by host-dependent nutrient acquisition       1
Name: count, dtype: int64

In [25]:
res_mech = count_value_frequency(df, 'Resistance Mechanism').index

In [32]:
res_mech

Index(['antibiotic inactivation', 'antibiotic target alteration',
       'antibiotic efflux', 'antibiotic target protection'],
      dtype='object', name='Resistance Mechanism')

In [39]:
df = df[df['Resistance Mechanism'].isin(res_mech)]

In [41]:
df = balance_dataset(df, 'Resistance Mechanism')

In [40]:
df['Resistance Mechanism'].unique()

array(['antibiotic target alteration', 'antibiotic inactivation',
       'antibiotic efflux', 'antibiotic target protection'], dtype=object)

In [42]:
count_value_frequency(df, 'Resistance Mechanism')

Resistance Mechanism
antibiotic inactivation         174
antibiotic target alteration    174
antibiotic efflux               174
antibiotic target protection    174
Name: count, dtype: int64

In [45]:
from Bio import Entrez

In [46]:
df = df.reset_index()

In [47]:
df

Unnamed: 0,index,Protein Accession,DNA Accession,AMR Gene Family,Drug Class,Resistance Mechanism
0,3890,WP_134265631.1,NG_068190.1,PDC beta-lactamase,carbapenem;cephalosporin;monobactam,antibiotic inactivation
1,4405,WP_122630869.1,NG_062286.1,SHV beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation
2,974,CAB36900.1,AJ011291.1,CMY beta-lactamase,cephamycin,antibiotic inactivation
3,1605,AJP67510.1,KP096411.1,GES beta-lactamase,carbapenem;cephalosporin;penam,antibiotic inactivation
4,1250,ACR56321.1,FJ971899.1,CTX-M beta-lactamase,cephalosporin,antibiotic inactivation
...,...,...,...,...,...,...
691,5008,CBY88983.1,FR772051.1,vga-type ABC-F protein,lincosamide antibiotic;pleuromutilin antibioti...,antibiotic target protection
692,4833,BAB82500.1,AB039845.1,tetracycline-resistant ribosomal protection pr...,tetracycline antibiotic,antibiotic target protection
693,4130,ABC17629.1,DQ303920.1,quinolone resistance protein (qnr),fluoroquinolone antibiotic,antibiotic target protection
694,4189,AEU11363.1,JN836269.1,quinolone resistance protein (qnr),fluoroquinolone antibiotic,antibiotic target protection


In [6]:
import pandas as pd

df = pd.read_csv('aro_categories_index(res_manage).csv')
df['Nucleotide(DNA)'] = ""
for i in range(len(df)):
    df['Nucleotide(DNA)'][i] = get_sequence(df['DNA Accession'][i])
    df.to_csv('aro_categories_index(res_manage).csv',index=False)

In [12]:
str(df['Nucleotide(DNA)'][0])

'ATGCGCGATACCAGATTCCCCTGCCCGTGCGGCATCGCCGCTTCCACACTGCTGTTCGCCACCACCCCGGCCATTGCCGGCGAGGCCCCGGCGGATCGCCTGAAGGCACTGGTCGACGCCGCCGTACAACCGGTGATGAAGGCCAATGACATTCCGGGCCTGGCCGTAGCCATCAGCCTGAAAGGAGAACCGCATTACTTCAGCTATGGGCTGGCCTCGAAAGAGGACGGCCGCCGGGTGACGCCGGAGACCCTGTTCGAGATCGGCTCGGTGAGCAAGACCTTCACCGCCACCCTCGCCGGCTATGCCCTGGCCCAGGACAAGATGCGCCTCGACGACCGCGCCAGCCAGCACTGGCCGGCACTGCAGGGCAGCCGCTTCGACGGCATCAGCCTGCTCGACCTCGCGACCTATACCGCCGGCGGCTTGCCGCTGCAGTTCCCCGACTCGGTGCAGAAGGACCAGGCACAGATCCGCGACTACTACCGCCAGTGGCAGCCGACCTATGCGCCGGGCAGCCAGCGCCTCTATTCCAACCCGAGCATCGGCCTGTTCGGCTATCTCGCCGCGCGCAGCCTGGGCCAGCCGTTCGAACGACTCATGGAGCAGCAAGTGTTCCCGGCACTGGGCCTCGAACAGACCCACCTCGACGTGCCCGAGGCGGCGCTGGCGCAGTACGCCCAGGGCTACGGCAAGGACGACCGCCCGCTACGGGTCGGTCCCGGCCCGCTGGATGCCGAAGGCTACGGGGTGAAGACCAGCGCGGCCGACCTGCTGCGCTTCGTCGATGCCAACCTGCATCCGGAGCGCCTGGACAGGCCCTGGGCGCAGGCGCTCGATGCCACCCATCGCGGTTACTACAAGGTCGGCGACATGACCCAGGGCCTGGGCTGGGAAGCCTACGACTGGCCGATCTCCCTGAAGCGCCTGCAGGCCGGCAACTCGACGCCGATGGCGCTGCAACCGCACAGGATCGCCAGGCTGCCCGCGCCACAGGCG

In [63]:
df['Nucleotide(DNA)'] = [get_sequence(df['DNA Accession'][i]) for i in range(len(df))]

In [None]:
df.to_csv('aro_categories_index(res_manage).csv',index=False)

In [5]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

Unnamed: 0,ARO Category,ARO Accession,ARO Name
0,AMR Gene Family,ARO:3004272,16S rRNA methyltransferase (A1408)
1,AMR Gene Family,ARO:3004271,16S rRNA methyltransferase (G1405)
2,AMR Gene Family,ARO:3003666,16s rRNA with mutation conferring resistance t...
3,AMR Gene Family,ARO:3003976,16S rRNA with mutation conferring resistance t...
4,AMR Gene Family,ARO:3003667,16s rRNA with mutation conferring resistance t...
...,...,...,...
538,Resistance Mechanism,ARO:0001002,antibiotic target replacement
539,Resistance Mechanism,ARO:3007115,modification to cell morphology
540,Resistance Mechanism,ARO:3000244,reduced permeability to antibiotic
541,Resistance Mechanism,ARO:3003764,resistance by absence


In [6]:
df = pd.read_csv('aro_index.tsv', delimiter='\t')
df

Unnamed: 0,ARO Accession,CVTERM ID,Model Sequence ID,Model ID,Model Name,ARO Name,Protein Accession,DNA Accession,AMR Gene Family,Drug Class,Resistance Mechanism,CARD Short Name
0,ARO:3005099,43314,6143,3831,23S rRNA (adenine(2058)-N(6))-methyltransferas...,23S rRNA (adenine(2058)-N(6))-methyltransferas...,AAB60941.1,AF002716.1,Erm 23S ribosomal RNA methyltransferase,lincosamide antibiotic;macrolide antibiotic;st...,antibiotic target alteration,Spyo_ErmA_MLSb
1,ARO:3002523,38923,8144,1781,AAC(2')-Ia,AAC(2')-Ia,AAA03550.1,L06156.2,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,AAC(2')-Ia
2,ARO:3002524,38924,85,746,AAC(2')-Ib,AAC(2')-Ib,AAC44793.1,U41471.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,AAC(2')-Ib
3,ARO:3002525,38925,4719,1246,AAC(2')-Ic,AAC(2')-Ic,CCP42991.1,AL123456.3,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,AAC(2')-Ic
4,ARO:3002526,38926,228,1415,AAC(2')-Id,AAC(2')-Id,AAB41701.1,U72743.1,AAC(2'),aminoglycoside antibiotic,antibiotic inactivation,AAC(2')-Id
...,...,...,...,...,...,...,...,...,...,...,...,...
5133,ARO:3003063,39497,489,1826,ykkC,ykkC,CAB13166.1,AL009126.1,small multidrug resistance (SMR) antibiotic ef...,aminoglycoside antibiotic;phenicol antibiotic;...,antibiotic efflux,ykkC
5134,ARO:3003064,39498,536,350,ykkD,ykkD,CAB13167.1,AL009126.1,small multidrug resistance (SMR) antibiotic ef...,aminoglycoside antibiotic;phenicol antibiotic;...,antibiotic efflux,ykkD
5135,ARO:3003952,40722,5261,2424,YojI,YojI,AAC75271.1,U00096.3,ATP-binding cassette (ABC) antibiotic efflux pump,peptide antibiotic,antibiotic efflux,YojI
5136,ARO:3005035,43233,6072,3779,Yrc-1,Yrc-1,ABA70720.1,DQ185144.1,YRC Beta-lactamase,cephalosporin;cephamycin;penam,antibiotic inactivation,Yrc-1


In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df

In [None]:
df = pd.read_csv('aro_categories.tsv', delimiter='\t')
df