In [45]:
import pandas as pd
import Bio

# Downloading data

## dbAMP

### Information:
- link: https://awi.cuhk.edu.cn/dbAMP/download2024.php
- last update in 06/2024
- available data: 35600 AMPs
- used data: 23883 AMPs
- formats available: xlsx, fasta

### Downloading:
- download tab was selected
- no filters or queries were applied
- Download All Antimicrobial peptides Sequence Data was selected
- the file was downloaded in xlsx format

### Preprocessing

In [46]:
df_dbamp3 = pd.read_excel('data/dbAMP3.xlsx')
print(df_dbamp3.columns)
print(df_dbamp3.shape)

Index(['dbAMP_ID', 'Name', 'Source', 'Tax', 'Uniprot', 'PDB', 'Targets',
       'Seq'],
      dtype='object')
(35600, 8)


In [47]:
# select columns
df_dbamp3 = df_dbamp3[['Name', 'Seq']]

# add sequence length and Classification
df_dbamp3['Length'] = df_dbamp3['Seq'].apply(len)
df_dbamp3['Classification'] = 1

# rename columns
df_dbamp3.rename(columns={'Seq': 'Sequence'}, inplace=True)

# remove sequences with length < 10 or > 50
df_dbamp3 = df_dbamp3[(df_dbamp3['Length'] >= 10) & (df_dbamp3['Length'] <= 50)]

# display first few rows
df_dbamp3.head()

Unnamed: 0,Name,Sequence,Length,Classification
0,Designed AMP No.1,FAAKHNGKSLFKPQN,15,1
1,Synthetic construct&&4A,AAAAGSVWGAVNYTSDCNGECKRRGYKGGYCGSFANVNCWCET,43,1
2,gag,AAANPGLLETSEGCRQIL,18,1
3,gag,AAAPAATLEEHMTACQGV,18,1
5,"jellyfish, Aurelia aurita&&Aurelin (jellyfish,...",AACSDRAHGHICESFKSFCKDSGRNGVKLRANCKKTCGLC,40,1


In [None]:
# count and remove duplicates
print(df_dbamp3.duplicated().sum())
df_dbamp3.drop_duplicates(inplace=True)

# display final shape
df_dbamp3.shape

42


## DRAMP

### Information:
- link: http://dramp.cpu-bioinfor.org/downloads/
- last update in 09/2024
- available data: 30260 AMPs
- used data: 11612 AMPs
- formats available: xlsx, txt, fasta

### Downloading:
- downloads tab was selected
- no filters or queries were applied
- General dataset was selected
- the file was downloaded in xlsx format 

### Preprocessing

In [50]:
df_dramp = pd.read_excel('data/dramp.xlsx')
print(df_dramp.columns)
print(df_dramp.shape)

Index(['DRAMP_ID', 'Sequence', 'Sequence_Length', 'Name', 'Swiss_Prot_Entry',
       'Family', 'Gene', 'Source', 'Activity', 'Protein_existence',
       'Structure', 'Structure_Description', 'PDB_ID', 'Comments',
       'Target_Organism', 'Hemolytic_activity', 'Linear/Cyclic/Branched',
       'N-terminal_Modification', 'C-terminal_Modification',
       'Other_Modifications', 'Stereochemistry', 'Cytotoxicity',
       'Binding_Traget', 'Pubmed_ID', 'Reference', 'Author', 'Title'],
      dtype='object')
(11612, 27)


In [51]:
# select columns
df_dramp = df_dramp[['Name', 'Sequence', 'Sequence_Length']]

# rename columns
df_dramp.rename(columns={'Sequence_Length': 'Length'}, inplace=True)

# add Classification
df_dramp['Classification'] = 1

# remove sequences with length < 10 or > 50
df_dramp = df_dramp[(df_dramp['Length'] >= 10) & (df_dramp['Length'] <= 50)]

# display first few rows
df_dramp.head()

Unnamed: 0,Name,Sequence,Length,Classification
0,Epicidin 280 (Bacteriocin),SLGPAIKATRQVCPKATRFVTVSCKKSDCQ,30,1
1,Microbisporicin A1 (Bacteriocin),VTSWSLCTPGCTSPGGGSNCSFCC,24,1
2,Ruminococcin A (RumA; Bacteriocin),GNGVLKTISHECNMNTWQFLFTCC,24,1
3,Lantibiotic michiganin-A (Bacteriocin),SSSGWLCTLTIECGTIICACR,21,1
5,Garvieacin Q (GarQ; Bacteriocin),EYHLMNGANGYLTRVNGKTVYRVTKDPVSAVFGVISNCWGSAGAGF...,50,1


In [None]:
# count and remove duplicates
print(df_dramp.duplicated().sum())
df_dramp.drop_duplicates(inplace=True)

# display final shape
df_dramp.shape

385


(8979, 4)

## CAMPR3

### Information:
- link: http://www.camp3.bicnirrh.res.in/index.php
- last update in 06/2024
- available data: 8164 AMPs
- used data: 4075 AMPs
- formats available: -

### Downloading:
- due to the lack of download all option, the data was scraped using scrape.py script and saved into a csv file

### Preprocessing

In [52]:
df_campr3 = pd.read_csv('data/campr3.csv')
print(df_campr3.columns)
print(df_campr3.shape)

Index(['Name', 'Sequence', 'Length'], dtype='object')
(4075, 3)


In [53]:
# add Classification
df_campr3['Classification'] = 1

# remove sequences with length < 10 or > 50
df_campr3 = df_campr3[(df_campr3['Length'] >= 10) & (df_campr3['Length'] <= 50)]

# display first few rows
df_dramp.head()

Unnamed: 0,Name,Sequence,Length,Classification
0,Epicidin 280 (Bacteriocin),SLGPAIKATRQVCPKATRFVTVSCKKSDCQ,30,1
1,Microbisporicin A1 (Bacteriocin),VTSWSLCTPGCTSPGGGSNCSFCC,24,1
2,Ruminococcin A (RumA; Bacteriocin),GNGVLKTISHECNMNTWQFLFTCC,24,1
3,Lantibiotic michiganin-A (Bacteriocin),SSSGWLCTLTIECGTIICACR,21,1
5,Garvieacin Q (GarQ; Bacteriocin),EYHLMNGANGYLTRVNGKTVYRVTKDPVSAVFGVISNCWGSAGAGF...,50,1


## UniProt