# This notebook shows how to load and format a UniProt annotation file

### Instructions on how to download a UniProt annotation file

1. Go to the UniProt website (https://www.uniprot.org/uniprot/), select the organism of interest in the "Popular organisms" section and click on it.
2. Click the "Download" button and select "Text" format.
3. Select the "Compressed" radio button and click "Go".
4. Unzip the downloaded file and specify the path to this file.

In [1]:
from alphamap.uniprot_integration import preprocess_uniprot

### Import and process the UniProt annotation file

In [2]:
# Location of the unzipped UniProt text download for human.
path_downloaded_uniprot = (
    '../UniprotAnnotations/uniprot_human_111120.txt'
)

In [3]:
# Run alphamap's preprocess_uniprot on the downloaded annotation file.
uniprot_df = preprocess_uniprot(
    path_downloaded_uniprot,
)

In [4]:
# Summarize the parsed table: column names, non-null counts, dtypes and memory use.
uniprot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319984 entries, 0 to 1319983
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype   
---  ------      --------------    -----   
 0   protein_id  1319984 non-null  object  
 1   feature     1319984 non-null  category
 2   isoform_id  1319984 non-null  object  
 3   start       1319984 non-null  float64 
 4   end         908352 non-null   float64 
 5   note        1319984 non-null  object  
dtypes: category(1), float64(2), object(3)
memory usage: 51.6+ MB


In [5]:
# Preview the first five rows of the parsed annotation table.
uniprot_df.head(5)

Unnamed: 0,protein_id,feature,isoform_id,start,end,note
0,Q8N7X0,CHAIN,,1.0,1667.0,Androglobin
1,Q8N7X0,DOMAIN,,70.0,411.0,Calpain catalytic
2,Q8N7X0,DOMAIN,,906.0,935.0,IQ
3,Q8N7X0,COILED,,1588.0,1629.0,
4,Q8N7X0,VAR_SEQ,,1.0,951.0,Missing (in isoform 2)


### Save the preprocessed UniProt data to the desired location

In [6]:
# Write the human annotation table to CSV without the row index column.
uniprot_df.to_csv(
    'data/preprocessed_uniprot_human.csv',
    index=False,
)

# Process mouse annotation

In [7]:
# Run preprocess_uniprot on the downloaded mouse UniProt text file.
mouse_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot_mouse_091120.txt'
)

In [8]:
# Write the mouse annotation table to CSV without the row index column.
mouse_uniprot_df.to_csv(
    'data/preprocessed_uniprot_mouse.csv',
    index=False,
)

# Process arabidopsis annotation

In [9]:
# Run preprocess_uniprot on the downloaded Arabidopsis thaliana UniProt text file.
arabidopsis_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot_arabidopsis_thaliana_131120.txt'
)

In [10]:
# Write the Arabidopsis annotation table to CSV without the row index column.
arabidopsis_uniprot_df.to_csv(
    'data/preprocessed_uniprot_arabidopsis.csv',
    index=False,
)

# Process zebrafish

In [11]:
# Run preprocess_uniprot on the downloaded zebrafish (Danio rerio) UniProt text file.
zebrafish_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-filtered-organism__Danio+rerio+(Zebrafish)+(Brachydanio+rerio%--.txt'
)

In [12]:
# Write the zebrafish annotation table to CSV without the row index column.
zebrafish_uniprot_df.to_csv(
    'data/preprocessed_uniprot_zebrafish.csv',
    index=False,
)

# Process bovine

In [13]:
# Run preprocess_uniprot on the downloaded bovine UniProt text file.
bovine_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Bovine+[9913]_.txt'
)

In [14]:
# Write the bovine annotation table to CSV without the row index column.
bovine_uniprot_df.to_csv(
    'data/preprocessed_uniprot_bovine.csv',
    index=False,
)

# Process rice

In [15]:
# Run preprocess_uniprot on the downloaded rice (Oryza sativa subsp. japonica) UniProt text file.
rice_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-filtered-organism__Oryza+sativa+subsp.+japonica+(Rice)+[39947%--.txt'
)

In [16]:
# Write the rice annotation table to CSV without the row index column.
rice_uniprot_df.to_csv(
    'data/preprocessed_uniprot_rice.csv',
    index=False,
)

# Process rat

In [17]:
# Run preprocess_uniprot on the downloaded rat (Rattus norvegicus) UniProt text file.
rat_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Rattus+norvegicus+(Rat)+[10116]_.txt'
)

In [18]:
# Write the rat annotation table to CSV without the row index column.
rat_uniprot_df.to_csv(
    'data/preprocessed_uniprot_rat.csv',
    index=False,
)

# Process drosophila

In [19]:
# Run preprocess_uniprot on the downloaded Drosophila melanogaster UniProt text file.
drosophila_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Drosophila+melanogaster+[7227]_.txt'
)

In [20]:
# Write the Drosophila annotation table to CSV without the row index column.
drosophila_uniprot_df.to_csv(
    'data/preprocessed_uniprot_drosophila.csv',
    index=False,
)

# Process C. elegans

In [21]:
# Run preprocess_uniprot on the downloaded Caenorhabditis elegans UniProt text file.
celegans_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Caenorhabditis+elegans+[6239]_.txt'
)

In [22]:
# Write the C. elegans annotation table to CSV without the row index column.
celegans_uniprot_df.to_csv(
    'data/preprocessed_uniprot_celegans.csv',
    index=False,
)

# Process slime mold

In [23]:
# Run preprocess_uniprot on the downloaded Dictyostelium discoideum UniProt text file.
slimemold_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Dictyostelium+discoideum+[44689]_.txt'
)

In [24]:
# Write the slime mold annotation table to CSV without the row index column.
slimemold_uniprot_df.to_csv(
    'data/preprocessed_uniprot_slimemold.csv',
    index=False,
)

# Process E. coli

In [25]:
# Run preprocess_uniprot on the downloaded E. coli UniProt text file.
# NOTE(review): this cell previously loaded the Dictyostelium discoideum
# [44689] file (the same path as the slime mold cell), so the "ecoli" CSV
# would have contained slime mold annotations. The filename below follows the
# naming pattern of the other downloads, using E. coli strain K12 [83333] --
# confirm it matches the file actually present in ../UniprotAnnotations.
ecoli_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Escherichia+coli+(strain+K12)+[83333]_.txt'
)

In [26]:
# Write the table held in ecoli_uniprot_df to CSV without the row index column.
ecoli_uniprot_df.to_csv(
    'data/preprocessed_uniprot_ecoli.csv',
    index=False,
)

# Process B. subtilis

In [27]:
# Run preprocess_uniprot on the downloaded Bacillus subtilis (strain 168) UniProt text file.
bsubtilis_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Bacillus+subtilis+(strain+168)+[224308]_.txt'
)

In [28]:
# Write the B. subtilis annotation table to CSV without the row index column.
bsubtilis_uniprot_df.to_csv(
    'data/preprocessed_uniprot_bsubtilis.csv',
    index=False,
)

# Process yeast

In [29]:
# Run preprocess_uniprot on the downloaded Saccharomyces cerevisiae UniProt text file.
yeast_uniprot_df = preprocess_uniprot(
    '../UniprotAnnotations/uniprot-organism__Saccharomyces+cerevisiae+(strain+ATCC+204508+_+S288c%2--.txt'
)

In [30]:
# Write the yeast annotation table to CSV without the row index column.
yeast_uniprot_df.to_csv(
    'data/preprocessed_uniprot_yeast.csv',
    index=False,
)