# Query to get intergenic regions of a TARA sample in the OcéanIA Platform

### 1. Install oceania-query-fasta package

In [1]:
!pip install oceania-query-fasta
!pip install openpyxl # required to load supplementary info into pandas



### 2. Load supplementary information for the samples and data of the Ocean Microbial Reference Gene Catalog v2 (OM-RGC.v2)

In [1]:
#@title Double click to see the cell of the Python program

import pandas

SUPP_INFO_LINK = "https://zenodo.org/record/3539258/files/Salazar_et_al_2019_Suppl_Info.xlsx?download=1"

# Read the "Table_W1" sheet of the supplementary workbook into a DataFrame.
table_W1 = pandas.read_excel(SUPP_INFO_LINK, sheet_name="Table_W1")

# Keep only the first row whose Layer is "SRF" (surface water layer).
selected_sample = table_W1.loc[table_W1["Layer"] == "SRF"].iloc[:1]
print(selected_sample)

# The selection holds a single row, so .item() yields its PANGAEA sample id
# as a plain scalar (it raises ValueError if no row matched).
sample_id = selected_sample["PANGAEA sample id"].item()

  PANGAEA sample id BioSamples_ID     ENA_ID ENA_Run_ID MetaG/MetaT  Station  \
0   TARA_Y100000004  SAMEA2619888  ERS488658  ERR594328       MetaG       34   

  Layer Size_fraction         Size_fraction_name  \
0   SRF      0.1-0.22  Girus/Prokaryote enriched   

  Used_in_OM-RGC.v1 (Sunagawa_et_al_2015)  Used_for_OM-RGC.v2 (current work)  \
0                       Used_in_OM-RGC.v1  Used_for_OM-RGC.v2 (current work)   

       Used_for_profiling (current work)      Polar  \
0  Not_used_for_profiling (current work)  Non polar   

  Sample ID (registered at the BioSamples ...)  \
0                                 SAMEA2619888   

  Sample ID (registered at the European Nu...)             Date/Time  \
0                                    ERS488658  2010-01-20T04:27:00Z   

   Latitude  Longitude Depth, nominal                  OS region  
0   18.3967     39.875              5  [RS] Red Sea (MRGID:4264)  


  warn(msg)


### 3. Get metadata for the first 10 intergenic regions with a minimum length of 100 (`min_length=100`) for the selected sample

In [3]:
#@title Double click to see the cell of the Python program

from oceania import list_intergenic_regions

# Request the first page (10 entries per page) of intergenic-region metadata
# for the selected sample, filtered with min_length=100.
intergenic_regions_metadata = list_intergenic_regions(
    sample_id,
    min_length=100,
    page=1,
    page_size=10,
)

print(intergenic_regions_metadata)

             original_sequence_id  \
0   TARA_R110002003_G_scaffold3_1   
1   TARA_R110002003_G_scaffold3_1   
2   TARA_R110002003_G_scaffold3_1   
3   TARA_R110002003_G_scaffold3_1   
4   TARA_R110002003_G_scaffold3_1   
5   TARA_R110002003_G_scaffold3_1   
6   TARA_R110002003_G_scaffold3_1   
7   TARA_R110002003_G_scaffold3_3   
8   TARA_R110002003_G_scaffold3_3   
9   TARA_R110002003_G_scaffold3_3   
10  TARA_R110002003_G_scaffold3_3   
11  TARA_R110002003_G_scaffold3_4   
12  TARA_R110002003_G_scaffold3_4   
13  TARA_R110002003_G_scaffold3_4   
14  TARA_R110002003_G_scaffold3_4   
15  TARA_R110002003_G_scaffold3_4   
16  TARA_R110002003_G_scaffold3_4   
17  TARA_R110002003_G_scaffold3_4   
18  TARA_R110002003_G_scaffold3_4   
19  TARA_R110002003_G_scaffold3_4   
20  TARA_R110002003_G_scaffold3_4   
21  TARA_R110002003_G_scaffold3_4   
22  TARA_R110002003_G_scaffold3_4   
23  TARA_R110002003_G_scaffold3_4   
24  TARA_R110002003_G_scaffold3_4   
25  TARA_R110002003_G_scaffold3_4   
2

### 4. Prepare the request parameters

In [5]:
#@title Double click to see the cell of the Python program

# Work on a copy of just the three columns needed to address each region.
# NOTE(review): the metadata printed in this notebook shows a column named
# `original_sequence_id`; confirm that 'sequence' is also a returned column.
request_regions = intergenic_regions_metadata[['sequence', 'start', 'stop']].copy()

# One (sequence id, start, stop) tuple per region; the coordinates are
# converted to built-in ints.
request_params = [
    (sequence_id, int(region_start), int(region_stop))
    for sequence_id, region_start, region_stop
    in request_regions.itertuples(index=False, name=None)
]

print(request_params)

data/raw/tara/OM-RGC_v2/assemblies/TARA_R110002003.scaftig.gz
[('TARA_R110002003_G_scaffold3_1', 3290, 6293), ('TARA_R110002003_G_scaffold3_3', 0, 327), ('TARA_R110002003_G_scaffold3_3', 944, 2742), ('TARA_R110002003_G_scaffold3_4', 0, 379), ('TARA_R110002003_G_scaffold3_4', 1530, 1669)]


### 5. Call the OcéanIA services

In [6]:
from oceania import get_sequences_from_fasta

# Fetch the sequences for every (sequence id, start, stop) tuple prepared above.
request_result = get_sequences_from_fasta(sample_id, request_params)

# The result comes back as a pandas.DataFrame.
print(request_result)

[29-06-2021 15:25:34] Sending request for fasta sequences
[29-06-2021 15:25:35] Request accepted
[29-06-2021 15:25:35] Waiting for results...
[29-06-2021 15:25:46] Done. Elapsed time: 11.762923056958243 seconds


Result loaded into a DataFrame
                              id  start   end type  \
0  TARA_R110002003_G_scaffold3_1   3290  6293  raw   
1  TARA_R110002003_G_scaffold3_3      0   327  raw   
2  TARA_R110002003_G_scaffold3_3    944  2742  raw   
3  TARA_R110002003_G_scaffold3_4      0   379  raw   
4  TARA_R110002003_G_scaffold3_4   1530  1669  raw   

                                            sequence  
0  TGATCGGGAGTCCTCCAGGCTTTGGATCGTTTGGGATAGATTTGTT...  
1  TCCCTCTACACAGAGCAAACCTCCCAGGTAAGATCAGCCCGGGCTA...  
2  CAACATCTCCCTCTTCTTTACTTTGAATCTCTCGTCCTTATTTCGT...  
3  TCTCTCAAACAGTTGTTGTGCTCAACTTAGCAATCCATGTATTTGC...  
4  GAGCAATTTGCAGATGGTGGTGTAGTCCTCGAAGTTGGAACAGATG...  
