# Intergenic query to the OcéanIA Platform

### 1. Install the `oceania-query-fasta` package

In [1]:
!pip install oceania-query-fasta



### 2. List available FASTA samples

In [2]:
#@title Double click to see the cell of the Python program

from oceania import list_fasta_samples

# Fetch the catalogue of available FASTA samples. In the recorded run it has
# two columns, `sample_id` and `sample_key`, and 370 rows. `fasta_samples` is
# reused in step 3 to look up the `sample_key` of the chosen sample.
fasta_samples = list_fasta_samples()

# fasta samples list loaded as a pandas.DataFrame
print(fasta_samples)

           sample_id                                         sample_key
0    TARA_A100000164  data/raw/tara/OM-RGC_v2/assemblies/TARA_A10000...
1    TARA_A100000171  data/raw/tara/OM-RGC_v2/assemblies/TARA_A10000...
2    TARA_A100000172  data/raw/tara/OM-RGC_v2/assemblies/TARA_A10000...
3    TARA_A100001011  data/raw/tara/OM-RGC_v2/assemblies/TARA_A10000...
4    TARA_A100001015  data/raw/tara/OM-RGC_v2/assemblies/TARA_A10000...
..               ...                                                ...
365  TARA_Y100001972  data/raw/tara/OM-RGC_v2/assemblies/TARA_Y10000...
366  TARA_Y100001973  data/raw/tara/OM-RGC_v2/assemblies/TARA_Y10000...
367  TARA_Y100001978  data/raw/tara/OM-RGC_v2/assemblies/TARA_Y10000...
368  TARA_Y100001980  data/raw/tara/OM-RGC_v2/assemblies/TARA_Y10000...
369  TARA_Y200000002  data/raw/tara/OM-RGC_v2/assemblies/TARA_Y20000...

[370 rows x 2 columns]


### 3. Get the list of unidentified sequence gaps in the Ocean Microbial Reference Gene Catalog v2 for a sample

In [3]:
#@title Double click to see the cell of the Python program

from oceania import list_genes_and_gaps

SAMPLE_ID = "TARA_R110002003"

# Look up the storage key of the chosen sample in the `fasta_samples` listing.
# Fail with an explicit message when the id is unknown, instead of the opaque
# IndexError that indexing an empty selection with `.values[0]` would raise.
matching_keys = fasta_samples.loc[fasta_samples.sample_id == SAMPLE_ID, 'sample_key']
if matching_keys.empty:
    raise ValueError(f"Sample id {SAMPLE_ID!r} not found in fasta_samples")
sample_key = matching_keys.iloc[0]

# Genes and gaps are requested by sample id; `sample_key` is kept for the
# sequence request made in a later step.
gaps_df = list_genes_and_gaps(SAMPLE_ID)

# gaps list loaded as a pandas.DataFrame
print(gaps_df)

             original_sequence_id  \
0   TARA_R110002003_G_scaffold3_1   
1   TARA_R110002003_G_scaffold3_1   
2   TARA_R110002003_G_scaffold3_1   
3   TARA_R110002003_G_scaffold3_1   
4   TARA_R110002003_G_scaffold3_1   
5   TARA_R110002003_G_scaffold3_1   
6   TARA_R110002003_G_scaffold3_1   
7   TARA_R110002003_G_scaffold3_3   
8   TARA_R110002003_G_scaffold3_3   
9   TARA_R110002003_G_scaffold3_3   
10  TARA_R110002003_G_scaffold3_3   
11  TARA_R110002003_G_scaffold3_4   
12  TARA_R110002003_G_scaffold3_4   
13  TARA_R110002003_G_scaffold3_4   
14  TARA_R110002003_G_scaffold3_4   
15  TARA_R110002003_G_scaffold3_4   
16  TARA_R110002003_G_scaffold3_4   
17  TARA_R110002003_G_scaffold3_4   
18  TARA_R110002003_G_scaffold3_4   
19  TARA_R110002003_G_scaffold3_4   
20  TARA_R110002003_G_scaffold3_4   
21  TARA_R110002003_G_scaffold3_4   
22  TARA_R110002003_G_scaffold3_4   
23  TARA_R110002003_G_scaffold3_4   
24  TARA_R110002003_G_scaffold3_4   
25  TARA_R110002003_G_scaffold3_4   
2

### 4. Select gaps to extract

In [4]:
#@title Double click to see the cell of the Python program

# Keep only the rows whose id starts with "gap__" and whose length exceeds 100,
# then retain the first five matches.
# engine='python' is used because the filter calls the `.str` accessor.
gaps_filter = 'length > 100 and id.str.startswith("gap__")'
matching_gaps = gaps_df.query(gaps_filter, engine='python')
selected_gaps = matching_gaps.head(5)

print(selected_gaps)

             original_sequence_id  \
5   TARA_R110002003_G_scaffold3_1   
7   TARA_R110002003_G_scaffold3_3   
9   TARA_R110002003_G_scaffold3_3   
11  TARA_R110002003_G_scaffold3_4   
13  TARA_R110002003_G_scaffold3_4   

                                                   id strand  start  stop  \
5   gap__TARA_R110002003_G_scaffold3_1_gene3__TARA...    NaN   3290  6293   
7   gap__TARA_R110002003_G_scaffold3_1_gene4__TARA...    NaN      0   327   
9   gap__TARA_R110002003_G_scaffold3_3_gene5__TARA...    NaN    944  2742   
11  gap__TARA_R110002003_G_scaffold3_3_gene6__TARA...    NaN      0   379   
13  gap__TARA_R110002003_G_scaffold3_4_gene7__TARA...    NaN   1530  1669   

    length start_codon stop_codon gene_type  
5     3003         NaN        NaN       NaN  
7      327         NaN        NaN       NaN  
9     1798         NaN        NaN       NaN  
11     379         NaN        NaN       NaN  
13     139         NaN        NaN       NaN  


### 5. Prepare request params

In [5]:
#@title Double click to see the cell of the Python program

# Reduce the selected gaps to the three fields the request needs, then turn
# each row into a plain (sequence id, start, stop) tuple with int coordinates.
request_gaps = selected_gaps[['original_sequence_id', 'start', 'stop']].copy()
request_params = [
    (sequence_id, int(gap_start), int(gap_stop))
    for sequence_id, gap_start, gap_stop
    in request_gaps.itertuples(index=False, name=None)
]

print(sample_key)
print(request_params)

data/raw/tara/OM-RGC_v2/assemblies/TARA_R110002003.scaftig.gz
[('TARA_R110002003_G_scaffold3_1', 3290, 6293), ('TARA_R110002003_G_scaffold3_3', 0, 327), ('TARA_R110002003_G_scaffold3_3', 944, 2742), ('TARA_R110002003_G_scaffold3_4', 0, 379), ('TARA_R110002003_G_scaffold3_4', 1530, 1669)]


### 6. Call the OcéanIA services and print the results

In [6]:
#@title Double click to see the cell of the Python program

from oceania import get_sequences_from_fasta

# Request the sequences for the selected regions. `sample_key` is the storage
# key of the sample looked up in step 3 and `request_params` is the list of
# (sequence id, start, stop) tuples built in step 5.
# The call logs its progress while waiting for the service; the recorded run
# took roughly 12 seconds.
request_result = get_sequences_from_fasta(
    sample_key,
    request_params
)

# request_result is loaded as a pandas.DataFrame
# (columns in the recorded run: id, start, end, type, sequence)
print(f"Result loaded into a {type(request_result).__name__}")
print(request_result)

[29-06-2021 15:25:34] Sending request for fasta sequences
[29-06-2021 15:25:35] Request accepted
[29-06-2021 15:25:35] Waiting for results...
[29-06-2021 15:25:46] Done. Elapsed time: 11.762923056958243 seconds


Result loaded into a DataFrame
                              id  start   end type  \
0  TARA_R110002003_G_scaffold3_1   3290  6293  raw   
1  TARA_R110002003_G_scaffold3_3      0   327  raw   
2  TARA_R110002003_G_scaffold3_3    944  2742  raw   
3  TARA_R110002003_G_scaffold3_4      0   379  raw   
4  TARA_R110002003_G_scaffold3_4   1530  1669  raw   

                                            sequence  
0  TGATCGGGAGTCCTCCAGGCTTTGGATCGTTTGGGATAGATTTGTT...  
1  TCCCTCTACACAGAGCAAACCTCCCAGGTAAGATCAGCCCGGGCTA...  
2  CAACATCTCCCTCTTCTTTACTTTGAATCTCTCGTCCTTATTTCGT...  
3  TCTCTCAAACAGTTGTTGTGCTCAACTTAGCAATCCATGTATTTGC...  
4  GAGCAATTTGCAGATGGTGGTGTAGTCCTCGAAGTTGGAACAGATG...  
