# Jupyter Notebook GenoRobotics Full Pipeline

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import os.path as ospath
from lib.consensus.consensus import run_consensus
from lib.identification.identification import run_identification
from lib.general_helpers.process_fastq import concatenate_fastq

## Define Your File and Folder Paths

- Modify the "input_fastq_filename" variable to point to the fastq you want to use as input. 
  
- Set the "windows" variable to True if you're on Windows and want to use WSL.

In [None]:
# Settings for picking a fastq that belongs to an expedition.

# Forwarded as the `windows` argument of the consensus and identification steps.
windows = False

# Number the barcode folder's path must end with.
barcode_nb = 9

# Expedition folder, looked up inside the "data" directory.
input_expedition_folder = "summer_expedition"

In [None]:
# For standalone fastq files outside any expedition.
# TODO: replace None with the name of your fastq file (e.g. "my_sample.fastq");
# the original placeholder `?` was not valid Python and made this cell fail.
input_fastq_filename = None
# No expedition is associated with a standalone file.
# NOTE(review): the path-discovery cell joins "data" with this value and will
# fail on None - confirm how standalone files are meant to be located.
input_expedition_folder = None

In [None]:
# Resolve the input files of the selected expedition/barcode.
# Defines: input_folder_path, input_fastq_path, input_fastq_filename,
# input_ref_path, base_name and output_path.
input_expedition_path = ospath.join("data", input_expedition_folder)

# Start from a clean slate: in a notebook, names assigned by a previous run of
# this cell would otherwise survive and be reused silently when nothing matches.
input_folder_path = None
input_fastq_path = None
input_ref_path = None

barcode_suffix = str(barcode_nb)
for root, _, _ in os.walk(input_expedition_path):
    folder_name = ospath.basename(root)
    if not folder_name.endswith(barcode_suffix):
        continue
    # Whatever precedes the number must not end with a digit once zero padding
    # is stripped, so barcode 9 accepts e.g. "barcode9" or "barcode09" but no
    # longer "barcode19".
    prefix = folder_name[:len(folder_name) - len(barcode_suffix)].rstrip("0")
    if prefix[-1:].isdigit():
        continue
    # As before, when several folders match the last one visited wins.
    input_folder_path = root

if input_folder_path is None:
    raise FileNotFoundError(
        f"No folder ending with barcode number {barcode_nb} found under '{input_expedition_path}'"
    )

for current_dir, _, files in os.walk(input_folder_path):
    for file_name in files:
        if file_name.endswith(".fastq"):
            # Join with the directory being walked so that files located in
            # sub-folders get a path that actually exists.
            input_fastq_path = ospath.join(current_dir, file_name)
            input_fastq_filename = file_name
        if file_name.endswith(".fasta"):
            input_ref_path = ospath.join(current_dir, file_name)

if input_fastq_path is None:
    raise FileNotFoundError(f"No .fastq file found in '{input_folder_path}'")

print(input_fastq_path)
print(input_ref_path)  # None when the folder holds no .fasta file
base_name = ospath.splitext(input_fastq_filename)[0]
print(base_name)

output_path = ospath.join("output", base_name)
print(output_path)

## Run Preprocessing (Optional)

- If you want to preprocess your data, run the following cell. Otherwise, skip to the next cell.
- For now, preprocessing consists of concatenating all the fastq files in a folder into one file. This is useful if you have multiple fastq files for one sample. You'll have to change the first parameter of the "concatenate_fastq" function to point to the folder containing your fastq files.

In [None]:
# preprocessing()
#concatenate_fastq(os.path.join("assets", "input", "barcode74"), input_fastq_path)

## Run Consensus Sequence Generation

Select which consensus sequence generation method you want to use by setting the "consensus_method" variable to one of:

- "80_20_best_sequence" (the value set in the cell below)

- "80_20_longest_sequence"

- "straightforward_best_sequence"

In [None]:
# Available consensus methods:
# - "80_20_best_sequence"
# - "80_20_longest_sequence"
# - "straightforward_best_sequence"

consensus_method = "80_20_best_sequence"
# consensus_method = "straightforward_best_sequence"

# Generate the consensus sequence for the fastq resolved earlier in the notebook.
run_consensus(
    input_name=base_name,
    expedition_name=input_expedition_folder,
    input_fastq_path=input_fastq_path,
    consensus_method=consensus_method,
    windows=windows,
)

## Run Identification of Consensus Sequence
- Run the following cell to identify the consensus sequence.
- Change db to the database you want to use. Options are "matK", "rbcL", "psbA-trnH" and "ITS". If you want to use all of them, set db to None.

In [None]:
# Available identification methods:
# - "blastn"
identification_method = "blastn"

# Database matching the gene you are trying to identify: matK, rbcL, psbA-trnH or ITS
db = "rbcL"

# Identify the consensus sequence produced for `base_name`.
run_identification(
    base_name,
    expedition_name=input_expedition_folder,
    db=db,
    identification_method=identification_method,
    windows=windows,
)

print("Pipeline finished !")
print("You can find your results in the 'output/expeditionName/inputName/identification/' folder")