# WGS Pipeline

In [1]:
import os
import argparse
import collections
import pandas as pd
import subprocess
from Bio import SeqIO

from itertools import repeat
from multiprocessing import Pool, freeze_support

In [None]:
conda install -c bioconda fastp unicycler quast bandage prokka diamond

## QC

### fastP

https://github.com/OpenGene/fastp

In [6]:
## Fastp paired-end trimming
def RunFastp(R1, R2, prefix, OutDir, threads):
    """Run fastp on one paired-end sample.

    All outputs are written to ``<OutDir>/fastp`` (created if missing):
    ``<prefix>_R1.fastq``, ``<prefix>_R2.fastq``, ``<prefix>.html`` and
    ``<prefix>.json``.

    Parameters
    ----------
    R1, R2 : str
        Paths to the forward and reverse read files.
    prefix : str
        Sample name used to build the output file names and report title.
    OutDir : str
        Pipeline result directory; a ``fastp`` sub-directory is created in it.
    threads : int
        Value passed to fastp's ``--thread`` option.

    Returns
    -------
    int
        Exit status of the fastp process (0 means success).

    Raises
    ------
    FileNotFoundError
        If the ``fastp`` executable cannot be found on ``PATH``.
    """
    fastpDir = os.path.join(OutDir, "fastp")
    # makedirs creates OutDir as well and tolerates already-existing directories.
    os.makedirs(fastpDir, 0o777, True)
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters reach fastp unchanged.
    cmd = [
        "fastp",
        "--in1", R1,
        "--in2", R2,
        "--out1", os.path.join(fastpDir, prefix + "_R1.fastq"),
        "--out2", os.path.join(fastpDir, prefix + "_R2.fastq"),
        "--thread", str(threads),
        "--html", os.path.join(fastpDir, prefix + ".html"),
        "--json", os.path.join(fastpDir, prefix + ".json"),
        "--report_title", prefix + "-fastq-merge-report",
    ]
    return subprocess.call(cmd)
## Run fastp in parallel
def RunFastpParallel(R1List, R2List, prefixList, OutDir, threads, jobs):
    """Run RunFastp on several paired-end samples concurrently.

    Parameters
    ----------
    R1List, R2List, prefixList : sequence of str
        Per-sample forward reads, reverse reads and sample prefixes; element
        ``i`` of each sequence describes the same sample.
    OutDir : str
        Result directory shared by every sample.
    threads : int
        Thread count handed to each individual fastp run.
    jobs : int
        Number of worker processes, i.e. samples processed at the same time.

    Returns
    -------
    list
        The value returned by RunFastp for each sample, in input order.

    Raises
    ------
    ValueError
        If the three per-sample sequences differ in length.
    """
    R1List, R2List, prefixList = list(R1List), list(R2List), list(prefixList)
    if not (len(R1List) == len(R2List) == len(prefixList)):
        # zip() would silently drop the unmatched samples.
        raise ValueError("R1List, R2List and prefixList must have the same length")
    # Argument order must match RunFastp(R1, R2, prefix, OutDir, threads).
    tasks = zip(R1List, R2List, prefixList, repeat(OutDir), repeat(threads))
    # starmap blocks until every sample is done; leaving the with-block then
    # shuts the worker processes down.
    with Pool(processes=jobs) as pool:
        return pool.starmap(RunFastp, tasks)

#### test

```shell
fastp -i /mnt/d/Lab/WGS-Pipeline/testdata/ERR044595_1M_1.fastq.gz -I /mnt/d/Lab/WGS-Pipeline/testdata/ERR044595_1M_2.fastq.gz -o /mnt/d/Lab/WGS-Pipeline/result/fastp/out.R1.fq.gz -O /mnt/d/Lab/WGS-Pipeline/result/fastp/out.R2.fq.gz
```

In [10]:
# Paired-end test sample: the two mate files share the sample prefix and
# differ only in the trailing _1 / _2 mate number.
prefix = "ERR044595_1M"
OutDir = "/mnt/d/Lab/WGS-Pipeline/result"
R1, R2 = (
    "/mnt/d/Lab/WGS-Pipeline/testdata/" + prefix + "_" + mate + ".fastq.gz"
    for mate in ("1", "2")
)

In [8]:
# RunFastp requires a thread count; use 8, the same value as the Unicycler test run.
RunFastp(R1, R2, prefix, OutDir, 8)

if `-a` add adapter seq

rename `--html` and `--json` output

add `--report_title`

### skewer

https://github.com/relipmoc/skewer

## Assembly

### Unicycler

https://github.com/rrwick/Unicycler

```shell
unicycler -1 $result/trimmomatic/ERR044595_1M_1.paired.fastq -2 $result/trimmomatic/ERR044595_1M_2.paired.fastq -o $result/unicycler
```

In [9]:
def RunUnicycler(R1, R2, prefix, OutDir, threads):
    """Assemble one paired-end sample with Unicycler.

    The assembly is written to ``<OutDir>/unicycler/<prefix>``; the
    ``<OutDir>/unicycler`` directory is created if it does not exist.

    Parameters
    ----------
    R1, R2 : str
        Paths to the forward and reverse read files.
    prefix : str
        Sample name, used as the name of the per-sample output directory.
    OutDir : str
        Pipeline result directory.
    threads : int
        Value passed to Unicycler's ``--threads`` option.

    Returns
    -------
    int
        Exit status of the unicycler process (0 means success).

    Raises
    ------
    FileNotFoundError
        If the ``unicycler`` executable cannot be found on ``PATH``.
    """
    unicyclerDir = os.path.join(OutDir, "unicycler")
    # exist_ok=True makes a separate existence check unnecessary.
    os.makedirs(unicyclerDir, 0o777, True)
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters reach unicycler unchanged.
    cmd = [
        "unicycler",
        "-1", R1,
        "-2", R2,
        "-o", os.path.join(unicyclerDir, prefix),
        "--threads", str(threads),
    ]
    return subprocess.call(cmd)
def RunUnicyclerParallel(R1List, R2List, prefixList, OutDir, threads, jobs):
    """Run RunUnicycler on several paired-end samples concurrently.

    Parameters
    ----------
    R1List, R2List, prefixList : sequence of str
        Per-sample forward reads, reverse reads and sample prefixes; element
        ``i`` of each sequence describes the same sample.
    OutDir : str
        Result directory shared by every sample.
    threads : int
        Thread count handed to each individual Unicycler run.
    jobs : int
        Number of worker processes, i.e. samples assembled at the same time.

    Returns
    -------
    list
        The value returned by RunUnicycler for each sample, in input order.

    Raises
    ------
    ValueError
        If the three per-sample sequences differ in length.
    """
    R1List, R2List, prefixList = list(R1List), list(R2List), list(prefixList)
    if not (len(R1List) == len(R2List) == len(prefixList)):
        # zip() would silently drop the unmatched samples.
        raise ValueError("R1List, R2List and prefixList must have the same length")
    # Argument order must match RunUnicycler(R1, R2, prefix, OutDir, threads).
    tasks = zip(R1List, R2List, prefixList, repeat(OutDir), repeat(threads))
    # starmap blocks until every sample is done; leaving the with-block then
    # shuts the worker processes down.
    with Pool(processes=jobs) as pool:
        return pool.starmap(RunUnicycler, tasks)

In [12]:
# Assemble the test sample with 8 threads.
RunUnicycler(R1=R1, R2=R2, prefix=prefix, OutDir=OutDir, threads=8)

### Quast

https://github.com/ablab/quast

### kraken2

https://github.com/DerrickWood/kraken2/wiki

#### kraken2 database

## Annotation

### Prokka

https://github.com/tseemann/prokka

### ABRicate

https://github.com/tseemann/abricate

### antiSMASH

BGCs

### Pathway

#### kofam_scan

https://github.com/takaram/kofam_scan

## Request

特定功能基因: 
- SCFA（刘红宾）   
- 胆汁酸（周春花）   
- BGC（唐啸宇/司同）      
- VFDB   

Phage-Host

噬菌谱？宿主谱

Phage Seq Mapping Metagenomics data

Ma Lab Metagenome sample

which dataset?

## Output

- FastQC
- assembly
- quast
- annotation