# Construct pan-genomes
This notebook contains commands for constructing the various pan-genomes.

In [1]:
# Root directory of the raw FASTQ inputs; the sampling cells read
# <ecotype>/<run_id>_1.fastq.gz and <ecotype>/<run_id>_2.fastq.gz under this location.
raw_reads_dir = "../data/raw_fastq"

In [2]:
# Ecotype name -> sequencing run ID (presumably ENA/SRA run accessions).
# The run ID is the stem of the FASTQ file names (<run_id>_1.fastq.gz / <run_id>_2.fastq.gz);
# the ecotype name is the sub-directory the files live in.
# Insertion order is the order in which the sampling loops process the ecotypes.
run_ids = dict(
    [
        ("Eri", "ERR3624573"),
        ("Kyo", "ERR3624576"),
        ("Cvi-0", "ERR3624578"),
        ("Ler", "ERR3624574"),
        ("Sha", "ERR3624575"),
        ("C24", "ERR3624577"),
        ("An-1", "ERR3624579"),
    ]
)

In [3]:
read_len = 101  # length of one read of a pair (bp)
genome_size = 120000000  # genome size (bp) assumed for the coverage calculation


def reads_for_cov(cov):
    """
    Return the number of read pairs needed to reach a coverage of `cov`.

    A read pair contributes 2 * read_len bases, so the result is
    cov * genome_size / (2 * read_len), truncated to an integer.
    """
    bases_required = cov * genome_size
    bases_per_pair = read_len * 2
    return int(bases_required / bases_per_pair)

## x50
This is the basic pan-genome of 7 *A. thaliana* ecotypes plus the reference; all are LQ samples, with reads subsampled to x50 sequencing depth.

In [4]:
k = reads_for_cov(50)
x50_dir = "../output/A_thaliana_pan_genome/de_novo/x50/RESULT/per_sample"
seed = 100
for ecotype, run_id in run_ids.items():
    print(ecotype)
    out_dir = "%s/%s/data" %(x50_dir, ecotype)
    ! mkdir -p $out_dir
    in_fastq1 = "../data/raw_fastq/%s/%s_1.fastq.gz" %(ecotype, run_id)
    in_fastq2 = "../data/raw_fastq/%s/%s_2.fastq.gz" %(ecotype, run_id)
    out_fastq1 = "%s/%s_1.fastq" %(out_dir,run_id)
    out_fastq2 = "%s/%s_2.fastq" %(out_dir,run_id)
    ! seqtk  sample -s$seed $in_fastq1 $k > $out_fastq1
    ! gzip $out_fastq1
    ! seqtk sample -s$seed $in_fastq2 $k > $out_fastq2
    ! gzip $out_fastq2

Eri
Kyo
Cvi-0
Ler
Sha
C24
An-1


## x10
Same as the x50 set, except that reads are subsampled to x10 sequencing depth.

In [4]:
k = reads_for_cov(10)
x10_dir = "../output/A_thaliana_pan_genome/de_novo/x10/RESULT/per_sample"
seed = 100
for ecotype, run_id in run_ids.items():
    print(ecotype)
    out_dir = "%s/%s/data" %(x10_dir, ecotype)
    ! mkdir -p $out_dir
    in_fastq1 = "../data/raw_fastq/%s/%s_1.fastq.gz" %(ecotype, run_id)
    in_fastq2 = "../data/raw_fastq/%s/%s_2.fastq.gz" %(ecotype, run_id)
    out_fastq1 = "%s/%s_1.fastq" %(out_dir,run_id)
    out_fastq2 = "%s/%s_2.fastq" %(out_dir,run_id)
    ! seqtk sample -s$seed $in_fastq1 $k > $out_fastq1
    ! gzip $out_fastq1
    ! seqtk sample -s$seed $in_fastq2 $k > $out_fastq2
    ! gzip $out_fastq2

Eri
Kyo
Cvi-0
Ler
Sha
C24
An-1


## x20

In [4]:
k = reads_for_cov(20)
x20_dir = "../output/A_thaliana_pan_genome/de_novo/x20/RESULT/per_sample"
seed = 100
for ecotype, run_id in run_ids.items():
    print(ecotype)
    out_dir = "%s/%s/data" %(x20_dir, ecotype)
    ! mkdir -p $out_dir
    in_fastq1 = "../data/raw_fastq/%s/%s_1.fastq.gz" %(ecotype, run_id)
    in_fastq2 = "../data/raw_fastq/%s/%s_2.fastq.gz" %(ecotype, run_id)
    out_fastq1 = "%s/%s_1.fastq" %(out_dir,run_id)
    out_fastq2 = "%s/%s_2.fastq" %(out_dir,run_id)
    ! seqtk sample -s$seed $in_fastq1 $k > $out_fastq1
    ! gzip $out_fastq1
    ! seqtk sample -s$seed $in_fastq2 $k > $out_fastq2
    ! gzip $out_fastq2

Eri
Kyo
Cvi-0
Ler
Sha
C24
An-1


In [4]:
k = reads_for_cov(30)
x30_dir = "../output/A_thaliana_pan_genome/de_novo/x30/RESULT/per_sample"
seed = 100
for ecotype, run_id in run_ids.items():
    print(ecotype)
    out_dir = "%s/%s/data" %(x30_dir, ecotype)
    ! mkdir -p $out_dir
    in_fastq1 = "../data/raw_fastq/%s/%s_1.fastq.gz" %(ecotype, run_id)
    in_fastq2 = "../data/raw_fastq/%s/%s_2.fastq.gz" %(ecotype, run_id)
    out_fastq1 = "%s/%s_1.fastq" %(out_dir,run_id)
    out_fastq2 = "%s/%s_2.fastq" %(out_dir,run_id)
    ! seqtk sample -s$seed $in_fastq1 $k > $out_fastq1
    ! gzip $out_fastq1
    ! seqtk sample -s$seed $in_fastq2 $k > $out_fastq2
    ! gzip $out_fastq2

Eri
Kyo
Cvi-0
Ler
Sha
C24
An-1
