# Vorbereitung

In [1]:
# Setup

import os, sys
import pandas as pd

import qiime2
from tempfile import mkdtemp
from qiime2.plugins import demux, deblur, quality_filter, metadata, feature_table, alignment, phylogeny, diversity, emperor, feature_classifier, taxa, composition

workdir = os.getcwd()

if 'project.ipynb' in os.listdir(workdir):
    datadir = workdir + '/data'
    outputdir = workdir + '/output'
    sequencedir = datadir + '/sequences'

    # Create directories
    !mkdir -p data
    !mkdir -p data/sequences
    !mkdir -p output
    !mkdir -p output/viz
else:
    raise RuntimeError("Working directory not notebook directory. The Jupyter server has to be started in this notebook's directory")

print(os.getcwd())

print(f'Working directory: {workdir}')
print(f'Data directory: {datadir}')

%cd $workdir

/mnt/e/dev/pda
Working directory: /mnt/e/dev/pda
Data directory: /mnt/e/dev/pda/data
/mnt/e/dev/pda


In [2]:
# Rename files
#
# Renames every raw download `<sample_name>.fastq.gz` in `sequencedir` to
# `<sample_name><run_prefix[3:15]>001.fastq.gz`, using the sample_name ->
# run_prefix mapping from data/run_prefix.tsv.
# NOTE(review): the [3:15] slice presumably picks the `_L2_L006_R1_`-style
# lane/read part of run_prefix (e.g. 1629.SubjectIBD001_L2_L006_R1_001.fastq.gz)
# - confirm against the TSV.
#
# The decision to rename is made per file, so the cell is idempotent: files that
# were already renamed (their stem is no longer a bare sample_name) are skipped,
# and an interrupted run can simply be re-executed.

df = pd.read_csv(f"{datadir}/run_prefix.tsv", sep="\t")
prefix = df[["sample_name", "run_prefix"]]

# sample_name -> run_prefix lookup; for a duplicated sample_name the first row wins.
run_prefix_by_sample = (
    prefix.drop_duplicates("sample_name")
    .set_index("sample_name")["run_prefix"]
    .to_dict()
)

# listing directories
file_list = os.listdir(sequencedir)

fastq_suffix = ".fastq.gz"
for file in file_list:
    # Only study 1629 FASTQ files are candidates.
    if not (file.startswith("1629") and file.endswith(fastq_suffix)):
        continue

    # Derive the sample id from the file name itself rather than a fixed width.
    sample_id = file[:-len(fastq_suffix)]
    if sample_id not in run_prefix_by_sample:
        # Already renamed by an earlier run, or not listed in run_prefix.tsv.
        continue

    run_prefix = run_prefix_by_sample[sample_id]
    src = file
    dst = sample_id + run_prefix[3:15] + "001.fastq.gz"
    os.rename(f'{sequencedir}/{src}', f'{sequencedir}/{dst}')
    print(src)
    print(dst)

In [3]:
# Load data
# This may take a very long time (up to a few hours)
if not 'demux-single-end.qza' in os.listdir(outputdir):
    !qiime tools import --type 'SampleData[SequencesWithQuality]' --input-path data/sequences --input-format CasavaOneEightSingleLanePerSampleDirFmt --output-path output/demux-single-end.qza

In [4]:
# Demultiplexed summary
# Runs `qiime demux summarize` on the imported reads (output/demux-single-end.qza)
# and saves the resulting visualization as output/viz/demux-single-end.qzv.

!qiime demux summarize --i-data output/demux-single-end.qza --o-visualization output/viz/demux-single-end.qzv

[32mSaved Visualization to: output/viz/demux-single-end.qzv[0m


In [5]:
# Filter
# Runs `qiime quality-filter q-score` on output/demux-single-end.qza.
# Writes the filtered reads to output/demux-filtered.qza and the per-run
# filter statistics to output/demux-filter-stats.qza (--verbose prints tool output).

!qiime quality-filter q-score --i-demux output/demux-single-end.qza --o-filtered-sequences output/demux-filtered.qza --o-filter-stats output/demux-filter-stats.qza --verbose

  phred_offset = yaml.load(metadata_view)['phred-offset']
[32mSaved SampleData[SequencesWithQuality] to: output/demux-filtered.qza[0m
[32mSaved QualityFilterStats to: output/demux-filter-stats.qza[0m


In [6]:
# Deblur
# Runs `qiime deblur denoise-16S` on the quality-filtered reads with a trim
# length of 90. Outputs: representative sequences (output/rep-seqs-deblur.qza),
# feature table (output/table-deblur.qza) and stats (output/deblur-stats.qza).
# NOTE(review): --p-jobs-to-start 8 is hard-coded; presumably tuned to the
# author's machine - adjust to the available cores.

!qiime deblur denoise-16S --i-demultiplexed-seqs output/demux-filtered.qza --p-trim-length 90 --o-representative-sequences output/rep-seqs-deblur.qza --o-table output/table-deblur.qza --verbose --p-sample-stats --p-jobs-to-start 8 --o-stats output/deblur-stats.qza

[32mSaved FeatureTable[Frequency] to: output/table-deblur.qza[0m
[32mSaved FeatureData[Sequence] to: output/rep-seqs-deblur.qza[0m
[32mSaved DeblurStats to: output/deblur-stats.qza[0m


In [7]:
# Phylogeny mafft

!qiime phylogeny align-to-tree-mafft-fasttree   --i-sequences output/rep-seqs-deblur.qza   --o-alignment output/aligned-rep-seqs.qza   --o-masked-alignment output/masked-aligned-rep-seqs.qza   --o-tree output/unrooted-tree.qza   --o-rooted-tree output/rooted-tree.qza --verbose
!qiime diversity core-metrics-phylogenetic --i-phylogeny output/rooted-tree.qza --i-table output/table-deblur.qza --p-sampling-depth 1103 --m-metadata-file data/metadata.tsv --output-dir output/core-metrics-results --verbose

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: mafft --preservecase --inputorder --thread 1 /tmp/qiime2-archive-k7b5xfsl/0ffc96cb-3b1b-4546-9ead-bd4714520bb4/data/dna-sequences.fasta

inputfile = orig
3111 x 90 - 90 d
nthread = 1
nthreadpair = 1
nthreadtb = 1
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
 3101 / 3111 (thread    0)
done.

Constructing a UPGMA tree (efffree=0) ... 
 3100 / 3111
done.

Progressive alignment 1/2... 
STEP    71 / 3110 (thread    0)
Reallocating..done. *alloclen = 1181
STEP  3101 / 3110 (thread    0) h
done.

Making a distance matrix from msa.. 
 3100 / 3111 (thread    0)
done.

Constructing a UPGMA tree (efffree=1) ... 
 3100 / 3111
done.

Progressive alignment

In [None]:
# Taxonomy
# Assigns taxonomy to the Deblur representative sequences and collapses the
# feature table to a fixed taxonomic level.

# Classifier
# Download the pre-trained SILVA 138 (99%) naive Bayes classifier into data/;
# `-nc` makes wget skip the download when the file is already present.
!wget -nc https://data.qiime2.org/2020.11/common/silva-138-99-nb-classifier.qza -P data/
# Classify output/rep-seqs-deblur.qza with that classifier -> output/taxonomy.qza
!qiime feature-classifier classify-sklearn --i-classifier data/silva-138-99-nb-classifier.qza --i-reads output/rep-seqs-deblur.qza --o-classification output/taxonomy.qza

# Data
# Tabulate the taxonomy assignments into a visualization (output/taxonomy.qzv).
!qiime metadata tabulate --m-input-file output/taxonomy.qza --o-visualization output/taxonomy.qzv
# Collapse the Deblur feature table at taxonomy level 6 -> output/gut-table-l6.qza
# NOTE(review): level 6 presumably corresponds to genus for this classifier - confirm.
!qiime taxa collapse --i-table output/table-deblur.qza --i-taxonomy output/taxonomy.qza --p-level 6 --o-collapsed-table output/gut-table-l6.qza

File ‘data/silva-138-99-nb-classifier.qza’ already there; not retrieving.

