# Fungut

# 00 Packages and Directory

In [2]:
import IPython

import pandas as pd
import matplotlib.pyplot as plt
import qiime2 as q2
import seaborn as sns
from qiime2 import Visualization


import os

import matplotlib.pyplot as plt
%matplotlib inline

# 01 Data import

In [3]:
# Relative path to the tab-separated sample metadata file of the Fungut dataset.
PATH = "fungut_data/fungut_metadata.tsv"

In [4]:
# Read the tab-separated sample metadata into a DataFrame.
surveys_df = pd.read_csv(
    filepath_or_buffer=PATH,
    sep="\t",
)

In [5]:
# Directory holding all QIIME 2 inputs and outputs; interpolated into the shell cells below.
data_dir = "fungut_data"

In [7]:
!qiime tools peek $data_dir/fungut_forward_reads.qza

[32mUUID[0m:        3638611d-1767-413b-9390-70ee3d78e4ff
[32mType[0m:        SampleData[SequencesWithQuality]
[32mData format[0m: SingleLanePerSampleSingleEndFastqDirFmt


In [8]:
!qiime demux summarize \
  --i-data $data_dir/fungut_forward_reads.qza \
  --o-visualization $data_dir/demux_summary.qzv

  import pkg_resources
[32mSaved Visualization to: fungut_data/demux_summary.qzv[0m
[0m[?25h

In [9]:
# Display the demux summary inline in the notebook.
demux_summary_qzv = f"{data_dir}/demux_summary.qzv"
Visualization.load(demux_summary_qzv)

# 02 Trimming the primers

In [8]:
!qiime cutadapt trim-single \
  --i-demultiplexed-sequences $data_dir/fungut_forward_reads.qza \
  --p-front CTTGGTCATTTAGAGGAAGTAA \
  --o-trimmed-sequences $data_dir/fungut_forward_reads_trimmed.qza \
  --verbose

  import pkg_resources
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt -u 0 --error-rate 0.1 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 --cores 1 -o /tmp/qiime2/jovyan/processes/9805-1761647670.19@jovyan/tmp/q2-OutPath-it4lyt0k/ERR5327198_01_L001_R1_001.fastq.gz --front CTTGGTCATTTAGAGGAAGTAA /tmp/qiime2/jovyan/data/3638611d-1767-413b-9390-70ee3d78e4ff/data/ERR5327198_01_L001_R1_001.fastq.gz

This is cutadapt 5.1 with Python 3.10.14
Command line parameters: -u 0 --error-rate 0.1 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 --cores 1 -o /tmp/qiime2/jovyan/processes/9805-1761647670.19@jovyan/tmp/q2-OutPath-it4lyt0k/ERR5327198_01_L001_R1_001.fastq.gz --front CTTGGTCATTTAGAGGAAGTAA /tmp/qiime2/jovyan/data/3638611d-1767-413b-9390-70ee3d78e4ff/data

# 03 Denoising

### 1. Denoising attempt with trim-left 0 and trunc-len 0

In [11]:
!qiime dada2 denoise-single \
   --i-demultiplexed-seqs $data_dir/fungut_forward_reads.qza \
   --p-trim-left 0 \
   --p-trunc-len 0 \
   --p-min-fold-parent-over-abundance 4 \
   --p-max-ee 4 \
    --o-representative-sequences $data_dir/dada2_rep_seqs_1.qza \
    --o-table $data_dir/dada2_table_1.qza \
    --o-denoising-stats $data_dir/dada2_stats_1.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: fungut_data/dada2_table_1.qza[0m
[32mSaved FeatureData[Sequence] to: fungut_data/dada2_rep_seqs_1.qza[0m
[32mSaved SampleData[DADA2Stats] to: fungut_data/dada2_stats_1.qza[0m
[0m[?25h

In [12]:
! qiime metadata tabulate \
    --m-input-file $data_dir/dada2_stats_1.qza \
    --o-visualization $data_dir/dada2_stats_1.qzv

  import pkg_resources
[32mSaved Visualization to: fungut_data/dada2_stats_1.qzv[0m
[0m[?25h

In [13]:
# Display the run-1 denoising statistics inline.
dada2_stats_1_qzv = f"{data_dir}/dada2_stats_1.qzv"
Visualization.load(dada2_stats_1_qzv)

In [14]:
! qiime feature-table tabulate-seqs \
    --i-data $data_dir/dada2_rep_seqs_1.qza \
    --o-visualization $data_dir/dada2_rep_seqs_1.qzv

  import pkg_resources
[32mSaved Visualization to: fungut_data/dada2_rep_seqs_1.qzv[0m
[0m[?25h

In [15]:
# Display the run-1 representative sequences inline.
dada2_rep_seqs_1_qzv = f"{data_dir}/dada2_rep_seqs_1.qzv"
Visualization.load(dada2_rep_seqs_1_qzv)

In [16]:
! qiime feature-table summarize \
    --i-table $data_dir/dada2_table_1.qza \
    --m-sample-metadata-file $data_dir/fungut_metadata.tsv \
    --o-visualization $data_dir/dada2_table_1.qzv

  import pkg_resources
[32mSaved Visualization to: fungut_data/dada2_table_1.qzv[0m
[0m[?25h

In [17]:
# Display the run-1 feature-table summary inline.
dada2_table_1_qzv = f"{data_dir}/dada2_table_1.qzv"
Visualization.load(dada2_table_1_qzv)

### 2. Denoising attempt with trim-left 15 and trunc-len 130

In [None]:
!qiime dada2 denoise-single \
   --i-demultiplexed-seqs $data_dir/fungut_forward_reads.qza \
   --p-trim-left 15 \
   --p-trunc-len 130 \
   --p-min-fold-parent-over-abundance 4 \
   --p-max-ee 4 \
    --o-representative-sequences $data_dir/dada2_rep_seqs_2.qza \
    --o-table $data_dir/dada2_table_2.qza \
    --o-denoising-stats $data_dir/dada2_stats_2.qza

In [None]:
! qiime metadata tabulate \
    --m-input-file $data_dir/dada2_stats_2.qza \
    --o-visualization $data_dir/dada2_stats_2.qzv

In [None]:
# Display the run-2 denoising statistics inline.
dada2_stats_2_qzv = f"{data_dir}/dada2_stats_2.qzv"
Visualization.load(dada2_stats_2_qzv)

In [None]:
! qiime feature-table tabulate-seqs \
    --i-data $data_dir/dada2_rep_seqs_2.qza \
    --o-visualization $data_dir/dada2_rep_seqs_2.qzv

In [None]:
# Display the run-2 representative sequences inline.
dada2_rep_seqs_2_qzv = f"{data_dir}/dada2_rep_seqs_2.qzv"
Visualization.load(dada2_rep_seqs_2_qzv)

In [None]:
! qiime feature-table summarize \
    --i-table $data_dir/dada2_table_2.qza \
    --m-sample-metadata-file $data_dir/fungut_metadata.tsv \
    --o-visualization $data_dir/dada2_table_2.qzv

In [None]:
# Display the run-2 feature-table summary inline.
dada2_table_2_qzv = f"{data_dir}/dada2_table_2.qzv"
Visualization.load(dada2_table_2_qzv)

Comparing denoised data from runs 1 and 2:
ITS sequences are more variable in length than 16S sequences. With a fixed truncation length there is a risk of setting it too short and losing valid ITS reads. Given that the quality of the original data was good overall (quality scores above 30 everywhere), it is better to keep all sequences. We therefore continue the following steps with the data obtained in run 1.

In [None]:
! qiime feature-classifier classify-sklearn ?


Denoising of the primer-trimmed reads (output of the cutadapt step in section 02):

In [None]:
!qiime dada2 denoise-single \
   --i-demultiplexed-seqs $data_dir/fungut_forward_reads_trimmed.qza \
   --p-trim-left 0 \
   --p-trunc-len 0 \
   --p-min-fold-parent-over-abundance 4 \
   --p-max-ee 4 \
    --o-representative-sequences $data_dir/rep_seqs_trimmed.qza \
    --o-table $data_dir/table_trimmed.qza \
    --o-denoising-stats $data_dir/stats_trimmed.qza

In [None]:
! qiime feature-table tabulate-seqs \
    --i-data $data_dir/rep_seqs_trimmed.qza \
    --o-visualization $data_dir/rep_seqs_trimmed.qzv

In [None]:
# Display the representative sequences of the primer-trimmed run inline.
rep_seqs_trimmed_qzv = f"{data_dir}/rep_seqs_trimmed.qzv"
Visualization.load(rep_seqs_trimmed_qzv)

# 04 Taxonomy

### 04.01 Taxonomy using pretrained classifier

In [None]:
# File name of the pretrained UNITE classifier artifact used for taxonomy assignment.
# It must be a string literal: the bare file name is not a valid Python
# expression and made this cell fail with a SyntaxError.
classifier = "unite_ver10_dynamic_s_all_19.02.2025-Q2-2024.10.qza"

In [None]:
! qiime metadata tabulate \
    --m-input-file $data_dir/taxonomy.qza \
    --o-visualization $data_dir/taxonomy.qzv

In [6]:
# Display the taxonomy table inline.
taxonomy_qzv = f"{data_dir}/taxonomy.qzv"
Visualization.load(taxonomy_qzv)

In [None]:
# Notes on running jobs on Euler (translated from German and kept as comments,
# because free text in a code cell raises a SyntaxError on "Run All"):
#   - load miniconda and QIIME ("chiime" in the original note) on Euler
#   - after that we can communicate with Euler
#   - for every job, define the parameters and where the files should be saved
#   - run the job
#   - check: did it work?

In [None]:
# Job setup notes (translated from German and kept as comments, because free
# text in a code cell raises a SyntaxError on "Run All"):
#   - conda -> use the amplicon distribution (not "m"; presumably the
#     metagenome distribution — TODO confirm)
#   - transferring files
#   - time: 24 h
#   - CPU: base the request on the database, then ask for slightly more than that
