<a href="https://colab.research.google.com/github/MishterBluesky/Tn-seek/blob/master/Tn_Seek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tn-Seek - Automated Tn-seq processing and analysis

This notebook uses custom scripts to analyse Tn-seq data. Select your genome and upload your FASTQ files; the notebook detects over-represented sequences in the reads and trims these inverted repeats to clean your data automatically.

The comparison of transposon insertions between your samples is then processed automatically and written to a CSV file.

In [1]:
#@title Download Tn-Seek software and genome
#@markdown please provide a direct url to your genome files for download.
!pip install biopython pandas matplotlib
!wget -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x miniconda.sh
!bash ./miniconda.sh -b -f -p /usr/local

# Update PATH environment variable to include conda
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages')
!conda init bash

# Install bowtie2
!apt-get install -y bowtie2

# Create a directory for the reference genome
import os
os.makedirs('/content/ref_genome/GENOME', exist_ok=True)

# Download the correct reference genome file
url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.fna.gz' #@param
correct_url = url
!wget -P /content/ref_genome/GENOME {correct_url}

# Unzip the genome file if it's in .gz format
!gunzip /content/ref_genome/GENOME/GCF_000009045.1_ASM904v1_genomic.fna.gz

# Check if the genome file is downloaded correctly
genome_path = '/content/ref_genome/GENOME/GCF_000009045.1_ASM904v1_genomic.fna'
if os.path.exists(genome_path):
    print("Genome file downloaded successfully.")
else:
    print("Error: Genome file not found.")
    # Exit if the genome file is not found
    exit(1)

# Clone the GitHub repository
!git clone https://github.com/MishterBluesky/Tn-seek.git

# Navigate to the script directory
%cd Tn-seek

# Make the script executable
!chmod +x TnSeq3-2.sh

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.84
--2024-07-16 09:21:02--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.32.241, 104.16.191.158, 2606:4700::6810:bf9e, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.32.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 146836934 (140M) [application/octet-stream]
Saving to: ‘miniconda.sh’


2024-07-16 09:21:03 (235 MB/s) - ‘miniconda.sh’ saved [146836934/146836934]

PREFIX=/usr/local
Unpacking payload ...

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working... done
installation finished.
    You curr

In [2]:
# Install cutadapt from a fresh clone of its upstream repository and print the
# installed version as a sanity check.
# NOTE(review): the clone is not pinned to a tag or commit, so every run builds
# whatever is currently on the default branch (the logged output shows a
# 4.10 development build).
!git clone https://github.com/marcelm/cutadapt.git
!pip install ./cutadapt
!cutadapt --version

Cloning into 'cutadapt'...
remote: Enumerating objects: 12896, done.[K
remote: Counting objects: 100% (2505/2505), done.[K
remote: Compressing objects: 100% (655/655), done.[K
remote: Total 12896 (delta 1647), reused 2416 (delta 1578), pack-reused 10391[K
Receiving objects: 100% (12896/12896), 3.48 MiB | 8.69 MiB/s, done.
Resolving deltas: 100% (8428/8428), done.
Processing ./cutadapt
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting dnaio>=1.2.0 (from cutadapt==4.10.dev2+gadfbc18)
  Downloading dnaio-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting xopen>=1.6.0 (from cutadapt==4.10.dev2+gadfbc18)
  Downloading xopen-2.0.2-py3-none-any.whl.metadata (15 kB)
Collecting isal>=1.6.1 (from xopen>=1.6.0->cutadapt==4.10.dev2+gadfbc18)
  Downloading isal-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.w

In [3]:

import os
!cd /content/Tn-seek/





# Wildtype analysis




In [4]:
!cd /content/Tn-seek/
from google.colab import files
uploaded = files.upload()
#@markdown Please upload your wildtype control fastq file. This takes some time typically.
original_name = next(iter(uploaded.keys()))
# New file name (change extension to '.fastq')
new_name = original_name.replace('.fq', '.fastq')

# Save the uploaded file with the new name
with open(new_name, 'wb') as f:
    f.write(uploaded[original_name])
genome_path1 = f'/content/ref_genome/GENOME/GENOME'
wildtype_filename = original_name.replace('.fq', '')

prefix1 = wildtype_filename

Saving 2413.fastq to 2413.fastq


In [6]:
#@markdown Optionally we will save your fastq file to your google drive if it isnt already, as uploading this takes a long time.
import os
import shutil

from google.colab import drive
from google.colab import files

drive.mount('/content/drive')
destination_folder = '/content/drive/My Drive/Tn-seek/'
# shutil.copy does not create missing directories, so make sure the target
# folder exists on a Drive that has never been used with Tn-seek before.
os.makedirs(destination_folder, exist_ok=True)

# Default when nothing was uploaded; otherwise the loop below leaves `filename`
# set to the last uploaded file name. The loop variable keeps this name on
# purpose because a later cell reads `filename`.
filename = f'{wildtype_filename}.fastq'
for filename in uploaded.keys():
    shutil.copy(filename, destination_folder + filename)
    print(f'copied "{filename}" to "{destination_folder}"')

Mounted at /content/drive
copied "2413.fastq" to "/content/drive/My Drive/Tn-seek/"


In [17]:
#@markdown This code will find repeat sequences within your fastq and remove them..
from collections import defaultdict
from itertools import islice

# Step 1: Read the head of the FASTQ file.
# sample_size counts *lines*, not reads: a FASTQ record spans 4 lines, so
# 10000 lines correspond to at most 2500 reads.
sample_size = 10000
wildtype_filename = wildtype_filename.replace('.fastq', '')
prefix1 = wildtype_filename
# Build the path from the sample prefix instead of relying on `filename`, which
# only exists if the optional Google Drive cell was run.
file_path = f'{wildtype_filename}.fastq'
print(file_path)
with open(file_path, 'r') as file:
    lines = [line.strip() for line in islice(file, sample_size)]

# Step 2: Extract Nucleotide Sequences (second line of every 4-line record)
nucleotide_sequences = lines[1::4]

# Step 3: Count Occurrences of Sequences
sequence_counts = defaultdict(int)

def count_subsequences(seq, min_length):
    """Add every substring of `seq` that is at least `min_length` long to `sequence_counts`.

    Each occurrence is counted, so a substring that appears twice in one read
    contributes 2.
    """
    for length in range(min_length, len(seq) + 1):
        for i in range(len(seq) - length + 1):
            sequence_counts[seq[i:i+length]] += 1

# Count subsequences for all nucleotide sequences
for seq in nucleotide_sequences:
    count_subsequences(seq, 5)

total_sequences = len(nucleotide_sequences)

# Step 4: Determine Frequent Sequences
# Keep substrings longer than 15 nt seen at a rate above 10% of the sampled
# reads, or longer than 7 nt seen at a rate above 30%.
frequent_sequences = {
    seq: count for seq, count in sequence_counts.items()
    if (len(seq) > 15 and count / total_sequences > 0.10) or
       (len(seq) > 7 and count / total_sequences > 0.30)
}

# Output the results (unsorted list)
frequent_sequences_list = list(frequent_sequences.keys())
output_filename = f'IR_sequences_{wildtype_filename}.txt'
with open(output_filename, 'w') as file:
    file.writelines(f"{seq}\n" for seq in frequent_sequences_list)

# Sort sequences based on their lengths in descending order
sequences = frequent_sequences_list
sequences_sorted = sorted(sequences, key=len, reverse=True)
if not sequences_sorted:
    print('Warning: no over-represented sequences were found in the sampled reads.')

# Write the sorted sequences to their own text file
output_filename = f'IR_sequences_{wildtype_filename}_sorted.txt'
with open(output_filename, 'w') as file:
    file.writelines(sequence + '\n' for sequence in sequences_sorted)
print(f'Saved sequences to {output_filename}')

# Write the sorted sequences to a new file in numbered FASTA format
fasta_output_filename = f'IR_sequences_{wildtype_filename}_sorted.fasta'
with open(fasta_output_filename, 'w') as file:
    for index, sequence in enumerate(sequences_sorted, start=1):
        file.write(f">Sequence_{index}\n")
        file.write(f"{sequence}\n")

print(f'Saved sequences to {fasta_output_filename}')

2413.fastq
Saved sequences to IR_sequences_2413_sorted.txt
Saved sequences to IR_sequences_2413_sorted.fasta


In [18]:
# Trim the detected repeat sequences from the wildtype reads with cutadapt.
import os
import subprocess
import shutil
# FASTA of over-represented sequences written by the detection cell.
fasta = f'IR_sequences_{wildtype_filename}_sorted.fasta'
# Copy original fastq file to .trim.fastq
# NOTE(review): cutadapt writes to this same path via -o below, so this copy
# appears to be overwritten -- confirm whether it is still needed.
shutil.copyfile(f'{wildtype_filename}.fastq', f'{wildtype_filename}.trim.fastq')
fastq_file = f'{wildtype_filename}.fastq'
cut_fastq = f'{wildtype_filename}.trim.fastq'
# Single cutadapt run over the whole file (no Python loop): every sequence in
# the FASTA is passed as a -b adapter, -m 15 sets the minimum read length,
# -l 15 sets the output read length (see the cutadapt user guide for the exact
# semantics of these options).
!cutadapt -b file:{fasta} -m 15 -l 15 -o {cut_fastq} {fastq_file}
print("All sequences processed with cutadapt.")

This is cutadapt 4.10.dev2+gadfbc18 with Python 3.12.4
Command line parameters: -b file:IR_sequences_2413_sorted.fasta -m 15 -l 15 -o 2413.trim.fastq 2413.fastq
Processing single-end reads on 1 core ...
Done           00:00:26     8,476,106 reads @   3.1 µs/read;  19.08 M reads/minute
Finished in 26.653 s (3.145 µs/read; 19.08 M reads/minute).

=== Summary ===

Total reads processed:               8,476,106

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads written (passing filters):     8,476,106 (100.0%)

Total basepairs processed:   135,617,696 bp
Total written (filtered):    127,141,590 bp (93.8%)
All sequences processed with cutadapt.


In [None]:
prefix1= wildtype_filename
genome_path = '/content/ref_genome/GENOME/GCF_000009045.1_ASM904v1_genomic.fna'
!bowtie2-build {genome_path} /content/ref_genome/GENOME/GENOME
ir_seq = 'GCTA'
!./TnSeq3-2.sh -i {ir_seq} -g /content/ref_genome/GENOME/GENOME {prefix1}

Settings:
  Output files: "/content/ref_genome/GENOME/GENOME.*.bt2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Max bucket size: default
  Max bucket size, sqrt multiplier: default
  Max bucket size, len divisor: 4
  Difference-cover sample period: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  /content/ref_genome/GENOME/GCF_000009045.1_ASM904v1_genomic.fna
Building a SMALL index
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
bmax according to bmaxDivN setting: 1053901
Using parameters --bmax 790426 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with

# Condition analysis

In [None]:
!cd /content/Tn-seek/
from google.colab import files
uploaded = files.upload()
#@markdown Please upload your condition fastq file, and note the file prefix
original_name = next(iter(uploaded.keys()))
# New file name (change extension to '.fastq')
new_name = original_name.replace('.fq', '.fastq')

# Save the uploaded file with the new name
with open(new_name, 'wb') as f:
    f.write(uploaded[original_name])
condition_filename = original_name.replace('.fq', '')
prefix2 = condition_filename

In [None]:
#@markdown Optionally we will save your fastq file to your google drive if it isnt already, as uploading this takes a long time.
import os
import shutil

from google.colab import drive
from google.colab import files

drive.mount('/content/drive')
destination_folder = '/content/drive/My Drive/Tn-seek/'
# shutil.copy does not create missing directories, so make sure the target
# folder exists on a Drive that has never been used with Tn-seek before.
os.makedirs(destination_folder, exist_ok=True)

# Default when nothing was uploaded; otherwise the loop below leaves `filename`
# set to the last uploaded file name. The loop variable keeps this name on
# purpose because a later cell reads `filename`.
filename = f'{condition_filename}.fastq'
for filename in uploaded.keys():
    shutil.copy(filename, destination_folder + filename)
    print(f'copied "{filename}" to "{destination_folder}"')

In [None]:
#@markdown This code will find repeat sequences within your fastq and remove them..
from collections import defaultdict
from itertools import islice

# Step 1: Read the head of the FASTQ file.
# sample_size counts *lines*, not reads: a FASTQ record spans 4 lines, so
# 10000 lines correspond to at most 2500 reads.
sample_size = 10000
condition_filename = condition_filename.replace('.fastq', '')
prefix2 = condition_filename
# Build the path from the sample prefix instead of relying on `filename`, which
# only exists if the optional Google Drive cell was run.
file_path1 = f'{condition_filename}.fastq'
# Report the condition file being sampled (not the wildtype `file_path`).
print(file_path1)
with open(file_path1, 'r') as file:
    lines = [line.strip() for line in islice(file, sample_size)]

# Step 2: Extract Nucleotide Sequences (second line of every 4-line record)
nucleotide_sequences = lines[1::4]

# Step 3: Count Occurrences of Sequences
sequence_counts = defaultdict(int)

def count_subsequences(seq, min_length):
    """Add every substring of `seq` that is at least `min_length` long to `sequence_counts`.

    Each occurrence is counted, so a substring that appears twice in one read
    contributes 2.
    """
    for length in range(min_length, len(seq) + 1):
        for i in range(len(seq) - length + 1):
            sequence_counts[seq[i:i+length]] += 1

# Count subsequences for all nucleotide sequences
for seq in nucleotide_sequences:
    count_subsequences(seq, 5)

total_sequences = len(nucleotide_sequences)

# Step 4: Determine Frequent Sequences
# Keep substrings longer than 15 nt seen at a rate above 10% of the sampled
# reads, or longer than 7 nt seen at a rate above 30%.
frequent_sequences = {
    seq: count for seq, count in sequence_counts.items()
    if (len(seq) > 15 and count / total_sequences > 0.10) or
       (len(seq) > 7 and count / total_sequences > 0.30)
}

# Output the results (unsorted list)
frequent_sequences_list = list(frequent_sequences.keys())
output_filename = f'IR_sequences_{condition_filename}.txt'
with open(output_filename, 'w') as file:
    file.writelines(f"{seq}\n" for seq in frequent_sequences_list)

# Sort sequences based on their lengths in descending order
sequences = frequent_sequences_list
sequences_sorted = sorted(sequences, key=len, reverse=True)
if not sequences_sorted:
    print('Warning: no over-represented sequences were found in the sampled reads.')

# Write the sorted sequences to their own text file
output_filename = f'IR_sequences_{condition_filename}_sorted.txt'
with open(output_filename, 'w') as file:
    file.writelines(sequence + '\n' for sequence in sequences_sorted)
print(f'Saved sequences to {output_filename}')

# Write the sorted sequences to a new file in numbered FASTA format
fasta_output_filename1 = f'IR_sequences_{condition_filename}_sorted.fasta'
with open(fasta_output_filename1, 'w') as file:
    for index, sequence in enumerate(sequences_sorted, start=1):
        file.write(f">Sequence_{index}\n")
        file.write(f"{sequence}\n")

print(f'Saved sequences to {fasta_output_filename1}')

In [None]:
# Trim the detected repeat sequences from the condition reads with cutadapt.
import os
import subprocess
import shutil
# FASTA of over-represented sequences written by the detection cell.
fasta1 = f'IR_sequences_{condition_filename}_sorted.fasta'
# Copy original fastq file to .trim.fastq
# NOTE(review): cutadapt writes to this same path via -o below, so this copy
# appears to be overwritten -- confirm whether it is still needed.
shutil.copyfile(f'{condition_filename}.fastq', f'{condition_filename}.trim.fastq')
fastq_file1 = f'{condition_filename}.fastq'
cut_fastq1 = f'{condition_filename}.trim.fastq'
# Single cutadapt run over the whole file (no Python loop): every sequence in
# the FASTA is passed as a -b adapter, -m 15 sets the minimum read length,
# -l 15 sets the output read length (see the cutadapt user guide for the exact
# semantics of these options).
!cutadapt -b file:{fasta1} -m 15 -l 15 -o {cut_fastq1} {fastq_file1}
print("All sequences processed with cutadapt.")

In [None]:
# Run the TnSeq3 pipeline script on the condition sample, reusing the Bowtie2
# index prefix /content/ref_genome/GENOME/GENOME built in the wildtype section.
ir_seq2 = 'GCTA'
# NOTE(review): -i is presumably the inverted-repeat sequence and -g the Bowtie2
# index prefix expected by TnSeq3-2.sh -- confirm against the script.
!./TnSeq3-2.sh -i {ir_seq2} -g /content/ref_genome/GENOME/GENOME {prefix2}

# Tnseq visualisation and comparison

In [None]:
# pandas and matplotlib are also requested by the pip install in the first
# setup cell; this cell repeats the install for the visualisation section.
# NOTE(review): versions are not pinned.
!pip install pandas matplotlib


In [None]:
#@title Align your cleaned fq files to the genome to find transposon insertion site frequencies for each gene.
import pandas as pd
import matplotlib.pyplot as plt
import requests
import shutil
import gzip

# Function to download the GFF file
def download_gff(url, output_path):
    """Download `url` and store the response body unchanged at `output_path`.

    Parameters
    ----------
    url : str
        Direct link to the (gzip-compressed) GFF file.
    output_path : str
        Local path the downloaded bytes are written to.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would be saved as if it were the annotation file.
    """
    # The context manager releases the connection even when the copy fails;
    # the timeout prevents the cell from hanging forever on a stalled server.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(output_path, 'wb') as out_file:
            # response.raw yields the bytes as sent, so the .gz file stays compressed.
            shutil.copyfileobj(response.raw, out_file)

# Function to parse GFF file and extract genes
def parse_gff(gff_file):
    """Extract all `gene` features from a gzip-compressed GFF3 file.

    Parameters
    ----------
    gff_file : str
        Path to the gzip-compressed GFF3 file.

    Returns
    -------
    pandas.DataFrame
        One row per gene with the columns `start`, `end`, `strand`,
        `gene_id` (value of the `ID` attribute), `gene_name` (value of the
        `Name` attribute, or an empty string when the gene has none) and
        `gene_length` (`end - start + 1`).
    """
    columns = ['start', 'end', 'strand', 'gene_id', 'gene_name', 'gene_length']
    genes = []
    with gzip.open(gff_file, 'rt') as file:
        for line in file:
            # An embedded sequence section follows this directive; it holds no features.
            if line.startswith('##FASTA'):
                break
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            # Skip blank or truncated lines instead of failing on parts[2].
            if len(parts) < 9 or parts[2] != 'gene':
                continue
            start = int(parts[3])
            end = int(parts[4])
            strand = parts[6]

            # Column 9 is a ';'-separated list of key=value pairs. Look the
            # values up by key: reading them by position mistakes e.g. the
            # biotype for the name when a gene has no Name attribute.
            fields = [field for field in parts[8].split(';') if field]
            attributes = dict(field.split('=', 1) for field in fields if '=' in field)

            gene_id = attributes.get('ID')
            if gene_id is None:
                # Fall back to the first attribute's value when there is no ID key.
                gene_id = fields[0].split('=', 1)[1] if fields and '=' in fields[0] else ''
            gene_name = attributes.get('Name', '')
            gene_length = end - start + 1  # Calculate gene length
            genes.append((start, end, strand, gene_id, gene_name, gene_length))
    return pd.DataFrame(genes, columns=columns)

# Function to process transposon sites and find hits
def process_transposon_sites(prefix, gff_file):
    """Sum the transposon insertion counts of one sample per gene.

    Parameters
    ----------
    prefix : str
        Sample prefix. The insertion sites are read from
        `{prefix}/{prefix}-sites.txt`, a whitespace-separated file without a
        header whose two columns are used as `count` and `position`.
    gff_file : str
        Path to the gzip-compressed GFF file handed to `parse_gff`.

    Returns
    -------
    pandas.DataFrame
        Columns `gene_id`, `gene_name`, `gene_length` and `count`, with one
        row per gene that contains at least one insertion site. A site that
        falls inside several overlapping genes is counted for each of them.
        The frame is empty (but has these columns) when no site hits a gene.
    """
    # Load transposon insertion sites.
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    transposon_sites = pd.read_csv(f'{prefix}/{prefix}-sites.txt', sep=r'\s+', header=None)
    transposon_sites.columns = ['count', 'position']

    # Parse GFF file to get genes dataframe
    genes_df = parse_gff(gff_file)
    gene_starts = genes_df['start']
    gene_ends = genes_df['end']

    results = []
    for position, count in zip(transposon_sites['position'], transposon_sites['count']):
        # Genes whose [start, end] interval (inclusive) contains the site.
        hit_genes = genes_df[(gene_starts <= position) & (gene_ends >= position)]
        for gene in hit_genes.itertuples(index=False):
            results.append({
                "prefix": prefix,
                "transposon_position": position,
                "count": count,
                "gene_id": gene.gene_id,
                "gene_name": gene.gene_name,
                "gene_start": gene.start,
                "gene_end": gene.end,
                "strand": gene.strand,
                "gene_length": gene.gene_length  # Include gene length
            })

    if not results:
        # A frame built from an empty list has no columns, so the groupby
        # below would raise KeyError; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['gene_id', 'gene_name', 'gene_length', 'count'])

    results_df = pd.DataFrame(results)

    # Aggregate counts per gene (summing counts from different positions)
    aggregated_results_df = results_df.groupby(['gene_id', 'gene_name', 'gene_length']).agg({
        'count': 'sum'
    }).reset_index()

    return aggregated_results_df

# URL for Bacillus subtilis GFF file from Ensembl Bacteria
gff_url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/gff3/bacteria_0_collection/bacillus_subtilis_subsp_subtilis_str_168_gca_000009045/Bacillus_subtilis_subsp_subtilis_str_168_gca_000009045.ASM904v1.59.gff3.gz"
gff_file = "Bacillus_subtilis_subsp_subtilis_str_168_gca_000009045.ASM904v1.59.gff3.gz"

# Download the GFF file
download_gff(gff_url, gff_file)

# Parse the GFF file to get genes dataframe
genes_df = parse_gff(gff_file)

# Columns that identify a gene in every per-sample table.
gene_key_columns = ['gene_id', 'gene_name', 'gene_length']

# Process transposon sites for prefix1 (wildtype)
wildtype_results_df = process_transposon_sites(prefix1, gff_file)
wildtype_results_df.to_csv(f'{prefix1}_transposon_hits.csv', index=False)

# Process transposon sites for prefix2 (condition)
condition_results_df = process_transposon_sites(prefix2, gff_file)
condition_results_df.to_csv(f'{prefix2}_transposon_hits.csv', index=False)

# Ensure all genes are included by merging with the complete gene list;
# genes without any insertion get a count of 0.
wildtype_full_df = pd.merge(genes_df, wildtype_results_df, on=gene_key_columns, how='left').fillna(0)
condition_full_df = pd.merge(genes_df, condition_results_df, on=gene_key_columns, how='left').fillna(0)

# Merge wildtype and condition results on gene_id, gene_name and gene_length
merged_results_df = pd.merge(wildtype_full_df[gene_key_columns + ['count']],
                             condition_full_df[gene_key_columns + ['count']],
                             on=gene_key_columns,
                             suffixes=('_wildtype', '_condition'),
                             how='outer')

# Ensure that merged dataframe has zero counts for missing values.
# Assign the result back: calling fillna(inplace=True) on a column selection
# is deprecated and does not modify the frame under pandas copy-on-write.
count_columns = ['count_wildtype', 'count_condition']
merged_results_df[count_columns] = merged_results_df[count_columns].fillna(0)

# Save merged results to CSV
merged_results_df.to_csv('Tnseek-transposon_hits_merged.csv', index=False)

# Print the first few rows of the merged results
print(merged_results_df.head())


In [None]:
#@title Filter the genes to remove Housekeeping genes and common insertion sites (extremely high and low tn insertion frequencies in the wildtype) and plot a ratio graph
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#@markdown The housekeeping threshold is the number of basepairs between each transposon. A way to reduce the background of housekeeping genes and background noise between samples is to reduce this threshold. This reduces contenders that already have very low counts from the data. Eventually however this will affect all genes, and randomly remove good data.
housekeeping_threshold = 200 #@param
if housekeeping_threshold <= 0:
    raise ValueError("housekeeping_threshold must be a positive number of base pairs.")
# Convert "one insertion every N bp" into a minimum insertion frequency per bp.
housekeeping_threshold = 1/housekeeping_threshold
# Genes with a higher insertion frequency than this (insertions per bp) in
# either sample are labelled as common insertion sites.
insertion_site_tif_limit = 5

# Load the merged results CSV (replace with your actual CSV file path).
# Empty gene names come back from the CSV as NaN; restore them to '' so the
# plot labels do not read "nan".
merged_results_df = pd.read_csv('Tnseek-transposon_hits_merged.csv', dtype={'gene_name': str})
merged_results_df['gene_name'] = merged_results_df['gene_name'].fillna('')

# Add TIF (Transposon Insertion Frequency) columns: count / gene length,
# defined as 0 for a zero-length gene.
gene_length = merged_results_df['gene_length']
merged_results_df['TIF_WT'] = (
    merged_results_df['count_wildtype'] / gene_length
).where(gene_length != 0, 0)
merged_results_df['TIF_Condition'] = (
    merged_results_df['count_condition'] / gene_length
).where(gene_length != 0, 0)
tif_wt = merged_results_df['TIF_WT']
tif_condition = merged_results_df['TIF_Condition']

# Add RATIO columns; a ratio with a zero denominator is reported as infinity.
merged_results_df['RATIO'] = (tif_wt / tif_condition).where(tif_condition != 0, np.inf)
ratio = merged_results_df['RATIO']
# RATIO_BOTHENDS folds ratios below 1 onto their reciprocal so that changes in
# either direction show up as large values.
merged_results_df['RATIO_BOTHENDS'] = np.maximum(ratio, 1 / ratio).where(ratio != 0, np.inf)

# Filter out housekeeping genes and frequent transposon insertion sites.
# These columns mix numbers and text labels, hence the object dtype.
merged_results_df['RATIO_BOTHENDS_HK_FILTER'] = (
    merged_results_df['RATIO_BOTHENDS'].astype(object)
    .where(tif_wt > housekeeping_threshold, "Housekeeping")
)
merged_results_df['TNF1'] = merged_results_df['RATIO_BOTHENDS_HK_FILTER'].mask(
    tif_wt > insertion_site_tif_limit, "TI_Insertion_site"
)
merged_results_df['TNF2'] = merged_results_df['TNF1'].mask(
    tif_condition > insertion_site_tif_limit, "TI_Insertion_site"
)
merged_results_df.to_csv('Tnseek-transposon_hits_merged_ratio.csv', index=False)

# Filter for non-housekeeping, non-transposon insertion site genes
filtered_df = merged_results_df[
    (merged_results_df['TNF2'] != 'Housekeeping') &
    (merged_results_df['TNF2'] != 'TI_Insertion_site')
].copy()  # Ensure a copy of the DataFrame to avoid the SettingWithCopyWarning

# Convert 'RATIO' column to numeric, coerce non-numeric values to NaN
filtered_df['RATIO'] = pd.to_numeric(filtered_df['RATIO'], errors='coerce')

# Remove rows with infinite 'RATIO' values
filtered_df = filtered_df[~np.isinf(filtered_df['RATIO'])]

# Separate plot for the ratio
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(filtered_df['gene_id'], filtered_df['RATIO'], c='black', label='Filtered Genes')

# Label the 20 genes with the largest ratio (red) and the 20 with the smallest (blue)
top_20_genes = filtered_df.nlargest(20, 'RATIO').reset_index(drop=True)
bottom_20_genes = filtered_df.nsmallest(20, 'RATIO').reset_index(drop=True)
for labelled_genes, label_colour in ((top_20_genes, 'red'), (bottom_20_genes, 'blue')):
    for _, row in labelled_genes.iterrows():
        ax.text(row['gene_id'], row['RATIO'], f"{row['gene_name']} ({row['RATIO']:.2f})",
                fontsize=6, color=label_colour, ha='center')

ax.set_yscale('log')
ax.axhline(y=1, color='r', linestyle='--', label='Ratio = 1')
ax.set_xlabel('Gene ID')
ax.set_ylabel('Ratio (TIF_WT / TIF_Condition)')
ax.set_title('Transposon Insertion Frequency Ratio for Filtered Genes (Excluding Infinite Values)')
ax.legend()
ax.set_xticks([])  # Remove x-axis labels
fig.tight_layout()
plt.show()


In [None]:
# Show the genes labelled on the ratio plot: the 20 filtered genes with the
# largest RATIO and the 20 with the smallest RATIO (both tables are built in
# the plotting cell).
print(top_20_genes)
print(bottom_20_genes)

In [None]:
#@title Download results
import os
from google.colab import files
import shutil

# Zip and download the pipeline output folder of each sample. The folder is
# resolved relative to the working directory, which is where the analysis cell
# reads `{prefix}/{prefix}-sites.txt` from.
for sample_name in (wildtype_filename, condition_filename):
    folder_to_zip = os.path.abspath(sample_name)
    output_zip_file = f'TnSeek_{sample_name}.zip'
    if not os.path.isdir(folder_to_zip):
        print(f'Skipping "{folder_to_zip}": output folder not found.')
        continue
    # make_archive appends the .zip extension itself and returns the archive path.
    archive_path = shutil.make_archive(f'TnSeek_{sample_name}', 'zip', folder_to_zip)
    # Download the archive that was actually written (TnSeek_<sample>.zip).
    files.download(archive_path)

# The merged table is saved as 'Tnseek-transposon_hits_merged.csv' (hyphen after
# "Tnseek"), so download it under exactly that name.
files.download("Tnseek-transposon_hits_merged.csv")
files.download(f"{wildtype_filename}_transposon_hits.csv")
files.download(f"{condition_filename}_transposon_hits.csv")