In [4]:
%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import gridspec
import matplotlib.patheffects as path_effects

import os,glob,copy
import numpy as np
import squarify

typeface='Helvetica Neue'
mpl.rcParams['mathtext.fontset']='custom'
mpl.rcParams['font.sans-serif']=typeface
mpl.rcParams['mathtext.default']='sf'
mpl.rcParams['axes.labelweight']=300
mpl.rcParams['font.family']=typeface
mpl.rcParams['font.size']=22

base_path='/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Metdata_senzor_project.csv' ## where contig_quality and contig folders sit

metadata={}
with open(base_path, 'r') as f:  # Use 'with' to open the file safely
    for line_num, line in enumerate(f):
        l = line.strip().split(',')  # Remove '\r\n' and split by comma
        if line_num == 0:  # Assuming the first line is the header
            header = l
        else:
            metadata[l[0]] = {header[i]: l[i].strip() for i in range(len(l))}  # Safely strip spaces

In [16]:
import os
import pandas as pd
from Bio import SeqIO  # To parse FASTA files

# Define file paths
csv_target_directory = '/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/'
final_csv_path = os.path.join(csv_target_directory, '1total_updated_final_merged_contig_read_counts.csv')
fasta_file_path = "/Volumes/aine_store/SENZOR_project/Blast_databse/NCBI_virus_reference.fa"  # Update this with the path to your TXT file

# Step 1: Parse the FASTA file to extract accession IDs, family, genus, and virus names
accession2info = {}
for record in SeqIO.parse(fasta_file_path, "fasta"):
    header = record.description
    # Split the header based on the pipe symbol
    accession_id = header.split('|')[0].split('.')[0].strip()  # Accession ID without version (e.g., NC_086348 from NC_086348.1)
    family = header.split('|')[1].strip()  # Family (e.g., 'Phenuiviridae')
    genus = header.split('|')[2].strip()  # Genus (e.g., 'Phlebovirus')
    virus_name = header.split('|')[3].strip()  # Virus name (e.g., 'Frijoles phlebovirus')
    
    # Store the family, genus, and virus name in a dictionary, mapping by the accession ID without version number
    accession2info[accession_id] = {'family': family, 'genus': genus, 'name': virus_name}

# Step 2: Load the CSV file
df = pd.read_csv(final_csv_path)

# Step 3: Remove the version numbers from the 'subject' column in the CSV (if present)
df['subject'] = df['subject'].str.split('.').str[0]

# Step 4: Update the CSV with family, genus, and virus names based on the 'subject' column
df['family'] = df['subject'].map(lambda acc: accession2info.get(acc, {}).get('family'))  # Create or update 'family' column
df['genus'] = df['subject'].map(lambda acc: accession2info.get(acc, {}).get('genus'))  # Create or update 'genus' column
df['name'] = df['subject'].map(lambda acc: accession2info.get(acc, {}).get('name'))  # Update the 'name' column

# Step 5: Save the updated CSV to a new file
updated_csv_path = os.path.join(csv_target_directory, '1complete_final_merged_contig_read_counts_with_family_genus_names.csv')
df.to_csv(updated_csv_path, index=False)

print(f"Updated CSV with family, genus, and virus names saved at: {updated_csv_path}")

Updated CSV with family, genus, and virus names saved at: /Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/1complete_final_merged_contig_read_counts_with_family_genus_names.csv


In [13]:
df =pd.read_csv("/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/updated_final_merged_contig_read_counts_with_genus_and_names.csv")

df.head(100)

Unnamed: 0,sample,contig_name,contig_length,read_count,subject,percent_identity,alignment_length,mismatches,gap_opens,q_start,...,taxid_blast,blasted,taxon_group_blast,taxid_curated,rdrp,name,segment,curated,taxon_group,genus
0,AIAMACAT001_S69,k141_0,226,2,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
1,AIAMACAT001_S69,k141_176892,268,10,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
2,AIAMACAT001_S69,k141_235855,231,2,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
3,AIAMACAT001_S69,k141_58964,324,7,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
4,AIAMACAT001_S69,k141_117928,281,4,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,AIAMACAT001_S69,k141_235869,285,6,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
96,AIAMACAT001_S69,k141_412760,384,7,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
97,AIAMACAT001_S69,k141_117939,244,2,,,,,,,...,,True,Viruses,,False,,,True,Viruses,
98,AIAMACAT001_S69,k141_13,258,4,,,,,,,...,,True,Viruses,,False,,,True,Viruses,


In [17]:
import os
import pandas as pd
from ete3 import NCBITaxa

# Initialize NCBI taxonomy object
ncbi = NCBITaxa()

# Define the path to the virus_accession2taxid.txt file
accession2taxid_file = "/Volumes/aine_store/SENZOR_project/Blast_databse/virus_accession2taxid.txt"

# Load the virus_accession2taxid.txt file into a dictionary
accession2taxid = {}
with open(accession2taxid_file, 'r') as f:
    for line in f:
        accession, taxid = line.strip().split('\t')  # Assuming it's tab-delimited
        accession2taxid[accession] = int(taxid)  # Store taxid as an integer

# Load your CSV file
csv_file = '/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/1complete_final_merged_contig_read_counts_with_family_genus_names.csv'  # Update with your actual file path
df = pd.read_csv(csv_file)

# Function to calculate the number of aligned bases (alignment_length * percent_identity)
def aligned_bases(row):
    if pd.notna(row['alignment_length']) and pd.notna(row['percent_identity']):
        return row['alignment_length'] * (row['percent_identity'] / 100)
    return 0

# Apply the aligned_bases function to all rows to compute the aligned bases for each hit
df['aligned_bases'] = df.apply(aligned_bases, axis=1)

# Identify the best BLAST hit (with the highest aligned bases) for each contig
df['best_hit'] = df.groupby('contig_name')['aligned_bases'].transform(max)

# Filter hits that are within the threshold:
# aligned_bases >= best_hit - mismatches
def is_valid_hit(row):
    if pd.notna(row['best_hit']) and pd.notna(row['mismatches']):
        threshold = row['best_hit'] - row['mismatches']
        return row['aligned_bases'] >= threshold
    return False

df['valid_hit'] = df.apply(is_valid_hit, axis=1)

# Function to compute the LCA of taxonomic IDs using NCBI taxonomy
def compute_lca(taxids):
    if len(taxids) == 0:
        return None
    try:
        taxid_lca = ncbi.get_topology(taxids).get_common_ancestor(taxids).name
        return taxid_lca
    except:
        return None

# Remove version numbers from the 'subject' column (everything after the first dot)
df['subject'] = df['subject'].str.split('.').str[0]

# Map subjects to taxonomic IDs using the accession2taxid dictionary
df['taxid_blast'] = df['subject'].map(accession2taxid)

# Group valid hits by contig_name and calculate the LCA
df['taxid_curated'] = df.groupby('contig_name')['taxid_blast'].transform(lambda x: compute_lca(x.dropna().tolist()))

# Save the updated dataframe to a new CSV file
output_csv_file = '/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/11complete_final_merged_contig_read_counts_with_family_genus_names.csv'  # Update with your desired output file path
df.to_csv(output_csv_file, index=False)

print(f"Updated CSV saved at: {output_csv_file}")

  df['taxid_curated'] = df.groupby('contig_name')['taxid_blast'].transform(lambda x: compute_lca(x.dropna().tolist()))


Updated CSV saved at: /Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/11complete_final_merged_contig_read_counts_with_family_genus_names.csv


In [18]:
import os
import pandas as pd
from Bio import SeqIO  # To parse FASTA files

# Define file paths
csv_target_directory = '/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/'
final_csv_path = os.path.join(csv_target_directory, '11complete_final_merged_contig_read_counts_with_family_genus_names.csv')
fasta_file_path = "/Volumes/aine_store/SENZOR_project/Blast_databse/NCBI_virus_reference.fa"  # Update this with the path to your FASTA file

# Step 1: Parse the FASTA file to extract accession IDs, family, genus, and virus names
accession2info = {}
for record in SeqIO.parse(fasta_file_path, "fasta"):
    header = record.description
    # Split the header based on the pipe symbol
    accession_id = header.split('|')[0].split('.')[0].strip()  # Accession ID without version (e.g., NC_086348 from NC_086348.1)
    family = header.split('|')[1].strip()  # Family (e.g., 'Phenuiviridae')
    genus = header.split('|')[2].strip()  # Genus (e.g., 'Phlebovirus')
    virus_name = header.split('|')[3].strip()  # Virus name (e.g., 'Frijoles phlebovirus')
    
    # Store the family, genus, and virus name in a dictionary, mapping by the accession ID without version number
    accession2info[accession_id] = {'family': family, 'genus': genus, 'name': virus_name}

# Step 2: Load the CSV file
df = pd.read_csv(final_csv_path)

# Step 3: Remove the version numbers from the 'subject' column in the CSV (if present)
df['subject'] = df['subject'].str.split('.').str[0]

# Step 4: Mark 'dark contigs' for contigs with no BLAST hits (no match in accession2info)
df['dark_contig'] = df['subject'].apply(lambda acc: 'Yes' if acc not in accession2info else 'No')

# Step 5: Save the updated CSV to a new file
output_csv_file = os.path.join(csv_target_directory, 'LCA_complete_final_merged_contig_read_counts_with_dark_contigs.csv')
df.to_csv(output_csv_file, index=False)

print(f"Updated CSV with dark contigs saved at: {output_csv_file}")


Updated CSV with dark contigs saved at: /Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/LCA_complete_final_merged_contig_read_counts_with_dark_contigs.csv


In [14]:
import os
import pandas as pd

# Define the paths
csv_target_directory = '/Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/'
final_csv_path = os.path.join(csv_target_directory, 'updated_final_merged_contig_read_counts_with_genus_and_names.csv')
accession2taxid_file = "/Volumes/aine_store/SENZOR_project/Blast_databse/virus_accession2taxid.txt"

# Load the virus_accession2taxid.txt file into a dictionary
accession2taxid = {}
with open(accession2taxid_file, 'r') as f:
    for line in f:
        accession, taxid = line.strip().split('\t')  # Assuming it's tab-delimited
        accession2taxid[accession] = taxid

# Load the final_merged_contig_read_counts.csv
df = pd.read_csv(final_csv_path)

# Remove version numbers from the 'subject' column (everything after the first dot)
df['subject'] = df['subject'].str.split('.').str[0]

# Update the 'taxid_curated' column in the CSV based on the 'subject' column matching with the accession IDs
df['taxid_curated'] = df['subject'].map(accession2taxid)

# Save the updated dataframe back to CSV
updated_csv_path = os.path.join(csv_target_directory, '1total_updated_final_merged_contig_read_counts.csv')
df.to_csv(updated_csv_path, index=False)

print(f"Updated CSV saved at: {updated_csv_path}")


Updated CSV saved at: /Volumes/aine_store/SENZOR_project/SPlited_SENSOR_porject/IDseq/Scylla_merged_csv_files/1total_updated_final_merged_contig_read_counts.csv


In [7]:
import os
import pandas as pd

# Define the directory where the CSV files are located
csv_target_directory = '/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/'

# List to store dataframes for merging
csv_dataframes = []

# Loop through each CSV file in the target directory
for file_name in os.listdir(csv_target_directory):
    if file_name.endswith('_final_merged_blast_contig_read_counts.csv'):  # Ensure it matches the correct CSV files
        full_file_path = os.path.join(csv_target_directory, file_name)
        
        # Read the CSV file and append to the list of dataframes
        df = pd.read_csv(full_file_path)
        csv_dataframes.append(df)

# Merge all CSV dataframes into one, ignoring the extra headers
merged_csv = pd.concat(csv_dataframes, ignore_index=True)

# Save the merged dataframe into a new CSV file
final_csv_path = os.path.join(csv_target_directory, 'final_merged_contig_read_counts.csv')
merged_csv.to_csv(final_csv_path, index=False)

print(f"Final merged CSV saved at: {final_csv_path}")

  df = pd.read_csv(full_file_path)
  df = pd.read_csv(full_file_path)


Final merged CSV saved at: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/final_merged_contig_read_counts.csv


In [5]:
import os
import shutil

# Define the base path and target directory
base_path = '/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed'
folders = ['CAT_fastq', 'CHICKEN_fastq', 'DOG_fastq', 'GOAT_fastq', 'LIZARD_fastq', 'PIG_fastq', 'SHEEP_fastq']
target_directory = '/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/'

# Create the target directory if it doesn't exist
os.makedirs(target_directory, exist_ok=True)

# Traverse each folder and copy the relevant CSV files
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    
    # Walk through all subdirectories in the folder
    for root, dirs, files in os.walk(folder_path):
        # Check if the current path contains the "assemblies/viruses" folder
        if 'assemblies/viruses' in root:
            # Loop through the files and find the desired CSV files
            for file in files:
                if file.endswith('_final_merged_blast_contig_read_counts.csv'):
                    # Construct full file path
                    full_file_path = os.path.join(root, file)
                    # Copy file to the target directory
                    shutil.copy(full_file_path, target_directory)
                    print(f"Copied: {full_file_path} to {target_directory}")

print("Finished copying files.")


Copied: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CAT_fastq/AIAMACAT001_S69/assemblies/viruses/AIAMACAT001_S69_final_merged_blast_contig_read_counts.csv to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/
Copied: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CAT_fastq/SEZONOWIFCAT001_S34/assemblies/viruses/SEZONOWIFCAT001_S34_final_merged_blast_contig_read_counts.csv to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/
Copied: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CHICKEN_fastq/AINWZCHICKEN001_S56/assemblies/viruses/AINWZCHICKEN001_S56_final_merged_blast_contig_read_counts.csv to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/merged_csv_files/
Copied: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CHICKEN_fastq/AINWZCHICKEN002_S57/assemblies/viruses/AINWZCHICKEN002_S57_final_merged_blast_contig_r

In [6]:
import os
import shutil

# Define the base path and target directory
base_path = '/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed'
folders = ['CAT_fastq', 'CHICKEN_fastq', 'DOG_fastq', 'GOAT_fastq', 'LIZARD_fastq', 'PIG_fastq', 'SHEEP_fastq']
target_directory = '/Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/megahit_contigs/'

# Create the target directory if it doesn't exist
os.makedirs(target_directory, exist_ok=True)

# Traverse each folder and copy the relevant megahit_assembled_contigs.fa files
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    
    # Walk through all subdirectories in the folder
    for root, dirs, files in os.walk(folder_path):
        # Check if the current path contains the "assemblies/viruses" folder
        if 'assemblies/viruses' in root:
            # Extract the sample name from the folder path
            sample_name = root.split(os.sep)[-3]  # Assuming the sample folder is two levels above "viruses"
            
            # Loop through the files and find the "megahit_assembled_contigs.fa"
            for file in files:
                if file == 'megahit_assembled_contigs.fa':
                    # Construct full file path
                    full_file_path = os.path.join(root, file)
                    
                    # Create a new filename with the sample name prefixed
                    new_filename = f"{sample_name}_megahit_assembled_contigs.fa"
                    new_file_path = os.path.join(target_directory, new_filename)
                    
                    # Copy file to the target directory with the new name
                    shutil.copy(full_file_path, new_file_path)
                    print(f"Copied and renamed: {full_file_path} to {new_file_path}")

print("Finished copying and renaming megahit contigs files.")

Copied and renamed: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CAT_fastq/AIAMACAT001_S69/assemblies/viruses/megahit_assembled_contigs.fa to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/megahit_contigs/AIAMACAT001_S69_megahit_assembled_contigs.fa
Copied and renamed: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CAT_fastq/SEZONOWIFCAT001_S34/assemblies/viruses/megahit_assembled_contigs.fa to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/megahit_contigs/SEZONOWIFCAT001_S34_megahit_assembled_contigs.fa
Copied and renamed: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CHICKEN_fastq/AINWZCHICKEN001_S56/assemblies/viruses/megahit_assembled_contigs.fa to /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/megahit_contigs/AINWZCHICKEN001_S56_megahit_assembled_contigs.fa
Copied and renamed: /Volumes/aine_store/SENZOR_project/1stBatch_senzor/Scylla_completed/CHICKEN_fa