Step9c
Date: May 13, 2024
Purpose: Merge the bedtools intersect output bedtools_intersect_Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.bed (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/bedtools_intersect_Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.bed) with the miRanda target table Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.tab (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.tab) so that each miRNA target row carries more information (the genome annotation it intersects), and then merge in the ShortStack miRNA counts (Counts.txt).
The previous step in the pipeline is: miranda_bedtools_intersect_2024-5-13.sh (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Scripts/BEDtools/Intersect/miranda_bedtools_intersect_2024-5-13.sh)
The next step in the pipeline is: blast_miRBase_alignment_2024-5-13.sh (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Scripts/blast/blast_miRBase_alignment_2024-5-13.sh)

In [1]:
# Import needed packages
import pandas as pd
import re

In [2]:
# Read the miRanda target table (Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.tab) as a dataframe
# NOTE(review): read_csv is called without header=None, so the first line of the
# file is consumed as a header and then replaced by the names below. Confirm the
# .tab file really starts with a header line; otherwise its first record is lost.
miranda_df = pd.read_csv('/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.tab', sep='\t')
# Rename the columns of the miranda_df
miranda_df.columns = ['miRNA Sequence (Mature)', 'miRNA Target Sequence', 'Total Score', 'Total Energy', 'Max Score', 'Max Energy', 'Strand', 'Length of miRNA', 'Length of miRNA Target', 'Positions']

# Drop rows that are duplicates
miranda_df.drop_duplicates(inplace=True)

# Extract miRNA Sequence info
# The parsing below expects names shaped like '<cluster>.<maturity>::<chrom>:<start>-<end>(<strand>)'
mirna_names = miranda_df['miRNA Sequence (Mature)']
# Add a new column 'miRNA Cluster': everything before the first '.'
miranda_df['miRNA Cluster'] = mirna_names.str.split('.').str[0]
# Add a new column 'Maturity': the text between the first '.' and the '::'
miranda_df['Maturity'] = mirna_names.str.split('.').str[1].str.split('::').str[0]
# Add new column for 'miRNA Sequence Chrom': the text between '::' and the next ':'
miranda_df['miRNA Sequence Chrom'] = mirna_names.str.split('::').str[1].str.split(':').str[0]
# Add new column for "miRNA Start": the digits between ':' and '-'
miranda_df['miRNA Start'] = mirna_names.str.extract(r':(\d+)-', expand=False).astype(int)
# Add new column for "miRNA End": the digits after '-' that are followed by the
# '(' of the strand (or by the end of the name). An unanchored '-(\d+)' would
# take the first '-<digits>' anywhere in the name, e.g. the 10 in
# PE-reconstructed-10x-myo, instead of the end coordinate.
miranda_df['miRNA End'] = mirna_names.str.extract(r'-(\d+)(?:\(|$)', expand=False).astype(int)
# Add new column for "miRNA Strandedness": the text inside the parentheses
miranda_df['miRNA Strandedness'] = mirna_names.str.split('(').str[1].str.split(')').str[0]

# Extract Target Sequence info
# The parsing below expects names shaped like '<chrom>:<start>-<end>'
target_names = miranda_df['miRNA Target Sequence']
# Add new column for 'miRNA Target Chrom'
miranda_df['miRNA Target Chrom'] = target_names.str.split(':').str[0]
# Add new column for "miRNA Target Start"
miranda_df['miRNA Target Start'] = target_names.str.extract(r':(\d+)-', expand=False).astype(int)
# Add new column for "miRNA Target End"
# Anchored to the end of the name: an unanchored '-(\d+)' does not work for
# myotoxin, where the 10 from PE-reconstructed-10x-myo gets taken instead of the end number.
miranda_df['miRNA Target End'] = target_names.str.extract(r'-(\d+)$', expand=False).astype(int)

# Move the columns
column_order = ['miRNA Sequence (Mature)', 'miRNA Cluster', 'Maturity', 'miRNA Sequence Chrom', 'miRNA Start', 'miRNA End', 'miRNA Strandedness', 'miRNA Target Sequence', 'miRNA Target Chrom', 'miRNA Target Start', 'miRNA Target End', 'Total Score', 'Total Energy', 'Max Score', 'Max Energy', 'Strand', 'Length of miRNA', 'Length of miRNA Target', 'Positions']
miranda_df = miranda_df[column_order]

# Display the first few rows to verify the result
#print(miranda_df.head())

In [5]:
# Load the bedtools intersect output; header=None means every line is read as data
bedtools_df = pd.read_csv(
    '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/bedtools_intersect_Cvv_2017_genome_with_myos_CDS_miranda_miRNA_targets.bed',
    sep='\t',
    header=None,
)

# Give the positional columns descriptive names
bedtools_df = bedtools_df.set_axis(
    [
        'Genome Chrom',
        'Assembler',
        'Feature type',
        'Genome Start',
        'Genome End',
        'Unk1',
        'Genome Strandedness',
        'Unk3',
        'GTF ID Info',
        'miRNA Target Chrom',
        'miRNA Target Start',
        'miRNA Target End',
    ],
    axis=1,
)

# Define a function to parse the GTF ID Info column
def parse_gtf_info(cell_info):
    """Parse a ';'-separated attribute string into a dictionary.

    The text is split on ';' into entries, and each entry is split at its
    first space into a key and a value. Values are returned as written,
    including any surrounding double quotes.

    Args:
        cell_info: Attribute text such as 'gene_id "g1"; transcript_id "t1";'.

    Returns:
        A dict mapping each key to its value string. An entry that has a key
        but no value maps to ''. If a key occurs more than once, the last
        value wins. Non-string input (for example NaN from a missing cell)
        yields an empty dict.
    """
    # A missing cell arrives as a float NaN, which has no .split()
    if not isinstance(cell_info, str):
        return {}

    parsed_info = {}
    for pair in cell_info.split(';'):
        entry = pair.strip()
        if not entry:  # Skip empty pairs (e.g. after a trailing ';')
            continue
        # partition never raises: a key-only entry simply gets an empty value
        key, _, value = entry.partition(' ')
        parsed_info[key.strip()] = value.strip()
    return parsed_info

# Parse every 'GTF ID Info' cell into a dictionary of its attributes
bedtools_df['gtf_info_dict'] = bedtools_df['GTF ID Info'].map(parse_gtf_info)

# Collect the union of attribute keys seen in any row
gtf_dicts = bedtools_df['gtf_info_dict']
all_keys = {key for info_dict in gtf_dicts for key in info_dict}

# Add one column per attribute key; rows whose dictionary lacks the key get None
for key in all_keys:
    bedtools_df[key] = gtf_dicts.map(lambda info_dict: info_dict.get(key))

# Remove the intermediate 'gtf_info_dict' column (the original 'GTF ID Info' column is kept)
del bedtools_df['gtf_info_dict']

# Display the updated dataframe
#print(bedtools_df.head())

# Define new column order; only the columns listed here are kept, and a listed
# column that does not exist raises a KeyError
new_column_order = [
    'Genome Chrom', 'Assembler', 'Feature type', 'Genome Start', 'Genome End',
    'Unk1', 'Genome Strandedness', 'Unk3', 'GTF ID Info',
    'gene_id', 'transcript_id', 'ID', 'Parent',
    'Anolis_Blast_Type', 'Anolis_Homolog', 'Crovir_Transcript_ID',
    'Python_Blast_Type', 'Python_Homolog',
    'Thamnophis_Blast_Type', 'Thamnophis_Homolog',
    'miRNA Target Chrom', 'miRNA Target Start', 'miRNA Target End',
]

# Reorder the columns
bedtools_df = bedtools_df[new_column_order]

# Define a function to remove quotation marks
def remove_quotation_marks(cell):
    """Return *cell* with every double-quote character removed.

    Values that are not strings are returned unchanged.
    """
    return cell.replace('"', '') if isinstance(cell, str) else cell

# Strip double quotes from every column except the raw 'GTF ID Info' text
not_gtf_info = bedtools_df.columns != 'GTF ID Info'
unquoted_values = bedtools_df.loc[:, not_gtf_info].map(remove_quotation_marks)
bedtools_df.loc[:, not_gtf_info] = unquoted_values

# Remove rows that exactly repeat an earlier row
bedtools_df.drop_duplicates(inplace=True)

#print(bedtools_df.head())

In [6]:

# Inner-join the miRanda table with the bedtools table on the target
# coordinates; only targets present in both dataframes are kept
target_key_columns = ['miRNA Target Chrom', 'miRNA Target Start', 'miRNA Target End']
bedtools_miRanda_tab_merged_df = miranda_df.merge(bedtools_df, how='inner', on=target_key_columns)

In [7]:
# Load the ShortStack Counts.txt table as a dataframe
shortstack_counts_df = pd.read_csv(
    '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/miRanda_mirna_inputs_from_shortstack/2024-5-13_Run/Counts/Counts.txt',
    sep='\t',
)

# Replace the column names with clearer ones
shortstack_counts_df.columns = [
    'miRNA Sequence for Counts (Hairpin)',
    'miRNA Cluster',
    'miRNA Yes or No',
    'LVG_2',
    'LVG_4',
    'LVG_9',
    'RVG_12S',
    'RVG_5S',
    'RVG_6S',
    'RVG_7S',
]

# Keep only the rows flagged 'Y' in the 'miRNA Yes or No' column
is_mirna = shortstack_counts_df['miRNA Yes or No'].eq('Y')
filtered_shortstack_counts_df = shortstack_counts_df[is_mirna]


In [8]:
# Inner-join the merged miRanda/bedtools table with the filtered ShortStack
# counts on 'miRNA Cluster', then remove the helper columns and duplicate rows
shortstack_bedtools_miranda_df = (
    bedtools_miRanda_tab_merged_df
    .merge(filtered_shortstack_counts_df, on='miRNA Cluster', how='inner')
    # The miRNA Yes or No, Unk1, and Unk3 columns are not needed past this point
    .drop(columns=['miRNA Yes or No', 'Unk1', 'Unk3'])
    # Drop the rows that are duplicates
    .drop_duplicates()
)

In [9]:
# Write the full merged table to disk as a tab-separated file
# (to_csv also writes the dataframe index as the first column by default)
master_table_path = '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/CDS_Shortstack_miRanda_bedtoolsintersect_Master.IMPORTANT.2024.5.13.tsv'
shortstack_bedtools_miranda_df.to_csv(master_table_path, sep='\t')

In [11]:
# Build a smaller, more usable version of the merged table by removing
# columns that are not needed downstream
unneeded_columns = [
    'Maturity',
    'miRNA Sequence (Mature)',
    'miRNA Target Sequence',
    'GTF ID Info',
    'transcript_id',
    'Assembler',
    'ID',
    'Parent',
    'Anolis_Blast_Type',
    'Anolis_Homolog',
    'Python_Blast_Type',
    'Python_Homolog',
    'Thamnophis_Blast_Type',
    'Thamnophis_Homolog',
    'Feature type',
]
filtered_shortstack_bedtools_miranda_df = shortstack_bedtools_miranda_df.drop(columns=unneeded_columns)

# Write the reduced table to disk as a tab-separated file
filtered_table_path = '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Results/miRanda/miRanda_2024-5-13/CDS_Column_Filtered_Shortstack_miRanda_bedtoolsintersect_Master.IMPORTANT2024.5.13.tsv'
filtered_shortstack_bedtools_miranda_df.to_csv(filtered_table_path, sep='\t')