# Script Description
## Step 9 - Fusing TAB and BED files

Date: January 14, 2025

Purpose: The purpose of this file is to merge the bedtools intersect outputs with the miRanda tab outputs.

Previous step: miranda_bedtools_intersect_2025.01.14.sh (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Scripts/BEDtools/Intersect/miranda_bedtools_intersect_2025.01.14.sh)

Next step: blast_miRBase_alignment_2024-5-13.sh (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Scripts/blast/blast_miRBase_alignment_2024-5-13.sh)

## Set up

### Import packages and set paths

In [59]:
# Import needed packages
import pandas as pd
import polars as pl
import polars.selectors as cs
import os
from pathlib import Path
import pathlib
import re
from Bio import SeqIO
import pyarrow
import re

# Set working directory (the relative paths below are resolved against it)
working_dir = "/home/administrator/Documents/Kaas/Venom_ncRNA_project/"
os.chdir(working_dir)
print('Working directory:', os.getcwd())

# Set file paths for important files
conversion_table_data = 'Usable_data/Genome_files/Cvv_GTF_to_converted_names_2024.2.18.txt' # Conversion table data
mirna_dir = 'Results/miRanda/miRanda_2025-01-12' # File path for the directory containing all of the miRanda tabular outputs
bed_dir = 'Results/miRanda/miRanda_2025-01-12/bedtools_intersect' # File path for the directory containing all of the bedtools intersect files
miRNA_fasta = 'Usable_data/miRanda_mirna_inputs_from_shortstack/2024-5-13_Run/Post_clean/mature_mir.fasta' # FASTA file containing the ShortStack outputs
# NOTE(review): the file name below is spelled 'haripin' - confirm it matches the file on disk
hairpin_fasta = 'Usable_data/miRanda_mirna_inputs_from_shortstack/2024-5-13_Run/Post_clean/haripin_mir.fasta' # FASTA file containing the sequences for the miRNA hairpins
blast_data = 'Results/blast/miRBase/miRNA_formated_alignment.tsv' # BLAST alignment table
reference_gtf = 'Usable_data/Genome_files/CroVir_rnd1.all.maker.final.homologIDs.updatedNov2019_with_myos_geneidmod_edited_with_BPP.gtf' # Reference GTF
miRNA_counts = 'Usable_data/miRanda_mirna_inputs_from_shortstack/2024-5-13_Run/Counts/Counts.txt' # miRNA counts table

Working directory: /home/administrator/Documents/Kaas/Venom_ncRNA_project


## Get file path arrays

### Get miRanda file paths

In [60]:
# Locate the miRanda tabular outputs inside the results directory
miranda_dir = Path(mirna_dir)

# Sample tab files (file names start with 'consensus')
miranda_tabs = list(miranda_dir.glob('consensus*.tab'))

# Report which sample files will be processed
if miranda_tabs:
    print("Success: miRanda output files found. The following will be processed:")
    print(*miranda_tabs, sep='\n')
else:
    print(f"Error: No miRanda output files found in {miranda_dir}")


# Reference tab files (file names start with 'Crotalus')
ref_miranda_tabs = list(miranda_dir.glob('Crotalus*.tab'))

# Report which reference files will be processed
if ref_miranda_tabs:
    print("Success: miRanda output files found. The following will be processed:")
    print(*ref_miranda_tabs, sep='\n')
else:
    print(f"Error: No miRanda output files found in {miranda_dir}")

Success: miRanda output files found. The following will be processed:
Results/miRanda/miRanda_2025-01-12/consensus_CV1087_viridis_North_F_five_prime_utr_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV0985_concolor_Other_F_CDS_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV1082_viridis_South_M_three_prime_utr_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV0857_viridis_North_M_three_prime_utr_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV1082_viridis_South_M_CDS_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV1087_viridis_North_F_CDS_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV1086_viridis_South_M_five_prime_utr_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV0985_concolor_Other_F_three_prime_utr_miranda_miRNA_targets.tab
Results/miRanda/miRanda_2025-01-12/consensus_CV0857_viridis_North_M_CDS_miranda_miRNA_

### Get bedtools intersect outputs

In [61]:
# Locate the bedtools intersect outputs inside their directory
bedtools_dir = Path(bed_dir)

# Sample BED files (file names start with 'consensus')
bed_files = list(bedtools_dir.glob('consensus*.bed'))

# Report which sample files will be processed
if bed_files:
    print("Success: bedtools output files found. The following will be processed:")
    print(*bed_files, sep='\n')
else:
    print(f"Error: No bedtools output files found in {bedtools_dir}")

# Reference BED files (file names start with 'Crotalus')
ref_bed_files = list(bedtools_dir.glob('Crotalus*.bed'))

# Report which reference files will be processed
if ref_bed_files:
    print("Success: bedtools output files found. The following will be processed:")
    print(*ref_bed_files, sep='\n')
else:
    print(f"Error: No bedtools output files found in {bedtools_dir}")

Success: bedtools output files found. The following will be processed:
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1086_viridis_South_M_three_prime_utr_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1086_viridis_South_M_five_prime_utr_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1082_viridis_South_M_five_prime_utr_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1087_viridis_North_F_CDS_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV0987_lutosus_Other_F_CDS_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1081_viridis_Mid_M_five_prime_utr_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV1081_viridis_Mid_M_three_prime_utr_bedtools_intersect.bed
Results/miRanda/miRanda_2025-01-12/bedtools_intersect/consensus_CV0987_lutosus

## Functions

### Define a function to format the miRanda outputs

In [62]:
# Define a function to format the miRanda data
def format_miRanda_data(df: pl.DataFrame) -> pl.DataFrame:
    """
    Rename the first ten miRanda columns by position, parse the miRNA and target
    identifiers into separate columns, and return the de-duplicated result.

    The parsing expressions assume miRNA identifiers of the form
    `<cluster>.<suffix>::<chrom>:<start>-<end>(<strand>)` and target identifiers
    of the form `<chrom>:<start>-<end>` (inferred from the split/regex patterns
    below - confirm against the miRanda input FASTA headers).

    Parameters:
        df (pl.DataFrame): Input Polars DataFrame containing raw miRanda data.
            Columns are addressed by position, so at least ten are required.

    Returns:
        pl.DataFrame: Transformed Polars DataFrame with renamed, extracted, and reordered columns.

    Raises:
        IndexError: If `df` has fewer than ten columns.
    """
    # New names for the first ten columns, in positional order
    positional_names = [
        'miRNA_sequence', 'miRNA_target_sequence', 'total_score', 'total_energy',
        'max_score', 'max_energy', 'strand', 'miRNA_length',
        'miRNA_target_length', 'positions',
    ]
    column_mapping = {df.columns[i]: name for i, name in enumerate(positional_names)}

    # Rename columns
    df = df.rename(column_mapping)

    mirna_id = pl.col('miRNA_sequence')
    target_id = pl.col('miRNA_target_sequence')

    # Apply transformations
    return (
        df
        .with_columns(
            # Extract miRNA info
            # Cluster name: everything before the first '.'
            mirna_id.str.split('.').list.get(0).alias('miRNA_cluster_original'),
            # Chromosome: text between '::' and the next ':'
            mirna_id.str.split('::').list.get(1).str.split(':').list.get(0).alias('miRNA_sequence_chrom'),
            # Start coordinate
            mirna_id.str.extract(r':(\d+)-').cast(pl.Int64).alias('miRNA_start'),
            # End coordinate. The pattern is tied to the ':<start>-<end>' range so that a
            # '-<digits>' earlier in the identifier (e.g. a contig named 'scaffold-1')
            # is not mistaken for the end position.
            mirna_id.str.extract(r':\d+-(\d+)').cast(pl.Int64).alias('miRNA_end'),
            # Strand: text between the parentheses
            mirna_id.str.split('(').list.get(1).str.split(')').list.get(0).alias('miRNA_strandedness'),

            # Extract target info
            # Chromosome: everything before the first ':'
            target_id.str.split(':').list.get(0).alias('miRNA_target_chrom'),
            # Start and end coordinates, anchored at the end of the identifier
            target_id.str.extract(r':(\d+)-\d+$').cast(pl.Int64).alias('miRNA_target_start'),
            target_id.str.extract(r'-(\d+)$').cast(pl.Int64).alias('miRNA_target_end'),
        )
        .select([
            'miRNA_cluster_original', 'miRNA_sequence_chrom',
            'miRNA_start', 'miRNA_end', 'miRNA_strandedness', 'miRNA_length',
            'miRNA_target_chrom', 'miRNA_target_start',
            'miRNA_target_end', 'miRNA_target_length', 'total_score',
            'total_energy', 'max_score', 'max_energy', 'strand', 'positions'
        ])
        .unique()
    )

### Define a function to format the bedtools intersect outputs

In [63]:
# Create function to format the bedtools intersect data
def format_bedtools_intersect_data(df: pl.DataFrame) -> pl.DataFrame:
    """
    Rename the first twelve bedtools intersect columns by position, drop the two
    unused columns, and expand the `key=value;...` attribute string in
    `gff_id_info` into one column per key.

    Parameters:
        df (pl.DataFrame): Input Polars DataFrame containing raw bedtools intersect data.
            Columns are addressed by position, so at least twelve are required.

    Returns:
        pl.DataFrame: The renamed, de-duplicated input with the parsed attribute columns
        joined on. The `Parent` attribute is renamed to `transcript_id`, and
        `extra_copy_number` is removed when present.

    Raises:
        IndexError: If `df` has fewer than twelve columns.
    """
    # New names for the first twelve columns, in positional order
    positional_names = [
        'genome_chrom', 'assembler', 'feature_type', 'genome_start', 'genome_end',
        'Unk1', 'genome_strandedness', 'Unk3', 'gff_id_info',
        'miRNA_target_chrom', 'miRNA_target_start', 'miRNA_target_end',
    ]
    column_mapping = {df.columns[i]: name for i, name in enumerate(positional_names)}

    # Rename and clean up columns
    df = (
        df.rename(column_mapping)
        .drop(['Unk1', 'Unk3'])  # Drop unnecessary columns
        .unique()  # Remove duplicate rows
    )

    # Columns that identify one annotated feature; used for the pivot and the join back
    feature_keys = ['genome_chrom', 'genome_start', 'genome_end', 'gff_id_info']

    # Parse `gff_id_info` for key-value pairs
    parsed_df = (
        df.with_columns(
            pl.col('gff_id_info').str.split(';').alias('key_value_pairs')  # Split into key-value pairs
        )
        .explode('key_value_pairs')  # One row per key-value pair
        .filter(pl.col('key_value_pairs').str.contains('='))  # Keep valid key-value pairs
        .with_columns(
            pl.col('key_value_pairs').str.split('=').alias('key_value_split')
        )
        .with_columns(
            pl.col('key_value_split').list.get(0).alias('key'),  # Extract keys
            pl.col('key_value_split').list.get(1).alias('value')  # Extract values
        )
        .pivot(
            values = 'value',
            index = feature_keys,  # Grouping columns
            on = 'key',  # Pivot key-value pairs into columns
            aggregate_function = 'first'  # Handle duplicate keys
        )
        # strict=False: not every annotation carries this attribute, so only drop it if present
        .drop(['extra_copy_number'], strict = False)
        .rename({'Parent': 'transcript_id'})
        .unique()
    )

    # Join the parsed DataFrame back to the original
    return df.join(parsed_df, on = feature_keys, how = 'left')


In [64]:
# Create function to format the bedtools intersect data
def format_bedtools_intersect_data2(df: pl.DataFrame, genome_type: str) -> pl.DataFrame:
    """
    Rename the first twelve bedtools intersect columns by position, drop the two
    unused columns, and expand the `key=value;...` attribute string in
    `gff_id_info` into one column per key.

    Parameters:
        df (pl.DataFrame): Input Polars DataFrame containing raw bedtools intersect data.
            Columns are addressed by position, so at least twelve are required.
        genome_type (str): Specifies whether the genome type is 'reference' or 'sample'.
            For 'sample' the `extra_copy_number` attribute column is removed when present.

    Returns:
        pl.DataFrame: The renamed, de-duplicated input with the parsed attribute columns
        joined on. The `Parent` attribute is renamed to `transcript_id`.

    Raises:
        ValueError: If `genome_type` is neither 'reference' nor 'sample'.
        IndexError: If `df` has fewer than twelve columns.
    """
    # Validate genome_type the same way the joining/processing functions do
    if genome_type not in {"reference", "sample"}:
        raise ValueError("genome_type must be either 'reference' or 'sample'.")

    # New names for the first twelve columns, in positional order
    positional_names = [
        'genome_chrom', 'assembler', 'feature_type', 'genome_start', 'genome_end',
        'Unk1', 'genome_strandedness', 'Unk3', 'gff_id_info',
        'miRNA_target_chrom', 'miRNA_target_start', 'miRNA_target_end',
    ]
    column_mapping = {df.columns[i]: name for i, name in enumerate(positional_names)}

    # Rename and clean up columns
    df = (
        df.rename(column_mapping)
        .drop(['Unk1', 'Unk3'])  # Drop unnecessary columns
        .unique()  # Remove duplicate rows
    )

    # Columns that identify one annotated feature; used for the pivot and the join back
    feature_keys = ['genome_chrom', 'genome_start', 'genome_end', 'gff_id_info']

    # Parse `gff_id_info` for key-value pairs
    parsed_df = (
        df.with_columns(
            pl.col('gff_id_info').str.split(';').alias('key_value_pairs')  # Split into key-value pairs
        )
        .explode('key_value_pairs')  # One row per key-value pair
        .filter(pl.col('key_value_pairs').str.contains('='))  # Keep valid key-value pairs
        .with_columns(
            pl.col('key_value_pairs').str.split('=').alias('key_value_split')
        )
        .with_columns(
            pl.col('key_value_split').list.get(0).alias('key'),  # Extract keys
            pl.col('key_value_split').list.get(1).alias('value')  # Extract values
        )
        .pivot(
            values = 'value',
            index = feature_keys,  # Grouping columns
            on = 'key',  # Pivot key-value pairs into columns
            aggregate_function = 'first'  # Handle duplicate keys
        )
        .rename({'Parent': 'transcript_id'})
        .unique()
    )

    # Sample annotations: remove 'extra_copy_number'.
    # strict=False so a sample file without that attribute does not raise.
    if genome_type == 'sample':
        parsed_df = parsed_df.drop(['extra_copy_number'], strict = False)

    # Join the parsed DataFrame back to the original
    return df.join(parsed_df, on = feature_keys, how = 'left')


### Define a function to join the miRanda and bedtools outputs, along with the conversion table

In [65]:
# Create function to join the miRanda and bedtools intersect data with the conversion table
def join_bedtools_miRanda_conversion_table(
    miranda_df: pl.DataFrame, 
    bedtools_df: pl.DataFrame, 
    sample_name: str, 
    conversion_df: pl.DataFrame
) -> pl.DataFrame:
    """
    Left-join the miRanda table to the bedtools intersect table, tag every row with
    the sample name, then left-join the conversion table onto the result.

    Both joins use whichever column names the two tables have in common.

    Parameters:
        miranda_df (pl.DataFrame): Formatted miRanda data (left side of the first join).
        bedtools_df (pl.DataFrame): Formatted bedtools intersect data.
        sample_name (str): Value written to the new `sample_id` column.
        conversion_df (pl.DataFrame): Conversion table to merge additional annotations.

    Returns:
        pl.DataFrame: De-duplicated combination of the three tables.

    Raises:
        ValueError: If either join has no column names in common.
    """
    # Columns present in both the miRanda and bedtools tables
    bedtools_columns = set(bedtools_df.columns)
    join_keys = [name for name in miranda_df.columns if name in bedtools_columns]
    if not join_keys:
        raise ValueError("No shared columns between miranda_df and bedtools_df for joining.")

    # First join, plus the constant sample label
    merged = miranda_df.join(bedtools_df, on = join_keys, how = 'left')
    merged = merged.with_columns(pl.lit(sample_name).alias('sample_id')).unique()

    # Columns present in both the merged table and the conversion table
    conversion_columns = set(conversion_df.columns)
    annotation_keys = [name for name in merged.columns if name in conversion_columns]
    if not annotation_keys:
        raise ValueError("No shared columns between miranda_df_bedtools_df and conversion_df for joining.")

    # Second join: attach the conversion table annotations
    return merged.join(conversion_df, on = annotation_keys, how = 'left').unique()


In [66]:
# Create function to join the miRanda and bedtools intersect data with the conversion table (reference or sample genome)
def join_bedtools_miRanda_conversion_table2(
    miranda_df: pl.DataFrame, 
    bedtools_df: pl.DataFrame, 
    conversion_df: pl.DataFrame, 
    genome_type: str, 
    sample_name: str = None
) -> pl.DataFrame:
    """
    Left-join the miRanda table to the bedtools intersect table, optionally tag the
    rows with a sample name, then left-join the conversion table onto the result.

    Both joins use whichever column names the two tables have in common.

    Parameters:
        miranda_df (pl.DataFrame): Formatted miRanda data (left side of the first join).
        bedtools_df (pl.DataFrame): Formatted bedtools intersect data.
        conversion_df (pl.DataFrame): Conversion table to merge additional annotations.
        genome_type (str): Type of genome, either "reference" or "sample".
        sample_name (str, optional): Value for the `sample_id` column; required (and only
            used) when genome_type is "sample".

    Returns:
        pl.DataFrame: De-duplicated combination of the three tables.

    Raises:
        ValueError: If genome_type is invalid, if sample_name is missing for a sample,
            or if either join has no column names in common.
    """
    # Argument checks
    if genome_type not in {"reference", "sample"}:
        raise ValueError("genome_type must be either 'reference' or 'sample'.")

    is_sample = genome_type == "sample"
    if is_sample and sample_name is None:
        raise ValueError("sample_name is required when genome_type is 'sample'.")

    # Columns present in both the miRanda and bedtools tables
    bedtools_columns = set(bedtools_df.columns)
    join_keys = [name for name in miranda_df.columns if name in bedtools_columns]
    if not join_keys:
        raise ValueError("No shared columns between miranda_df and bedtools_df for joining.")

    merged = miranda_df.join(bedtools_df, on = join_keys, how = 'left')

    # Only sample genomes carry a sample label
    if is_sample:
        merged = merged.with_columns(pl.lit(sample_name).alias('sample_id'))

    merged = merged.unique()

    # Columns present in both the merged table and the conversion table
    conversion_columns = set(conversion_df.columns)
    annotation_keys = [name for name in merged.columns if name in conversion_columns]
    if not annotation_keys:
        raise ValueError("No shared columns between miranda_df_bedtools_df and conversion_df for joining.")

    # Attach the conversion table annotations
    return merged.join(conversion_df, on = annotation_keys, how = 'left').unique()


### Define a function that extracts the SampleID from the file path

In [67]:
def extract_sample_id(file_path: str) -> str:
    """
    Derive a sample identifier from the file name at the end of a path.

    Parameters:
        file_path (str): The full path of the file.

    Returns:
        str: "Crotalus_viridis_reference" for file names starting with 'Crotalus';
        the "CV####_species" portion for file names starting with 'consensus';
        None when neither rule produces an identifier.
    """
    # Only the final path component is inspected
    base_name = Path(file_path).name

    # Reference genome files all share one fixed label
    if base_name.startswith('Crotalus'):
        return "Crotalus_viridis_reference"

    # Anything that is not a consensus (sample) file has no identifier
    if not base_name.startswith('consensus'):
        return None

    # Pull out "CV<digits>_<lowercase species>"
    found = re.search(r'(CV\d+_[a-z]+)', base_name)
    return found.group(1) if found else None

#### Test

In [68]:
# Test: print the sample ID extracted from each sample miRanda tab file path
for path in miranda_tabs:
    result = extract_sample_id(str(path))
    print(result)

CV1087_viridis
CV0985_concolor
CV1082_viridis
CV0857_viridis
CV1082_viridis
CV1087_viridis
CV1086_viridis
CV0985_concolor
CV0857_viridis
CV1081_viridis
CV0857_viridis
CV1087_viridis
CV0987_lutosus
CV1081_viridis
CV0985_concolor
CV0987_lutosus
CV1086_viridis
CV0987_lutosus
CV1081_viridis
CV1086_viridis
CV1082_viridis


In [69]:
# Test: print the sample ID extracted from each reference miRanda tab file path
for path in ref_miranda_tabs:
    result = extract_sample_id(str(path))
    print(result)

Crotalus_viridis_reference
Crotalus_viridis_reference
Crotalus_viridis_reference


In [70]:
# Test: print the sample ID extracted from each sample bedtools intersect file path
for path in bed_files:
    result = extract_sample_id(str(path))
    print(result)

CV1086_viridis
CV1086_viridis
CV1082_viridis
CV1087_viridis
CV0987_lutosus
CV1081_viridis
CV1081_viridis
CV0987_lutosus
CV0987_lutosus
CV1086_viridis
CV1082_viridis
CV1081_viridis
CV0857_viridis
CV0985_concolor
CV0857_viridis
CV1087_viridis
CV1087_viridis
CV0985_concolor
CV0857_viridis
CV1082_viridis
CV0985_concolor


### Define a function that extracts the feature type from a file

In [71]:
def detect_feature_type(file_path) -> str:
    """
    Detect feature type based on file name.

    The file name (last path component) is checked first, so a directory whose name
    happens to contain a feature token cannot override the file's own feature type.
    If the file name contains no token, the full path is checked as a fallback.

    Parameters:
        file_path: Path to the file, as a string or path-like object.

    Returns:
        str: 'three_prime_utr', 'five_prime_utr', or 'CDS'.

    Raises:
        ValueError: If none of the feature tokens appears in the path.
    """
    file_path_str = str(file_path)

    # Tokens in priority order
    feature_tokens = ('three_prime_utr', 'five_prime_utr', 'CDS')

    # File name first, then the whole path as a fallback
    for candidate in (Path(file_path_str).name, file_path_str):
        for token in feature_tokens:
            if token in candidate:
                return token

    raise ValueError(f"Cannot determine feature type from file path: {file_path_str}")

#### Test

In [72]:
# Test: print the feature type detected for each sample miRanda tab file path
for path in miranda_tabs:
    result = detect_feature_type(str(path))
    print(result)

five_prime_utr
CDS
three_prime_utr
three_prime_utr
CDS
CDS
five_prime_utr
three_prime_utr
CDS
five_prime_utr
five_prime_utr
three_prime_utr
three_prime_utr
three_prime_utr
five_prime_utr
five_prime_utr
CDS
CDS
CDS
three_prime_utr
five_prime_utr


### Define a function that processes the files for the three feature types and concatenates the output

In [73]:
# Define a function that takes file paths and outputs a concatenated DataFrame
def proccess_bedtools_and_miRanda(
        miranda_paths: list,
        bedtools_paths: list,
        converion_dataframe: pl.DataFrame,
        sample_name: str
) -> pl.DataFrame:
    """
    Pair each miRanda file of the requested sample with the bedtools intersect file(s)
    of the same sample and feature type (three_prime_utr, five_prime_utr, or CDS),
    join every pair together with the conversion table, and stack the results.

    Parameters:
        miranda_paths (list): Paths to miRanda tabular outputs (any samples).
        bedtools_paths (list): Paths to bedtools intersect outputs (any samples).
        converion_dataframe (pl.DataFrame): Conversion table to merge additional annotations.
        sample_name (str): Sample to process; files belonging to other samples are skipped.

    Returns:
        pl.DataFrame: Concatenation of the joined tables for every matching file pair,
        or an empty DataFrame when nothing matched.
    """
    joined_frames = []

    for miranda_path in miranda_paths:
        # Sample and feature type are both encoded in the file name
        miranda_sample = extract_sample_id(miranda_path)
        miranda_feature = detect_feature_type(miranda_path)

        # Skip miRanda files that belong to another sample
        if sample_name != miranda_sample:
            continue

        for bedtools_path in bedtools_paths:
            bedtools_sample = extract_sample_id(bedtools_path)
            bedtools_feature = detect_feature_type(bedtools_path)

            # Only pair files from the same sample and the same feature type
            if sample_name != bedtools_sample or miranda_feature != bedtools_feature:
                continue

            # Load and format the miRanda table (file has a header row)
            miranda_table = format_miRanda_data(
                pl.read_csv(miranda_path, separator = '\t', has_header = True)
            )

            # Load and format the bedtools intersect table (file has no header row)
            bedtools_table = format_bedtools_intersect_data(
                pl.read_csv(bedtools_path, separator = '\t', has_header = False)
            )

            # Fuse the two tables with the conversion table
            fused = join_bedtools_miRanda_conversion_table(
                miranda_df = miranda_table,
                bedtools_df = bedtools_table,
                conversion_df = converion_dataframe,
                sample_name = sample_name
            )

            # Remove the repeated genome coordinates and name the strand after the target
            fused = fused.drop(['genome_chrom', 'genome_start', 'genome_end'])
            fused = fused.rename({'genome_strandedness': 'miRNA_target_strandedness'})

            joined_frames.append(fused)

    # Nothing matched: hand back an empty frame
    if not joined_frames:
        return pl.DataFrame(schema=[])

    return pl.concat(joined_frames)


In [74]:
# Define a function that takes file paths and outputs a concatenated DataFrame
def proccess_bedtools_and_miRanda2(
        miranda_paths: list,
        bedtools_paths: list,
        converion_dataframe: pl.DataFrame,
        genome_type: str,
        sample_name: str = None
) -> pl.DataFrame:
    """
    Pair miRanda files with bedtools intersect files of the same feature type
    (three_prime_utr, five_prime_utr, or CDS), join every pair together with the
    conversion table via join_bedtools_miRanda_conversion_table2(), and stack the results.

    For genome_type "sample", only files whose extracted sample ID equals `sample_name`
    are paired. For genome_type "reference", files are paired on feature type alone.

    Parameters:
        miranda_paths (list): Paths to miRanda tabular outputs.
        bedtools_paths (list): Paths to bedtools intersect outputs.
        converion_dataframe (pl.DataFrame): Conversion table to merge additional annotations.
        genome_type (str): Type of genome, either "reference" or "sample".
        sample_name (str, optional): Sample name for the result, required if genome_type is "sample".

    Returns:
        pl.DataFrame: Concatenation of the joined tables for every matching file pair,
        or an empty DataFrame when nothing matched.

    Raises:
        ValueError: If genome_type is invalid, or sample_name is missing for a sample.
    """
    # Validate genome_type
    if genome_type not in {"reference", "sample"}:
        raise ValueError("genome_type must be either 'reference' or 'sample'.")

    # Validate sample_name if genome_type is "sample"
    is_sample = genome_type == "sample"
    if is_sample and sample_name is None:
        raise ValueError("sample_name is required when genome_type is 'sample'.")

    # Create a results variable to hold the resulting DataFrames
    results = []

    for miranda_path in miranda_paths:
        # Sample and feature type are both encoded in the file name
        miranda_sample = extract_sample_id(miranda_path)
        miranda_feature = detect_feature_type(miranda_path)

        # Sample mode: skip miRanda files that belong to another sample
        if is_sample and sample_name != miranda_sample:
            continue

        for bedtools_path in bedtools_paths:
            bedtools_feature = detect_feature_type(bedtools_path)

            # Sample mode: skip bedtools files that belong to another sample
            if is_sample and sample_name != extract_sample_id(bedtools_path):
                continue

            # Only pair files describing the same feature type
            if miranda_feature != bedtools_feature:
                continue

            results.append(
                _join_miranda_bedtools_pair(
                    miranda_path,
                    bedtools_path,
                    converion_dataframe,
                    genome_type,
                    # The reference join takes no sample label
                    sample_name if is_sample else None,
                )
            )

    # Concatenate the resulting DataFrames
    return pl.concat(results) if results else pl.DataFrame(schema=[])


def _join_miranda_bedtools_pair(miranda_path, bedtools_path, conversion_df, genome_type, sample_name):
    """Read, format, and join one miRanda/bedtools file pair with the conversion table."""
    # miRanda tab files have a header row
    miranda_df = format_miRanda_data(
        pl.read_csv(miranda_path, separator = '\t', has_header = True)
    )

    # bedtools intersect files have no header row
    bedtools_df = format_bedtools_intersect_data2(
        pl.read_csv(bedtools_path, separator = '\t', has_header = False),
        genome_type = genome_type
    )

    # Fuse the DataFrames together
    joined = join_bedtools_miRanda_conversion_table2(
        miranda_df = miranda_df,
        bedtools_df = bedtools_df,
        conversion_df = conversion_df,
        genome_type = genome_type,
        sample_name = sample_name
    )

    # Drop the repeated genome coordinates and name the strand after the target
    return (
        joined
        .drop(['genome_chrom', 'genome_start', 'genome_end'])
        .rename({'genome_strandedness': 'miRNA_target_strandedness'})
    )


## Conversion Data
This data is needed to fuse all four datasets (miRNA count/sequence data, miRNA target data, mRNA data, and protein data) together.

### Read in the Conversion table and reference GTF

In [75]:
# Read in the conversion table, using the path defined once in the set-up cell
conversion_table = (
    pl.read_csv(
        conversion_table_data,
        separator = "\t",
        has_header = True,
    )
    # Rename converted_id_no_dups to genes
    .rename({'converted_id_no_dups': 'genes'})
    # Filter out fgenesh genes
    .filter(~pl.col('gtf_gene').str.contains('fgenesh'))
)
# NOTE: liftoff and gffread didn't keep much of the final column of the GTF,
# so this table on its own has no column to fuse on. The reference GTF read in
# below supplies one.
conversion_table

gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,str,str,str
"""myotoxin1""","""myotoxin1""","""crovir-transcript-myotoxin""","""Venom_myotoxin""","""Venom_myotoxin"""
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1688""","""MAP4""","""MAP4"""
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1686""","""ZNF282""","""ZNF282"""
"""augustus_masked-scaffold-Z-pro…","""augustus_masked-scaffold-Z-pro…","""crovir-transcript-1684""","""ZNF267""","""ZNF267"""
"""augustus_masked-scaffold-Z-pro…","""augustus_masked-scaffold-Z-pro…","""crovir-transcript-1685""","""SETD2""","""SETD2"""
…,…,…,…,…
"""augustus_masked-scaffold-un648…","""augustus_masked-scaffold-un648…","""NA""","""augustus_masked-scaffold-un648…","""augustus_masked-scaffold-un648…"
"""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…","""NA""","""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…"
"""augustus_masked-scaffold-un703…","""augustus_masked-scaffold-un703…","""NA""","""augustus_masked-scaffold-un703…","""augustus_masked-scaffold-un703…"
"""myotoxin3""","""myotoxin3""","""NA""","""myotoxin3""","""myotoxin3"""


In [76]:
# Pull gene/transcript ID pairs out of the old reference GTF so the conversion
# table gains a column that can actually be used for joining
ref_gtf = pl.read_csv(
    reference_gtf,
    separator = '\t',
    comment_prefix = '#',
    has_header = False,
    new_columns = [
        'seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'
    ]
)
ref_gtf = (
    ref_gtf
    # Gene features don't have transcript IDs, so leave them out
    .filter(pl.col('type') != 'gene')
    # Keep only the two IDs parsed from the attributes column; gene_id is
    # stored as gtf_gene so it matches the conversion table's column name
    .select(
        pl.col('attributes').str.extract(r'gene_id "([^"]+)"').alias('gtf_gene'),
        pl.col('attributes').str.extract(r'transcript_id "([^"]+)"').alias('transcript_id')
    )
    # One row per distinct gene/transcript pair
    .unique()
)
ref_gtf

gtf_gene,transcript_id
str,str
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…"
"""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…"
"""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…"
"""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…"
"""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…"
…,…
"""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…"
"""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…"
"""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…"
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…"


In [77]:
# Left-join the GTF-derived IDs onto the conversion table (matching on gtf_gene)
# so that it gains a transcript_id column to fuse on
conversion_table = (
    conversion_table
    .join(ref_gtf, on = 'gtf_gene', how = 'left')
)
conversion_table

gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,transcript_id
str,str,str,str,str,str
"""myotoxin1""","""myotoxin1""","""crovir-transcript-myotoxin""","""Venom_myotoxin""","""Venom_myotoxin""","""myotoxin_model_1"""
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1688""","""MAP4""","""MAP4""","""maker-scaffold-Z-augustus-gene…"
"""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1686""","""ZNF282""","""ZNF282""","""maker-scaffold-Z-augustus-gene…"
"""augustus_masked-scaffold-Z-pro…","""augustus_masked-scaffold-Z-pro…","""crovir-transcript-1684""","""ZNF267""","""ZNF267""","""augustus_masked-scaffold-Z-pro…"
"""augustus_masked-scaffold-Z-pro…","""augustus_masked-scaffold-Z-pro…","""crovir-transcript-1685""","""SETD2""","""SETD2""","""augustus_masked-scaffold-Z-pro…"
…,…,…,…,…,…
"""augustus_masked-scaffold-un648…","""augustus_masked-scaffold-un648…","""NA""","""augustus_masked-scaffold-un648…","""augustus_masked-scaffold-un648…","""augustus_masked-scaffold-un648…"
"""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…","""NA""","""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…","""augustus_masked-scaffold-un663…"
"""augustus_masked-scaffold-un703…","""augustus_masked-scaffold-un703…","""NA""","""augustus_masked-scaffold-un703…","""augustus_masked-scaffold-un703…","""augustus_masked-scaffold-un703…"
"""myotoxin3""","""myotoxin3""","""NA""","""myotoxin3""","""myotoxin3""","""myotoxin_model_3"""


## miRNA Count and Sequence data

### Get miRNA count data

In [78]:
# Load the ShortStack counts table, naming the per-sample columns by their CV IDs
miRNA_counts_df = pl.read_csv(
    miRNA_counts,
    separator = '\t',
    new_columns = [
        'miRNA_sequence', 'miRNA_cluster_original', 'miRNA_Yes_or_No',
        'CV1081_viridis', 'CV0857_viridis', 'CV1086_viridis', 'CV1082_viridis',
        'CV1087_viridis', 'CV0987_lutosus', 'CV0985_concolor'
    ]
)

# Reshape to long format: one row per (cluster, sample) with its count
miRNA_counts_df = (
    miRNA_counts_df
    # Keep only the rows flagged as miRNAs, then discard the flag column
    .filter(pl.col('miRNA_Yes_or_No') == 'Y')
    .drop('miRNA_Yes_or_No')
    # De-duplicate while the sequence column is still present
    .unique()
    .unpivot(
        index = ['miRNA_sequence', 'miRNA_cluster_original'],
        variable_name = 'sample_id',
        value_name = 'miRNA_counts'
    )
    # The sequence data is not necessary past this point
    .drop('miRNA_sequence')
)
miRNA_counts_df

miRNA_cluster_original,sample_id,miRNA_counts
str,str,i64
"""Cluster_322""","""CV1081_viridis""",3873
"""Cluster_667""","""CV1081_viridis""",2
"""Cluster_924""","""CV1081_viridis""",34
"""Cluster_186""","""CV1081_viridis""",13
"""Cluster_987""","""CV1081_viridis""",3393
…,…,…
"""Cluster_30""","""CV0985_concolor""",58084
"""Cluster_1854""","""CV0985_concolor""",321
"""Cluster_653""","""CV0985_concolor""",140867
"""Cluster_450""","""CV0985_concolor""",36423


In [79]:
# # Fuse the counts data to the main data frame
# counts_miranda_bedtools_df = converted_miranda_bedtools_df.join(miRNA_counts_df, on= 'miRNA_cluster_original', how= 'left')
# counts_miranda_bedtools_df

### Process the BLAST hits and FASTA sequences

In [80]:
# Parse the mature miRNA FASTA file
miRNA_sequences = SeqIO.parse(miRNA_fasta, 'fasta')

# One dictionary per FASTA record, ready to become a Polars row
sequences_data = [
    dict(
        # Keep only the part of the record ID before the first '.'
        miRNA_cluster_original = rec.id.partition('.')[0],
        # Store the sequence as a plain string
        miRNA_sequence = str(rec.seq),
    )
    for rec in miRNA_sequences
]

# Build the Polars dataframe from the list of dictionaries
sequences_df = pl.DataFrame(sequences_data)
sequences_df

miRNA_cluster_original,miRNA_sequence
str,str
"""Cluster_4""","""TCTTTGGTTATCTAGCTGTATG"""
"""Cluster_30""","""GTGAAATGTTTAGGACCACTTG"""
"""Cluster_83""","""CTTTTTGCGGTCTGGGCTTGCT"""
"""Cluster_112""","""AGCCACTGACTAACGCACATTG"""
"""Cluster_122""","""CAGTGCAATGTAAAAAGGGCAT"""
…,…
"""Cluster_1858""","""CGATCTTGTGCTCCTGTTCATC"""
"""Cluster_1863""","""CTAGCACCATTTGAAATCGGTT"""
"""Cluster_1864""","""TAGCACCATTTGAAATCAGTGTT"""
"""Cluster_1888""","""TCGTACCGTGAGTAATAATGCG"""


In [81]:
# Parse the hairpin FASTA file
hairpin_sequences = SeqIO.parse(hairpin_fasta, 'fasta')

# One dictionary per FASTA record, ready to become a Polars row
hairpin_data = [
    dict(
        # Keep only the part of the record ID before the first '.'
        miRNA_cluster_original = rec.id.partition('.')[0],
        # Store the sequence as a plain string
        hairpin_sequence = str(rec.seq),
    )
    for rec in hairpin_sequences
]

# Build the Polars dataframe, then keep only the text before any '::' in the cluster name
hairpin_df = pl.DataFrame(hairpin_data)
hairpin_df = hairpin_df.with_columns(
    pl.col('miRNA_cluster_original').str.split('::').list.get(0)
)
hairpin_df

miRNA_cluster_original,hairpin_sequence
str,str
"""Cluster_4""","""ATCCACAGGGCCGGTTTTTCTCTTTGGTTA…"
"""Cluster_30""","""TCTCTCAGCCTGCTGGGTGCAGTGGTTCTT…"
"""Cluster_83""","""AGACGTTGTCCTTCGCGAATCTTTTTGCGG…"
"""Cluster_112""","""GTTAATTCTCCAGAAGCAGGTGAGCCACTG…"
"""Cluster_122""","""GGGCACGGCCCCTGCCCGAGGCTCTTTTCA…"
…,…
"""Cluster_1858""","""CGGGAAGCAAGCAAGCGAGATGGGCAGGGG…"
"""Cluster_1863""","""GTGACCCCTTTAAAGGATAACTGATTTCTT…"
"""Cluster_1864""","""GTCTGGAGCTTCTTCAGGAATCTGGTTTCA…"
"""Cluster_1888""","""AGCCCGGGCTGACCGCGGCTCATTATTACT…"


In [82]:
# Attach each cluster's hairpin sequence to its mature sequence (left join on the cluster name)
sequences_df = sequences_df.join(
    hairpin_df,
    on = 'miRNA_cluster_original',
    how = 'left'
)
sequences_df

miRNA_cluster_original,miRNA_sequence,hairpin_sequence
str,str,str
"""Cluster_4""","""TCTTTGGTTATCTAGCTGTATG""","""ATCCACAGGGCCGGTTTTTCTCTTTGGTTA…"
"""Cluster_30""","""GTGAAATGTTTAGGACCACTTG""","""TCTCTCAGCCTGCTGGGTGCAGTGGTTCTT…"
"""Cluster_83""","""CTTTTTGCGGTCTGGGCTTGCT""","""AGACGTTGTCCTTCGCGAATCTTTTTGCGG…"
"""Cluster_112""","""AGCCACTGACTAACGCACATTG""","""GTTAATTCTCCAGAAGCAGGTGAGCCACTG…"
"""Cluster_122""","""CAGTGCAATGTAAAAAGGGCAT""","""GGGCACGGCCCCTGCCCGAGGCTCTTTTCA…"
…,…,…
"""Cluster_1858""","""CGATCTTGTGCTCCTGTTCATC""","""CGGGAAGCAAGCAAGCGAGATGGGCAGGGG…"
"""Cluster_1863""","""CTAGCACCATTTGAAATCGGTT""","""GTGACCCCTTTAAAGGATAACTGATTTCTT…"
"""Cluster_1864""","""TAGCACCATTTGAAATCAGTGTT""","""GTCTGGAGCTTCTTCAGGAATCTGGTTTCA…"
"""Cluster_1888""","""TCGTACCGTGAGTAATAATGCG""","""AGCCCGGGCTGACCGCGGCTCATTATTACT…"


In [83]:
# Load the BLAST alignment table and switch to this notebook's column names
blast_df = (
    pl.read_csv(blast_data, separator = '\t')
    .rename({
        'Query': 'miRNA_cluster_original',
        'Best.miRNA.Blast.Hits': 'best_miRNA_ortholog',
        'Base.Putative.miRNA.Name': 'base_miRNA_name',
        'miRNA.Identity.Type': 'miRNA_name_probability',
        'miRNA.Length': 'miRNA_length',
        'Blast.Percent.Identity': 'blast_percent_identity',
        'Blast.Alignment.Length': 'blast_alignment_length',
        'Bit.Score': 'bit_score',
    })
)

# Add the putative miRNA name: De-Novo hits keep their original cluster name,
# everything else becomes 'cvi-' followed by the base miRNA name
blast_df = blast_df.with_columns(
    pl.when(pl.col('miRNA_name_probability') == 'De-Novo')
    .then(pl.col('miRNA_cluster_original'))
    .otherwise(pl.lit('cvi-') + pl.col('base_miRNA_name').fill_null(''))
    .alias('miRNA_cluster')
)

# Bring in the mature/hairpin sequences and keep only the columns needed downstream
blast_df = (
    blast_df
    .join(sequences_df, on = 'miRNA_cluster_original', how = 'left')
    .select([
        'miRNA_cluster', 'miRNA_cluster_original', 'base_miRNA_name', 'best_miRNA_ortholog',
        'miRNA_name_probability', 'blast_percent_identity', 'E.value', 'bit_score',
        'miRNA_length', 'miRNA_sequence', 'hairpin_sequence'
    ])
)
blast_df

miRNA_cluster,miRNA_cluster_original,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_length,miRNA_sequence,hairpin_sequence
str,str,str,str,str,f64,f64,f64,i64,str,str
"""cvi-miR-146a-3p""","""Cluster_1062""","""miR-146a-3p""","""aca-miR-146a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""GCCCCGTGGATTCAGTTCTACA""","""TACATGGCTGGCTTAGCTCTGAGAACTGAA…"
"""cvi-miR-15a-5p""","""Cluster_1084""","""miR-15a-5p""","""pbv-miR-15a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TAGCAGCACATCATGATTTGTG""","""TTCTGGAAGCCTCAGAGTACTATAGCAGCA…"
"""cvi-miR-737-5p""","""Cluster_1105""","""miR-737-5p""","""aca-miR-737-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""GTTTTTTTAGGTTTTGATTTTT""","""ATATCTGCTACTCTGCTGTTGTTTTTTTAG…"
"""cvi-miR-210-5p""","""Cluster_112""","""miR-210-5p""","""pbv-miR-210-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""AGCCACTGACTAACGCACATTG""","""GTTAATTCTCCAGAAGCAGGTGAGCCACTG…"
"""cvi-miR-135-5p""","""Cluster_1135""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TATGGCTTTTTATTCCTATGTG""","""AGATAAATTCACTCTAGTGTTTTATGGCTT…"
…,…,…,…,…,…,…,…,…,…,…
"""cvi-miR-129b-5p""","""Cluster_1339""","""miR-129b-5p""","""oha-miR-129b-5p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTTTTTGCGGTCTGGGCTTTCT""","""CAATGACAGGTCTTCTCTGGGTCTTTTTGC…"
"""cvi-miR-200b-3p""","""Cluster_1692""","""miR-200b-3p""","""pbv-miR-200b-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""TAATACTGCCTGGTAAAGATGT""","""CATTCTGGGAAGCCATCGGCATCTTACTGG…"
"""cvi-miR-29c-3p""","""Cluster_1863""","""miR-29c-3p""","""xla-miR-29c-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTAGCACCATTTGAAATCGGTT""","""GTGACCCCTTTAAAGGATAACTGATTTCTT…"
"""cvi-let-7f-2-3p""","""Cluster_856""","""let-7f-2-3p""","""oha-let-7f-2-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTATACAATCTATTGCCTTCCT""","""TTTGTGTTGCTTTGTGGAGGTGAGGTAGTA…"


#### Get Non-Unique BLAST hits so you can fix the miRNA naming scheme
Some clusters are assigned the same miRNA name despite having different sequences. This fixes that by appending a dash and a number to the duplicated names.

In [84]:
# Convert the polars DataFrame to pandas; the duplicate-name fix in the next
# cell is written with a pandas groupby
blast_df = blast_df.to_pandas()
blast_df

Unnamed: 0,miRNA_cluster,miRNA_cluster_original,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_length,miRNA_sequence,hairpin_sequence
0,cvi-miR-146a-3p,Cluster_1062,miR-146a-3p,aca-miR-146a-3p,Very-Probable-Identity,100.0,3.080000e-07,44.1,22,GCCCCGTGGATTCAGTTCTACA,TACATGGCTGGCTTAGCTCTGAGAACTGAATTCCATAGGCTTTAGA...
1,cvi-miR-15a-5p,Cluster_1084,miR-15a-5p,pbv-miR-15a-5p,Very-Probable-Identity,100.0,3.080000e-07,44.1,22,TAGCAGCACATCATGATTTGTG,TTCTGGAAGCCTCAGAGTACTATAGCAGCACATCATGATTTGTGTT...
2,cvi-miR-737-5p,Cluster_1105,miR-737-5p,aca-miR-737-5p,Very-Probable-Identity,100.0,3.080000e-07,44.1,22,GTTTTTTTAGGTTTTGATTTTT,ATATCTGCTACTCTGCTGTTGTTTTTTTAGGTTTTGATTTTTATGC...
3,cvi-miR-210-5p,Cluster_112,miR-210-5p,pbv-miR-210-5p,Very-Probable-Identity,100.0,3.080000e-07,44.1,22,AGCCACTGACTAACGCACATTG,GTTAATTCTCCAGAAGCAGGTGAGCCACTGACTAACGCACATTGTG...
4,cvi-miR-135-5p,Cluster_1135,miR-135-5p,oha-miR-135-5p,Very-Probable-Identity,100.0,3.080000e-07,44.1,22,TATGGCTTTTTATTCCTATGTG,AGATAAATTCACTCTAGTGTTTTATGGCTTTTTATTCCTATGTGAT...
...,...,...,...,...,...,...,...,...,...,...,...
118,cvi-miR-129b-5p,Cluster_1339,miR-129b-5p,oha-miR-129b-5p,Potential-Identity,100.0,1.220000e-06,42.1,22,CTTTTTGCGGTCTGGGCTTTCT,CAATGACAGGTCTTCTCTGGGTCTTTTTGCGGTCTGGGCTTTCTGG...
119,cvi-miR-200b-3p,Cluster_1692,miR-200b-3p,pbv-miR-200b-3p,Potential-Identity,100.0,1.220000e-06,42.1,22,TAATACTGCCTGGTAAAGATGT,CATTCTGGGAAGCCATCGGCATCTTACTGGGCAGCGTTGGATGTTT...
120,cvi-miR-29c-3p,Cluster_1863,miR-29c-3p,xla-miR-29c-3p,Potential-Identity,100.0,1.220000e-06,42.1,22,CTAGCACCATTTGAAATCGGTT,GTGACCCCTTTAAAGGATAACTGATTTCTTCTGGTGTTCGGAGTCT...
121,cvi-let-7f-2-3p,Cluster_856,let-7f-2-3p,oha-let-7f-2-3p,Potential-Identity,100.0,1.220000e-06,42.1,22,CTATACAATCTATTGCCTTCCT,TTTGTGTTGCTTTGTGGAGGTGAGGTAGTAGGTTGTATAGTTTGTG...


In [85]:
# blast_df is expected to be a pandas DataFrame at this point.
# Within each group of rows sharing a miRNA_cluster name, count the rows
# downwards (cumcount with ascending=False), so the last row of a group gets 0.
# Rows with a count above 0 get '-<count>' appended to their name; the row with
# count 0 keeps the bare name.
blast_df['miRNA_cluster'] = [
    f"{name}-{rank}" if rank > 0 else name
    for name, rank in zip(
        blast_df['miRNA_cluster'],
        blast_df.groupby('miRNA_cluster').cumcount(ascending=False),
    )
]

# Hand the result back to polars
blast_df = pl.from_pandas(blast_df)
blast_df

miRNA_cluster,miRNA_cluster_original,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_length,miRNA_sequence,hairpin_sequence
str,str,str,str,str,f64,f64,f64,i64,str,str
"""cvi-miR-146a-3p""","""Cluster_1062""","""miR-146a-3p""","""aca-miR-146a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""GCCCCGTGGATTCAGTTCTACA""","""TACATGGCTGGCTTAGCTCTGAGAACTGAA…"
"""cvi-miR-15a-5p""","""Cluster_1084""","""miR-15a-5p""","""pbv-miR-15a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TAGCAGCACATCATGATTTGTG""","""TTCTGGAAGCCTCAGAGTACTATAGCAGCA…"
"""cvi-miR-737-5p""","""Cluster_1105""","""miR-737-5p""","""aca-miR-737-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""GTTTTTTTAGGTTTTGATTTTT""","""ATATCTGCTACTCTGCTGTTGTTTTTTTAG…"
"""cvi-miR-210-5p""","""Cluster_112""","""miR-210-5p""","""pbv-miR-210-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""AGCCACTGACTAACGCACATTG""","""GTTAATTCTCCAGAAGCAGGTGAGCCACTG…"
"""cvi-miR-135-5p-2""","""Cluster_1135""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TATGGCTTTTTATTCCTATGTG""","""AGATAAATTCACTCTAGTGTTTTATGGCTT…"
…,…,…,…,…,…,…,…,…,…,…
"""cvi-miR-129b-5p""","""Cluster_1339""","""miR-129b-5p""","""oha-miR-129b-5p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTTTTTGCGGTCTGGGCTTTCT""","""CAATGACAGGTCTTCTCTGGGTCTTTTTGC…"
"""cvi-miR-200b-3p""","""Cluster_1692""","""miR-200b-3p""","""pbv-miR-200b-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""TAATACTGCCTGGTAAAGATGT""","""CATTCTGGGAAGCCATCGGCATCTTACTGG…"
"""cvi-miR-29c-3p-1""","""Cluster_1863""","""miR-29c-3p""","""xla-miR-29c-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTAGCACCATTTGAAATCGGTT""","""GTGACCCCTTTAAAGGATAACTGATTTCTT…"
"""cvi-let-7f-2-3p""","""Cluster_856""","""let-7f-2-3p""","""oha-let-7f-2-3p""","""Potential-Identity""",100.0,0.000001,42.1,22,"""CTATACAATCTATTGCCTTCCT""","""TTTGTGTTGCTTTGTGGAGGTGAGGTAGTA…"


### Fuse miRNA count data to miRNA sequence and blast data
This should give me all of the miRNA information other than the target data

In [86]:
# Attach the BLAST naming and sequence columns to every per-sample count row
# (left join, so every count row is kept)
miRNA_info_df = (
    miRNA_counts_df
    .join(blast_df, on = 'miRNA_cluster_original', how = 'left')
)
miRNA_info_df

miRNA_cluster_original,sample_id,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_length,miRNA_sequence,hairpin_sequence
str,str,i64,str,str,str,str,f64,f64,f64,i64,str,str
"""Cluster_322""","""CV1081_viridis""",3873,"""cvi-miR-194-5p""","""miR-194-5p""","""oha-miR-194-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TGTAACAGCAACTCCATGTGGA""","""GCTCTTAAACAGTGTTATCAAGTGTAACAG…"
"""Cluster_667""","""CV1081_viridis""",2,"""cvi-miR-9-3p""","""miR-9-3p""","""oha-miR-9-3p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,23,"""TCTTTGGTTATCTAGCTGTATGA""","""ATATGAGGGAAGCGAGTTGTTATCTTTGGT…"
"""Cluster_924""","""CV1081_viridis""",34,"""cvi-miR-18a-3p""","""miR-18a-3p""","""oha-miR-18a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""ACTGCCCTAAGTGCTCCTTCTG""","""ACAATAACTGCTTTTTGTTCTAAGGTGCAT…"
"""Cluster_186""","""CV1081_viridis""",13,"""Cluster_186""","""miR-5007-5p""","""hsa-miR-5007-5p""","""De-Novo""",100.0,4.5,20.3,22,"""TGAACATGGACTATCAGCCAGC""","""CCACTGACACAACTGCAGCCAGTGAACATG…"
"""Cluster_987""","""CV1081_viridis""",3393,"""cvi-miR-133a-3p""","""miR-133a-3p""","""pbv-miR-133a-3p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,23,"""TTTGGTCCCCTTCAACCAGCTGT""","""GGCCTAGAATGCTTTGCTAAAGCTGGTAAA…"
…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_30""","""CV0985_concolor""",58084,"""cvi-miR-203-3p""","""miR-203-3p""","""oha-miR-203-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""GTGAAATGTTTAGGACCACTTG""","""TCTCTCAGCCTGCTGGGTGCAGTGGTTCTT…"
"""Cluster_1854""","""CV0985_concolor""",321,"""cvi-miR-194-5p-1""","""miR-194-5p""","""xla-miR-194-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,23,"""TGTAACAGCAACTCCATGTGGAA""","""ACCGAAGGGGGTGTCTATCGACTGTAACAG…"
"""Cluster_653""","""CV0985_concolor""",140867,"""cvi-miR-27b-5p""","""miR-27b-5p""","""oha-miR-27b-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""AGAGCTTAGCTGATTGGTGAAC""","""GGAGACCTCTCTAGTGAGGTGCAGAGCTTA…"
"""Cluster_450""","""CV0985_concolor""",36423,"""cvi-let-7g-5p""","""let-7g-5p""","""oha-let-7g-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,22,"""TGAGGTAGTAGTTTGTACAGTT""","""CACAGTGCCTGGTTCCAGGCTGAGGTAGTA…"


## Format miRNA target data for each sample

### Use the formatting function to get sample DataFrames

#### CV0857

In [87]:
# Build the fused miRanda/bedtools table for sample CV0857_viridis
CV0857_df = proccess_bedtools_and_miRanda(
    sample_name = 'CV0857_viridis',
    miranda_paths = miranda_tabs,
    bedtools_paths = bed_files,
    converion_dataframe = conversion_table
)
CV0857_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_30""","""scaffold-ma1""",23145585,23145607,"""-""",22,"""scaffold-ma3""",107157529,107157963,434,155.0,-16.21,155.0,-16.21,15651,""" 100""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_6054;Parent…","""three_prime_utr_6054""","""maker-scaffold-ma3-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5794""","""DPYD""","""DPYD.1"""
"""Cluster_855""","""scaffold-ma3""",139201720,139201742,"""+""",22,"""scaffold-ma2""",90047540,90049975,2435,141.0,-8.99,141.0,-8.99,568478,""" 1454""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_4261;Parent…","""three_prime_utr_4261""","""maker-scaffold-ma2-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14149""","""CYBC1""","""CYBC1"""
"""Cluster_1911""","""scaffold-un619""",5287,5308,"""+""",21,"""scaffold-ma6""",34227211,34227686,475,145.0,-8.45,145.0,-8.45,1237642,""" 242""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_8339;Parent…","""three_prime_utr_8339""","""maker-scaffold-ma6-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11894""","""GLIPR1L1""","""GLIPR1L1"""
"""Cluster_1863""","""scaffold-un31""",31144,31166,"""-""",22,"""scaffold-ma3""",126269313,126271673,2360,149.0,-14.33,149.0,-14.33,1205678,""" 1389""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_6169;Parent…","""three_prime_utr_6169""","""maker-scaffold-ma3-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5983""","""C8B""","""C8B"""
"""Cluster_4""","""scaffold-ma1""",3055559,3055581,"""-""",22,"""scaffold-ma1""",259028816,259031775,2959,289.0,-30.9,146.0,-15.49,2723,""" 159 1655""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_2723;Parent…","""three_prime_utr_2723""","""maker-scaffold-ma1-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9435""","""YIPF4""","""YIPF4"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_1737""","""scaffold-mi7""",12015387,12015410,"""+""",23,"""scaffold-Z""",36545498,36546127,629,144.0,-19.98,144.0,-19.98,778936,""" 11""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_131;Parent=m…","""five_prime_utr_131""","""maker-scaffold-Z-augustus-gene…","""CV0857_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2055""","""HOXA1""","""HOXA1"""
"""Cluster_323""","""scaffold-ma1""",270639454,270639476,"""+""",22,"""scaffold-mi7""",5898112,5898878,766,143.0,-21.36,143.0,-21.36,128101,""" 562""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_6323;Parent=…","""five_prime_utr_6323""","""maker-scaffold-mi7-augustus-ge…","""CV0857_viridis""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1359""","""TUBB2B""","""TUBB2B.1"""
"""Cluster_4""","""scaffold-ma1""",3055559,3055581,"""-""",22,"""scaffold-un721""",5243,5644,401,140.0,-14.95,140.0,-14.95,7133,""" 35""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_7143;Parent=…","""five_prime_utr_7143""","""maker-scaffold-un721-augustus-…","""CV0857_viridis""","""maker-scaffold-un721-augustus-…","""maker-scaffold-un721-augustus-…","""NA""","""maker-scaffold-un721-augustus-…","""maker-scaffold-un721-augustus-…"
"""Cluster_208""","""scaffold-ma1""",181071334,181071356,"""+""",22,"""scaffold-ma3""",167442531,167442693,162,145.0,-22.12,145.0,-22.12,104380,""" 119""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_4702;Parent=…","""five_prime_utr_4702""","""maker-scaffold-ma3-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6577""","""BCAS4""","""BCAS4"""


#### CV0985

In [88]:
# Build the fused miRanda/bedtools table for sample CV0985_concolor
CV0985_df = proccess_bedtools_and_miRanda(
    sample_name = 'CV0985_concolor',
    miranda_paths = miranda_tabs,
    bedtools_paths = bed_files,
    converion_dataframe = conversion_table
)
CV0985_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_30""","""scaffold-ma1""",23145585,23145607,"""-""",22,"""scaffold-Z""",37831839,37831924,85,142.0,-12.97,142.0,-12.97,154721,""" 1""","""Liftoff""","""CDS""","""+""","""ID=CDS_3371;Parent=maker-scaff…","""CDS_3371""","""maker-scaffold-Z-augustus-gene…","""CV0985_concolor""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2069""","""CHN2""","""CHN2.1"""
"""Cluster_1736""","""scaffold-mi7""",12014980,12015002,"""+""",22,"""scaffold-Z""",16618129,16618542,413,141.0,-23.1,141.0,-23.1,16347037,""" 306""","""Liftoff""","""CDS""","""+""","""ID=CDS_1344;Parent=maker-scaff…","""CDS_1344""","""maker-scaffold-Z-augustus-gene…","""CV0985_concolor""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1837""","""MKX""","""MKX"""
"""Cluster_475""","""scaffold-ma2""",74468517,74468538,"""+""",21,"""scaffold-mi4""",8459802,8459888,86,146.0,-10.29,146.0,-10.29,4066045,""" 52""","""Liftoff""","""CDS""","""-""","""ID=CDS_109632;Parent=maker-sca…","""CDS_109632""","""maker-scaffold-mi4-augustus-ge…","""CV0985_concolor""","""maker-scaffold-mi4-augustus-ge…","""maker-scaffold-mi4-augustus-ge…","""crovir-transcript-131""","""MPP1""","""MPP1"""
"""Cluster_83""","""scaffold-ma1""",65768523,65768545,"""-""",22,"""scaffold-ma1""",147716783,147716906,123,154.0,-14.08,154.0,-14.08,331999,""" 33""","""Liftoff""","""CDS""","""+""","""ID=CDS_29313;Parent=augustus_m…","""CDS_29313""","""augustus_masked-scaffold-ma1-p…","""CV0985_concolor""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-8396""","""FBN1""","""FBN1"""
"""Cluster_867""","""scaffold-ma3""",146886226,146886250,"""-""",24,"""scaffold-ma3""",141101916,141102004,88,140.0,-13.95,140.0,-13.95,9166899,""" 61""","""Liftoff""","""CDS""","""-""","""ID=CDS_92725;Parent=maker-scaf…","""CDS_92725""","""maker-scaffold-ma3-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6227""","""TAF11""","""TAF11"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_182""","""scaffold-ma1""",160506211,160506233,"""-""",22,"""scaffold-ma3""",112704842,112705471,629,144.0,-20.46,144.0,-20.46,75450,""" 451""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_4342;Parent=…","""five_prime_utr_4342""","""maker-scaffold-ma3-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5848""","""SH3GLB1""","""SH3GLB1"""
"""Cluster_866""","""scaffold-ma3""",146885107,146885130,"""-""",23,"""scaffold-ma3""",57717754,57717845,91,148.0,-18.69,148.0,-18.69,425395,""" 33""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_4133;Parent=…","""five_prime_utr_4133""","""augustus_masked-scaffold-ma3-p…","""CV0985_concolor""","""augustus_masked-scaffold-ma3-p…","""augustus_masked-scaffold-ma3-p…","""crovir-transcript-5349""","""RREB1""","""RREB1"""
"""Cluster_973""","""scaffold-ma4""",67130488,67130510,"""-""",22,"""scaffold-ma4""",57157117,57157240,123,144.0,-17.34,144.0,-17.34,497613,""" 54""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_5008;Parent=…","""five_prime_utr_5008""","""maker-scaffold-ma4-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3615""","""FUNDC1""","""FUNDC1"""
"""Cluster_632""","""scaffold-ma2""",167637184,167637206,"""+""",22,"""scaffold-ma3""",155732655,155732714,59,140.0,-14.84,140.0,-14.84,297269,""" 1""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_4635;Parent=…","""five_prime_utr_4635""","""maker-scaffold-ma3-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6456""","""TRPC4AP""","""TRPC4AP"""


#### CV0987

In [89]:
# Build the fused miRanda/bedtools table for sample CV0987_lutosus
CV0987_df = proccess_bedtools_and_miRanda(
    sample_name = 'CV0987_lutosus',
    miranda_paths = miranda_tabs,
    bedtools_paths = bed_files,
    converion_dataframe = conversion_table
)
CV0987_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_591""","""scaffold-ma2""",136588087,136588109,"""-""",22,"""scaffold-ma2""",192574124,192577382,3258,146.0,-15.78,146.0,-15.78,377781,""" 3049""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_5160;Parent…","""three_prime_utr_5160""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-15573""","""DTWD2""","""DTWD2"""
"""Cluster_476""","""scaffold-ma2""",74469087,74469110,"""+""",23,"""scaffold-ma1""",40694903,40696549,1646,301.0,-31.57,151.0,-16.85,273509,""" 135 1067""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_1241;Parent…","""three_prime_utr_1241""","""maker-scaffold-ma1-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-7186""","""MLH3""","""MLH3"""
"""Cluster_1640""","""scaffold-mi5""",454312,454333,"""-""",21,"""scaffold-ma2""",102652919,102655638,2719,141.0,-18.55,141.0,-18.55,1022404,""" 435""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_4407;Parent…","""three_prime_utr_4407""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14367""","""TTYH2""","""TTYH2"""
"""Cluster_1846""","""scaffold-un11""",475610,475632,"""+""",22,"""scaffold-ma6""",7739061,7740938,1877,143.0,-17.64,143.0,-17.64,1177026,""" 1538""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_8227;Parent…","""three_prime_utr_8227""","""maker-scaffold-ma6-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11694""","""CPNE8""","""CPNE8.1"""
"""Cluster_667""","""scaffold-ma2""",204548839,204548862,"""+""",23,"""scaffold-ma5""",67164203,67165801,1598,294.0,-38.17,154.0,-24.3,481091,""" 1150 1052""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_8088;Parent…","""three_prime_utr_8088""","""maker-scaffold-ma5-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10803""","""TM4SF4""","""TM4SF4"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_1266""","""scaffold-Z""",35887469,35887491,"""-""",22,"""scaffold-ma3""",62992610,62992742,132,145.0,-13.89,145.0,-13.89,12185975,""" 21""","""Liftoff""","""CDS""","""+""","""ID=CDS_84896;Parent=maker-scaf…","""CDS_84896""","""maker-scaffold-ma3-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5406""","""COG6""","""COG6"""
"""Cluster_475""","""scaffold-ma2""",74468517,74468538,"""+""",21,"""scaffold-ma2""",39996247,39997314,1067,143.0,-14.31,143.0,-14.31,3984943,""" 481""","""Liftoff""","""CDS""","""+""","""ID=CDS_56641;Parent=augustus_m…","""CDS_56641""","""augustus_masked-scaffold-ma2-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma2-p…","""augustus_masked-scaffold-ma2-p…","""crovir-transcript-13553""","""WNK2""","""WNK2.1"""
"""Cluster_898""","""scaffold-ma3""",177083078,177083100,"""+""",22,"""scaffold-ma5""",24019274,24020600,1326,141.0,-15.37,141.0,-15.37,9332443,""" 138""","""Liftoff""","""CDS""","""-""","""ID=CDS_115082;Parent=augustus_…","""CDS_115082""","""augustus_masked-scaffold-ma5-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma5-p…","""augustus_masked-scaffold-ma5-p…","""crovir-transcript-10350""","""DRD1""","""DRD1"""
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-ma2""",108647105,108647175,70,150.0,-13.32,150.0,-13.32,3841792,""" 31""","""Liftoff""","""CDS""","""-""","""ID=CDS_64841;Parent=maker-scaf…","""CDS_64841""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14427""","""PA2G4""","""PA2G4"""


#### CV1081

In [90]:
# Run the miRanda/bedtools processing function for sample CV1081_viridis
# and keep the resulting DataFrame under a sample-specific name.
CV1081_df = proccess_bedtools_and_miRanda(
    sample_name='CV1081_viridis',
    miranda_paths=miranda_tabs,
    bedtools_paths=bed_files,
    converion_dataframe=conversion_table,
)
# Show the DataFrame as the cell output
CV1081_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_855""","""scaffold-ma3""",139201720,139201742,"""+""",22,"""scaffold-mi7""",3167969,3169344,1375,171.0,-10.99,171.0,-10.99,406751,""" 771""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_6261;Parent=…","""five_prime_utr_6261""","""maker-scaffold-mi7-augustus-ge…","""CV1081_viridis""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1172""","""TMEM51""","""TMEM51"""
"""Cluster_667""","""scaffold-ma2""",204548839,204548862,"""+""",23,"""scaffold-ma1""",135611785,135612667,882,141.0,-13.89,141.0,-13.89,337197,""" 730""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_1335;Parent=…","""five_prime_utr_1335""","""maker-scaffold-ma1-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8286""","""KLHL41""","""KLHL41"""
"""Cluster_602""","""scaffold-ma2""",140977855,140977877,"""-""",22,"""scaffold-ma4""",37226329,37226502,173,150.0,-18.81,150.0,-18.81,290310,""" 31""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_4939;Parent=…","""five_prime_utr_4939""","""maker-scaffold-ma4-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3464""","""GTPBP6""","""GTPBP6"""
"""Cluster_1084""","""scaffold-ma5""",71877271,71877293,"""+""",22,"""scaffold-ma3""",4691290,4691860,570,140.0,-13.47,140.0,-13.47,539474,""" 549""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_3876;Parent=…","""five_prime_utr_3876""","""augustus_masked-scaffold-ma3-p…","""CV1081_viridis""","""augustus_masked-scaffold-ma3-p…","""augustus_masked-scaffold-ma3-p…","""crovir-transcript-4813""","""GPAA1""","""GPAA1"""
"""Cluster_1326""","""scaffold-Z""",88183891,88183913,"""-""",22,"""scaffold-ma7""",41771531,41772146,615,290.0,-40.19,146.0,-20.45,598815,""" 276 72""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_6608;Parent=…","""five_prime_utr_6608""","""augustus_masked-scaffold-ma7-p…","""CV1081_viridis""","""augustus_masked-scaffold-ma7-p…","""augustus_masked-scaffold-ma7-p…","""crovir-transcript-4624""","""SAP30""","""SAP30"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_1911""","""scaffold-un619""",5287,5308,"""+""",21,"""scaffold-Z""",34005036,34005261,225,146.0,-12.65,146.0,-12.65,18467595,""" 51""","""Liftoff""","""CDS""","""+""","""ID=CDS_3016;Parent=maker-scaff…","""CDS_3016""","""maker-scaffold-Z-augustus-gene…","""CV1081_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2025""","""DNAH11""","""DNAH11"""
"""Cluster_1737""","""scaffold-mi7""",12015387,12015410,"""+""",23,"""scaffold-ma1""",191140056,191140269,213,140.0,-15.71,140.0,-15.71,16530429,""" 86""","""Liftoff""","""CDS""","""-""","""ID=CDS_33402;Parent=maker-scaf…","""CDS_33402""","""maker-scaffold-ma1-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8796""","""RNF144A""","""RNF144A"""
"""Cluster_1385""","""scaffold-mi1""",1787355,1787377,"""-""",22,"""scaffold-Z""",13051361,13051483,122,147.0,-22.58,147.0,-22.58,13319754,""" 24""","""Liftoff""","""CDS""","""+""","""ID=CDS_1041;Parent=maker-scaff…","""CDS_1041""","""maker-scaffold-Z-augustus-gene…","""CV1081_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1811""","""ANK3""","""ANK3"""
"""Cluster_1292""","""scaffold-Z""",57906901,57906923,"""-""",22,"""scaffold-mi10""",5368371,5368494,123,140.0,-16.67,140.0,-16.67,12382836,""" 11""","""Liftoff""","""CDS""","""-""","""ID=CDS_147327;Parent=maker-sca…","""CDS_147327""","""maker-scaffold-mi10-augustus-g…","""CV1081_viridis""","""maker-scaffold-mi10-augustus-g…","""maker-scaffold-mi10-augustus-g…","""crovir-transcript-12630""","""ECM1""","""ECM1"""


#### CV1082

In [91]:
# Run the miRanda/bedtools processing function for sample CV1082_viridis
# and keep the resulting DataFrame under a sample-specific name.
CV1082_df = proccess_bedtools_and_miRanda(
    sample_name='CV1082_viridis',
    miranda_paths=miranda_tabs,
    bedtools_paths=bed_files,
    converion_dataframe=conversion_table,
)
# Show the DataFrame as the cell output
CV1082_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_1503""","""scaffold-mi2""",3110646,3110670,"""-""",24,"""scaffold-ma1""",239703813,239709282,5469,282.0,-26.51,142.0,-15.78,960542,""" 1427 1929""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_2562;Parent…","""three_prime_utr_2562""","""maker-scaffold-ma1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9270""","""LRP11""","""LRP11"""
"""Cluster_925""","""scaffold-ma4""",17844204,17844227,"""+""",23,"""scaffold-ma2""",43842049,43843228,1179,161.0,-16.23,161.0,-16.23,648770,""" 949""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_3881;Parent…","""three_prime_utr_3881""","""maker-scaffold-ma2-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13596""","""ECM2""","""ECM2"""
"""Cluster_865""","""scaffold-ma3""",144999268,144999291,"""-""",23,"""scaffold-mi1""",19287368,19289731,2363,336.0,-37.14,185.0,-22.04,593099,""" 2082 138""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_3345;Parent…","""three_prime_utr_3345""","""maker-scaffold-mi1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-mi1-augustus-ge…","""maker-scaffold-mi1-augustus-ge…","""crovir-transcript-13009""","""TET3""","""TET3"""
"""Cluster_1737""","""scaffold-mi7""",12015387,12015410,"""+""",23,"""scaffold-ma1""",265133426,265136784,3358,151.0,-21.14,151.0,-21.14,1101920,""" 2161""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_2764;Parent…","""three_prime_utr_2764""","""augustus_masked-scaffold-ma1-p…","""CV1082_viridis""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-9528""","""SLC4A1AP""","""SLC4A1AP"""
"""Cluster_122""","""scaffold-ma1""",100437388,100437410,"""-""",22,"""scaffold-Z""",53556048,53556844,796,154.0,-9.33,154.0,-9.33,40646,""" 165""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_310;Parent=…","""three_prime_utr_310""","""maker-scaffold-Z-augustus-gene…","""CV1082_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2219""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_186""","""scaffold-ma1""",162091457,162091479,"""+""",22,"""scaffold-ma1""",172318582,172319687,1105,141.0,-17.84,141.0,-17.84,80102,""" 330""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_1496;Parent=…","""five_prime_utr_1496""","""maker-scaffold-ma1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8634""","""WDR92""","""WDR92"""
"""Cluster_83""","""scaffold-ma1""",65768523,65768545,"""-""",22,"""scaffold-mi7""",1145791,1146156,365,147.0,-10.56,147.0,-10.56,20831,""" 159""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_6225;Parent=…","""five_prime_utr_6225""","""maker-scaffold-mi7-augustus-ge…","""CV1082_viridis""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1057""","""MMP23B""","""MMP23B"""
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-un10""",2884248,2884547,299,143.0,-15.64,143.0,-15.64,185657,""" 58""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_6918;Parent=…","""five_prime_utr_6918""","""maker-scaffold-un10-augustus-g…","""CV1082_viridis""","""maker-scaffold-un10-augustus-g…","""maker-scaffold-un10-augustus-g…","""NA""","""maker-scaffold-un10-augustus-g…","""maker-scaffold-un10-augustus-g…"
"""Cluster_83""","""scaffold-ma1""",65768523,65768545,"""-""",22,"""scaffold-un18""",423149,423215,66,309.0,-30.78,160.0,-21.3,21345,""" 27 5""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_7046;Parent=…","""five_prime_utr_7046""","""maker-scaffold-un18-augustus-g…","""CV1082_viridis""","""maker-scaffold-un18-augustus-g…","""maker-scaffold-un18-augustus-g…","""NA""","""maker-scaffold-un18-augustus-g…","""maker-scaffold-un18-augustus-g…"


#### CV1086

In [92]:
# Run the miRanda/bedtools processing function for sample CV1086_viridis
# and keep the resulting DataFrame under a sample-specific name.
CV1086_df = proccess_bedtools_and_miRanda(
    sample_name='CV1086_viridis',
    miranda_paths=miranda_tabs,
    bedtools_paths=bed_files,
    converion_dataframe=conversion_table,
)
# Show the DataFrame as the cell output
CV1086_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_1595""","""scaffold-mi3""",8030236,8030260,"""+""",24,"""scaffold-Z""",37962600,37963093,493,145.0,-16.29,145.0,-16.29,714641,""" 105""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_141;Parent=m…","""five_prime_utr_141""","""maker-scaffold-Z-augustus-gene…","""CV1086_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2071""","""FKBP14""","""FKBP14"""
"""Cluster_867""","""scaffold-ma3""",146886226,146886250,"""-""",24,"""scaffold-ma2""",134212896,134213338,442,149.0,-19.07,149.0,-19.07,431698,""" 230""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_3350;Parent=…","""five_prime_utr_3350""","""maker-scaffold-ma2-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14860""","""NT5DC2""","""NT5DC2.1"""
"""Cluster_574""","""scaffold-ma2""",132418184,132418206,"""+""",22,"""scaffold-ma6""",23047528,23047651,123,142.0,-17.49,142.0,-17.49,248181,""" 101""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_5885;Parent=…","""five_prime_utr_5885""","""maker-scaffold-ma6-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11799""","""SLC38A2""","""SLC38A2"""
"""Cluster_589""","""scaffold-ma2""",136587568,136587591,"""-""",23,"""scaffold-ma5""",69045272,69045466,194,147.0,-17.24,147.0,-17.24,255188,""" 150""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_5747;Parent=…","""five_prime_utr_5747""","""maker-scaffold-ma5-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10836""","""DHX36""","""DHX36"""
"""Cluster_133""","""scaffold-ma1""",103763992,103764014,"""+""",22,"""scaffold-ma2""",74387238,74387503,265,146.0,-17.26,146.0,-17.26,52598,""" 213""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_2935;Parent=…","""five_prime_utr_2935""","""maker-scaffold-ma2-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13985""","""IL17B""","""IL17B"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_135""","""scaffold-ma1""",104465812,104465834,"""-""",22,"""scaffold-ma3""",155562775,155564697,1922,148.0,-16.94,148.0,-16.94,86696,""" 56""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_6511;Parent…","""three_prime_utr_6511""","""maker-scaffold-ma3-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6446""","""GSS""","""GSS"""
"""Cluster_131""","""scaffold-ma1""",103174546,103174568,"""+""",22,"""scaffold-ma7""",6619562,6620442,880,142.0,-12.32,142.0,-12.32,58167,""" 657""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_9084;Parent…","""three_prime_utr_9084""","""maker-scaffold-ma7-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4319""","""PTPN13""","""PTPN13.1"""
"""Cluster_1718""","""scaffold-mi7""",6419427,6419449,"""+""",22,"""scaffold-ma1""",302870753,302871116,363,140.0,-13.25,140.0,-13.25,1072010,""" 334""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_3106;Parent…","""three_prime_utr_3106""","""maker-scaffold-ma1-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9951""","""MYL2""","""MYL2"""
"""Cluster_1760""","""scaffold-mi8""",3843221,3843243,"""+""",22,"""scaffold-ma4""",73306587,73306703,116,141.0,-7.68,141.0,-7.68,1156085,""" 80""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_7212;Parent…","""three_prime_utr_7212""","""maker-scaffold-ma4-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3771""","""ILDR2""","""ILDR2"""


#### CV1087

In [93]:
# Run the miRanda/bedtools processing function for sample CV1087_viridis
# and keep the resulting DataFrame under a sample-specific name.
CV1087_df = proccess_bedtools_and_miRanda(
    sample_name='CV1087_viridis',
    miranda_paths=miranda_tabs,
    bedtools_paths=bed_files,
    converion_dataframe=conversion_table,
)
# Show the DataFrame as the cell output
CV1087_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_324""","""scaffold-ma1""",271893842,271893865,"""+""",23,"""scaffold-mi8""",9622040,9624637,2597,157.0,-20.64,157.0,-20.64,135543,""" 693""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_6826;Parent=…","""five_prime_utr_6826""","""maker-scaffold-mi8-augustus-ge…","""CV1087_viridis""","""maker-scaffold-mi8-augustus-ge…","""maker-scaffold-mi8-augustus-ge…","""crovir-transcript-16401""","""RBM5""","""RBM5"""
"""Cluster_632""","""scaffold-ma2""",167637184,167637206,"""+""",22,"""scaffold-ma2""",115934531,115936076,1545,145.0,-13.76,145.0,-13.76,295810,""" 287""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_3176;Parent=…","""five_prime_utr_3176""","""maker-scaffold-ma2-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14517""","""RHOT1""","""RHOT1.1"""
"""Cluster_132""","""scaffold-ma1""",103730973,103730995,"""+""",22,"""scaffold-mi1""",4880242,4880556,314,152.0,-13.19,152.0,-13.19,48641,""" 10""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_2217;Parent=…","""five_prime_utr_2217""","""augustus_masked-scaffold-mi1-p…","""CV1087_viridis""","""augustus_masked-scaffold-mi1-p…","""augustus_masked-scaffold-mi1-p…","""crovir-transcript-12712""","""SNX19""","""SNX19"""
"""Cluster_653""","""scaffold-ma2""",183759468,183759490,"""+""",22,"""scaffold-ma2""",127450211,127450455,244,151.0,-17.87,151.0,-17.87,317334,""" 216""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_3262;Parent=…","""five_prime_utr_3262""","""maker-scaffold-ma2-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14696""","""KRT7""","""KRT7"""
"""Cluster_341""","""scaffold-ma1""",287557910,287557932,"""+""",22,"""scaffold-ma3""",16262575,16263264,689,290.0,-17.83,150.0,-9.52,139338,""" 367 132""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_3916;Parent=…","""five_prime_utr_3916""","""maker-scaffold-ma3-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-4921""","""ODF1""","""ODF1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_590""","""scaffold-ma2""",136587840,136587863,"""-""",23,"""scaffold-ma1""",61962960,61964881,1921,146.0,-18.67,146.0,-18.67,364458,""" 1836""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_1434;Parent…","""three_prime_utr_1434""","""augustus_masked-scaffold-ma1-p…","""CV1087_viridis""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-7477""","""LRP4""","""LRP4"""
"""Cluster_690""","""scaffold-ma2""",223741065,223741087,"""-""",22,"""scaffold-Z""",7294389,7296205,1816,150.0,-16.24,150.0,-16.24,484073,""" 404""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_41;Parent=m…","""three_prime_utr_41""","""maker-scaffold-Z-augustus-gene…","""CV1087_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1786""","""LMBR1""","""LMBR1"""
"""Cluster_1326""","""scaffold-Z""",88183891,88183913,"""-""",22,"""scaffold-ma3""",31645844,31648075,2231,147.0,-17.29,147.0,-17.29,842110,""" 1601""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_5625;Parent…","""three_prime_utr_5625""","""maker-scaffold-ma3-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5105""","""MRPL15""","""MRPL15"""
"""Cluster_898""","""scaffold-ma3""",177083078,177083100,"""+""",22,"""scaffold-Z""",92897590,92901270,3680,142.0,-11.01,142.0,-11.01,615761,""" 1716""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_637;Parent=…","""three_prime_utr_637""","""maker-scaffold-Z-augustus-gene…","""CV1087_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2696""","""RRAS2""","""RRAS2"""


### Join miRNA counts and sequence data to the target data

#### CV0857

In [94]:
# Column names that appear in both the miRNA info table and the CV0857
# target table; these shared columns are used as the join keys.
info_columns = set(miRNA_info_df.columns)
target_columns = set(CV0857_df.columns)
intersect = list(info_columns & target_columns)
# Print the keys so they can be checked in the cell output
print(intersect)

# Left join keeps every CV0857 target row and attaches the miRNA info
# columns; unique() then drops fully duplicated rows.
# NOTE(review): unique() is called without maintain_order, so the row
# order of the result is not guaranteed to match the input.
CV0857_df = CV0857_df.join(miRNA_info_df, on=intersect, how='left').unique()
CV0857_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_341""","""scaffold-ma1""",287557910,287557932,"""+""",22,"""scaffold-ma3""",143843640,143844480,840,141.0,-6.1,141.0,-6.1,197494,""" 294""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_6385;Parent…","""three_prime_utr_6385""","""maker-scaffold-ma3-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6271""","""MDM4""","""MDM4""",290,"""Cluster_341""","""miR-2492-5p""","""dme-miR-2492-5p""","""De-Novo""",100.0,4.5,20.3,"""TTTATAAAGATATCTGGAAAAG""","""GCAATCTGGAGAGTGCACAGAATTTATAAA…"
"""Cluster_1772""","""scaffold-mi8""",5782808,5782830,"""-""",22,"""scaffold-ma6""",12935179,12935316,137,142.0,-15.01,142.0,-15.01,17513993,""" 35""","""Liftoff""","""CDS""","""-""","""ID=CDS_122686;Parent=maker-sca…","""CDS_122686""","""maker-scaffold-ma6-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11732""","""PLXNB2""","""PLXNB2""",14566,"""cvi-miR-140-3p""","""miR-140-3p""","""oha-miR-140-3p""","""Probable-Identity""",100.0,3.0800e-7,44.1,"""ACCACAGGGTAGAACCACGGAC""","""ACGTGTCTCTCGGTGGCCCGCCAGTGGTTT…"
"""Cluster_987""","""scaffold-ma4""",81815030,81815053,"""-""",23,"""scaffold-ma1""",301692233,301692615,382,140.0,-19.01,140.0,-19.01,10791928,""" 117""","""Liftoff""","""CDS""","""+""","""ID=CDS_45532;Parent=augustus_m…","""CDS_45532""","""augustus_masked-scaffold-ma1-p…","""CV0857_viridis""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-9947""","""HSPB1""","""HSPB1""",1390,"""cvi-miR-133a-3p""","""miR-133a-3p""","""pbv-miR-133a-3p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TTTGGTCCCCTTCAACCAGCTGT""","""GGCCTAGAATGCTTTGCTAAAGCTGGTAAA…"
"""Cluster_925""","""scaffold-ma4""",17844204,17844227,"""+""",23,"""scaffold-ma7""",14908061,14911063,3002,158.0,-17.97,158.0,-17.97,653175,""" 715""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_9136;Parent…","""three_prime_utr_9136""","""maker-scaffold-ma7-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4398""","""XP_011529817""","""XP_011529817""",1358,"""cvi-miR-20a-5p""","""miR-20a-5p""","""oha-miR-20a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TAAAGTGCTTATAGTGCAGGTAG""","""TCCTAAATGGCTCCTGTAGCACTAAAGTGC…"
"""Cluster_1755""","""scaffold-mi8""",3813984,3814006,"""+""",22,"""scaffold-ma2""",1075938,1076051,113,149.0,-13.51,149.0,-13.51,16847205,""" 28""","""Liftoff""","""CDS""","""+""","""ID=CDS_53207;Parent=maker-scaf…","""CDS_53207""","""maker-scaffold-ma2-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13118""","""SLC44A4""","""SLC44A4""",190,"""cvi-miR-15b-5p""","""miR-15b-5p""","""oha-miR-15b-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAGCAGCACGACATGGTTTGTA""","""GATGGCCAGCCTTGAGGTGGTGTAGCAGCA…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_573""","""scaffold-ma2""",132417730,132417752,"""+""",22,"""scaffold-Z""",59092312,59092926,614,154.0,-14.53,154.0,-14.53,333116,""" 217""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_344;Parent=…","""three_prime_utr_344""","""maker-scaffold-Z-augustus-gene…","""CV0857_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2271""","""FIGNL1""","""FIGNL1""",190854,"""cvi-let-7f-5p""","""let-7f-5p""","""oha-let-7f-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGAGGTAGTAGATTGTATAGTT""","""GCTGTTCCAGGGCCCCCTGGGATGAGGTAG…"
"""Cluster_557""","""scaffold-ma2""",124502255,124502279,"""-""",24,"""scaffold-ma2""",140525542,140526543,1001,145.0,-14.59,145.0,-14.59,316927,""" 819""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_4810;Parent…","""three_prime_utr_4810""","""maker-scaffold-ma2-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-15017""","""CDKN2D""","""CDKN2D""",153299,"""cvi-miR-148b-5p""","""miR-148b-5p""","""pbv-miR-148b-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""GAAGTTCTGTTATACACTTTGACT""","""ACTGTTACAGCGCTTGAGGTGAAGTTCTGT…"
"""Cluster_650""","""scaffold-ma2""",182258369,182258393,"""-""",24,"""scaffold-ma5""",67967718,67967787,69,140.0,-23.29,140.0,-23.29,6613736,""" 20""","""Liftoff""","""CDS""","""+""","""ID=CDS_120204;Parent=maker-sca…","""CDS_120204""","""maker-scaffold-ma5-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10826""","""MED12L""","""MED12L.1""",8049,"""cvi-miR-7-5p""","""miR-7-5p""","""oha-miR-7-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""TGGAAGACTAGTGATTTTGTTGTT""","""GACGTTGGTCTAGTTCTGTGTGGAAGACTA…"
"""Cluster_965""","""scaffold-ma4""",57567321,57567345,"""-""",24,"""scaffold-ma3""",138697442,138697619,177,141.0,-14.46,141.0,-14.46,10226651,""" 18""","""Liftoff""","""CDS""","""+""","""ID=CDS_92428;Parent=maker-scaf…","""CDS_92428""","""maker-scaffold-ma3-augustus-ge…","""CV0857_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6191""","""MAPK13""","""MAPK13""",649,"""cvi-miR-222a-3p""","""miR-222a-3p""","""oha-miR-222a-3p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""AGCTACATCTGGCTACTGGGTCTC""","""AACCTCAGTTGCTCATCAGTCGCTCAGTAG…"


#### CV0985

In [95]:
# Column names that appear in both the miRNA info table and the CV0985
# target table; these shared columns are used as the join keys.
info_columns = set(miRNA_info_df.columns)
target_columns = set(CV0985_df.columns)
intersect = list(info_columns & target_columns)
# Print the keys so they can be checked in the cell output
print(intersect)

# Left join keeps every CV0985 target row and attaches the miRNA info
# columns; unique() then drops fully duplicated rows.
# NOTE(review): unique() is called without maintain_order, so the row
# order of the result is not guaranteed to match the input.
CV0985_df = CV0985_df.join(miRNA_info_df, on=intersect, how='left').unique()
CV0985_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_833""","""scaffold-ma3""",122669751,122669773,"""+""",22,"""scaffold-ma1""",23403586,23403836,250,149.0,-9.78,149.0,-9.78,8188184,""" 173""","""Liftoff""","""CDS""","""+""","""ID=CDS_15342;Parent=maker-scaf…","""CDS_15342""","""maker-scaffold-ma1-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-7050""","""XRCC3""","""XRCC3""",12214,"""cvi-miR-101a-3p""","""miR-101a-3p""","""oha-miR-101a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""GTACAGTACTGTGATAACTGAA""","""ATTGACAGGCTGCCCTGGCTCAGTTATCAC…"
"""Cluster_1737""","""scaffold-mi7""",12015387,12015410,"""+""",23,"""scaffold-ma3""",104507152,104507544,392,145.0,-21.07,145.0,-21.07,1104704,""" 328""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_6035;Parent…","""three_prime_utr_6035""","""maker-scaffold-ma3-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5762""","""SLC35A3""","""SLC35A3""",11,"""cvi-miR-365a-1-5p""","""miR-365a-1-5p""","""oha-miR-365a-1-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""AGGGACTTTTGGGGGCAGCTGTG""","""TTTCGCCAGTGCAGGGAAAATGAGGGACTT…"
"""Cluster_926""","""scaffold-ma4""",17844344,17844367,"""+""",23,"""scaffold-ma4""",31942612,31942821,209,144.0,-18.24,144.0,-18.24,9929528,""" 122""","""Liftoff""","""CDS""","""+""","""ID=CDS_102193;Parent=augustus_…","""CDS_102193""","""augustus_masked-scaffold-ma4-p…","""CV0985_concolor""","""augustus_masked-scaffold-ma4-p…","""augustus_masked-scaffold-ma4-p…","""crovir-transcript-3400""","""ASMT""","""ASMT.1""",947,"""cvi-miR-19b-5p""","""miR-19b-5p""","""oha-miR-19b-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""AGTTTTGCAGGTTTGCATCCAGC""","""TGTTAAACACTGTTCTCTGGTTAGTTTTGC…"
"""Cluster_557""","""scaffold-ma2""",124502255,124502279,"""-""",24,"""scaffold-ma6""",47556721,47556839,118,145.0,-16.51,145.0,-16.51,4802426,""" 5""","""Liftoff""","""CDS""","""+""","""ID=CDS_125475;Parent=maker-sca…","""CDS_125475""","""maker-scaffold-ma6-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-12059""","""PMM2""","""PMM2.1""",91499,"""cvi-miR-148b-5p""","""miR-148b-5p""","""pbv-miR-148b-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""GAAGTTCTGTTATACACTTTGACT""","""ACTGTTACAGCGCTTGAGGTGAAGTTCTGT…"
"""Cluster_1382""","""scaffold-Z""",113418190,113418213,"""+""",23,"""scaffold-ma2""",23589207,23589282,75,141.0,-8.94,141.0,-8.94,13215616,""" 46""","""Liftoff""","""CDS""","""-""","""ID=CDS_54998;Parent=maker-scaf…","""CDS_54998""","""maker-scaffold-ma2-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13395""","""SUMF1""","""SUMF1""",1,"""cvi-miR-9-3p-1""","""miR-9-3p""","""oha-miR-9-3p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TCTTTGGTTATCTAGCTGTATGA""","""TGCCTGCAGGGGTTGGTTGTTATCTTTGGT…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_235""","""scaffold-ma1""",204856224,204856247,"""-""",23,"""scaffold-ma4""",32154620,32155540,920,140.0,-7.9,140.0,-7.9,157537,""" 78""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_6980;Parent…","""three_prime_utr_6980""","""maker-scaffold-ma4-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3408""","""RNF149""","""RNF149""",36672,"""cvi-miR-30c-5p""","""miR-30c-5p""","""oha-miR-30c-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TGTAAACATCCTACACTCTCAGC""","""TAAATCCAAGTGGTAGAGAGTGTAAACATC…"
"""Cluster_590""","""scaffold-ma2""",136587840,136587863,"""-""",23,"""scaffold-ma2""",46469723,46469898,175,149.0,-16.11,149.0,-16.11,5499217,""" 93""","""Liftoff""","""CDS""","""+""","""ID=CDS_57398;Parent=augustus_m…","""CDS_57398""","""augustus_masked-scaffold-ma2-p…","""CV0985_concolor""","""augustus_masked-scaffold-ma2-p…","""augustus_masked-scaffold-ma2-p…","""crovir-transcript-13645""","""DNAH1""","""DNAH1""",8318,"""cvi-miR-27a-5p""","""miR-27a-5p""","""pbv-miR-27a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""AGGGCTTAGCTCACTTGTGAACA""","""TTCCTCAGACTGCATAGGGTAGGGCTTAGC…"
"""Cluster_406""","""scaffold-ma2""",7365313,7365336,"""+""",23,"""scaffold-mi9""",1955322,1955581,259,140.0,-17.42,140.0,-17.42,211483,""" 140""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_9680;Parent…","""three_prime_utr_9680""","""maker-scaffold-mi9-augustus-ge…","""CV0985_concolor""","""maker-scaffold-mi9-augustus-ge…","""maker-scaffold-mi9-augustus-ge…","""crovir-transcript-4129""","""XP_011536258""","""XP_011536258""",17778,"""cvi-miR-191-5p""","""miR-191-5p""","""oha-miR-191-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""CAACGGAATCCCAAAAGCAGCTG""","""TCTGCAAAGGCTTAAGAATGGGCAACGGAA…"
"""Cluster_1385""","""scaffold-mi1""",1787355,1787377,"""-""",22,"""scaffold-ma2""",51739464,51739904,440,140.0,-16.94,140.0,-16.94,631295,""" 191""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_2799;Parent=…","""five_prime_utr_2799""","""maker-scaffold-ma2-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13710""","""GFRA2""","""GFRA2.1""",2532,"""cvi-miR-125b-5p-1""","""miR-125b-5p""","""oha-miR-125b-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TCCCTGAGACCCTAACTTGTGA""","""ATGGACTGCGCCCCTCTCTGTCCCTGAGAC…"


#### CV0987

In [96]:
# Join keys: every column name that miRNA_info_df and CV0987_df have in common
intersect = list(set(miRNA_info_df.columns).intersection(set(CV0987_df.columns)))
print(intersect)

# Left join keeps every CV0987 target row and attaches the matching miRNA info;
# unique() then removes rows that are duplicated across all columns
CV0987_df = CV0987_df.join(miRNA_info_df, how='left', on=intersect).unique()

# Display the joined table as the cell output
CV0987_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_659""","""scaffold-ma2""",190152946,190152968,"""+""",22,"""scaffold-mi7""",9291073,9291176,103,156.0,-12.94,156.0,-12.94,7101435,""" 78""","""Liftoff""","""CDS""","""+""","""ID=CDS_133189;Parent=augustus_…","""CDS_133189""","""augustus_masked-scaffold-mi7-p…","""CV0987_lutosus""","""augustus_masked-scaffold-mi7-p…","""augustus_masked-scaffold-mi7-p…","""crovir-transcript-1518""","""ADGRD2""","""ADGRD2""",420,"""Cluster_659""","""miR-99a-5p""","""oha-miR-99a-5p""","""De-Novo""",100.0,4.5,20.3,"""CGATCTTGTGTTACTTTAGGCC""","""TTTGTTACAGAGAACAGAGGCCTAAAGTAG…"
"""Cluster_794""","""scaffold-ma3""",83692475,83692497,"""+""",22,"""scaffold-ma1""",111380913,111381167,254,166.0,-29.24,166.0,-29.24,365656,""" 195""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_1261;Parent=…","""five_prime_utr_1261""","""maker-scaffold-ma1-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8100""","""EAF1""","""EAF1""",1147,"""cvi-miR-214-5p""","""miR-214-5p""","""pbv-miR-214-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGCCTGTCTACACTTGCTGTGC""","""CTGGATAGAGTTGTCACGTGTCTGCCTGTC…"
"""Cluster_1756""","""scaffold-mi8""",3814794,3814816,"""+""",22,"""scaffold-mi10""",4085466,4086456,990,142.0,-17.64,142.0,-17.64,17073912,""" 158""","""Liftoff""","""CDS""","""-""","""ID=CDS_146698;Parent=maker-sca…","""CDS_146698""","""maker-scaffold-mi10-augustus-g…","""CV0987_lutosus""","""maker-scaffold-mi10-augustus-g…","""maker-scaffold-mi10-augustus-g…","""crovir-transcript-12547""","""CHRNB2""","""CHRNB2""",18675,"""cvi-miR-16c-5p""","""miR-16c-5p""","""pbv-miR-16c-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAGCAGCACGTAAATACTGGAG""","""AGGTCTGCTGTCTGCTGTGCTTTAGCAGCA…"
"""Cluster_1731""","""scaffold-mi7""",9664187,9664210,"""+""",23,"""scaffold-ma3""",137786173,137789657,3484,319.0,-39.49,164.0,-23.39,16279642,""" 1161 1410""","""Liftoff""","""CDS""","""-""","""ID=CDS_92173;Parent=augustus_m…","""CDS_92173""","""augustus_masked-scaffold-ma3-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma3-p…","""augustus_masked-scaffold-ma3-p…","""crovir-transcript-6172""","""CELSR2""","""CELSR2""",54386,"""cvi-miR-199a-5p""","""miR-199a-5p""","""oha-miR-199a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""CCCAGTGTTCGGACTACCTGTTC""","""CTGAGAAGATCCACTCCGTCTACCCAGTGT…"
"""Cluster_1756""","""scaffold-mi8""",3814794,3814816,"""+""",22,"""scaffold-ma2""",219673579,219673686,107,140.0,-17.43,140.0,-17.43,17022236,""" 47""","""Liftoff""","""CDS""","""+""","""ID=CDS_78015;Parent=maker-scaf…","""CDS_78015""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-15819""","""HMGCS1""","""HMGCS1""",18675,"""cvi-miR-16c-5p""","""miR-16c-5p""","""pbv-miR-16c-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAGCAGCACGTAAATACTGGAG""","""AGGTCTGCTGTCTGCTGTGCTTTAGCAGCA…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_122""","""scaffold-ma1""",100437388,100437410,"""-""",22,"""scaffold-ma4""",63714330,63717060,2730,142.0,-15.47,142.0,-15.47,46812,""" 844""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_7179;Parent…","""three_prime_utr_7179""","""maker-scaffold-ma4-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3720""","""BACH1""","""BACH1""",402,"""cvi-miR-130c-3p""","""miR-130c-3p""","""oha-miR-130c-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""CAGTGCAATGTAAAAAGGGCAT""","""GGGCACGGCCCCTGCCCGAGGCTCTTTTCA…"
"""Cluster_925""","""scaffold-ma4""",17844204,17844227,"""+""",23,"""scaffold-ma1""",114453173,114455048,1875,280.0,-43.26,140.0,-21.63,9712358,""" 199 535""","""Liftoff""","""CDS""","""-""","""ID=CDS_26093;Parent=maker-scaf…","""CDS_26093""","""maker-scaffold-ma1-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8173""","""DTX3L""","""DTX3L""",5539,"""cvi-miR-20a-5p""","""miR-20a-5p""","""oha-miR-20a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TAAAGTGCTTATAGTGCAGGTAG""","""TCCTAAATGGCTCCTGTAGCACTAAAGTGC…"
"""Cluster_865""","""scaffold-ma3""",144999268,144999291,"""-""",23,"""scaffold-ma2""",60416941,60417060,119,146.0,-16.93,146.0,-16.93,8830818,""" 6""","""Liftoff""","""CDS""","""+""","""ID=CDS_59380;Parent=maker-scaf…","""CDS_59380""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13836""","""TBC1D9B""","""TBC1D9B""",23,"""cvi-miR-135-5p""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TATGGCTTTTTATTCCTATGTGA""","""TTAAGTCCTCTGCTGTGGTCTATGGCTTTT…"
"""Cluster_1640""","""scaffold-mi5""",454312,454333,"""-""",21,"""scaffold-ma3""",154727435,154730094,2659,143.0,-15.4,143.0,-15.4,15374040,""" 1719""","""Liftoff""","""CDS""","""-""","""ID=CDS_94659;Parent=augustus_m…","""CDS_94659""","""augustus_masked-scaffold-ma3-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma3-p…","""augustus_masked-scaffold-ma3-p…","""crovir-transcript-6432""","""RBM12""","""RBM12""",2240,"""Cluster_1640""","""miR-4949-3p""","""dme-miR-4949-3p""","""De-Novo""",100.0,0.26,24.3,"""TATGTATGTGCGTGTGTGTGT""","""CCCGTATGTGTGTATGTAGATATGTATGTG…"


#### CV1081

In [97]:
# Join keys: every column name that miRNA_info_df and CV1081_df have in common
intersect = list(set(miRNA_info_df.columns).intersection(set(CV1081_df.columns)))
print(intersect)

# Left join keeps every CV1081 target row and attaches the matching miRNA info;
# unique() then removes rows that are duplicated across all columns
CV1081_df = CV1081_df.join(miRNA_info_df, how='left', on=intersect).unique()

# Display the joined table as the cell output
CV1081_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_650""","""scaffold-ma2""",182258369,182258393,"""-""",24,"""scaffold-ma6""",34609608,34609707,99,146.0,-17.28,146.0,-17.28,6617077,""" 18""","""Liftoff""","""CDS""","""-""","""ID=CDS_123935;Parent=maker-sca…","""CDS_123935""","""maker-scaffold-ma6-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11901""","""OSBPL8""","""OSBPL8""",6259,"""cvi-miR-7-5p""","""miR-7-5p""","""oha-miR-7-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""TGGAAGACTAGTGATTTTGTTGTT""","""GACGTTGGTCTAGTTCTGTGTGGAAGACTA…"
"""Cluster_793""","""scaffold-ma3""",83687551,83687574,"""+""",23,"""scaffold-ma6""",51447038,51447782,744,144.0,-12.02,144.0,-12.02,7678750,""" 533""","""Liftoff""","""CDS""","""+""","""ID=CDS_126165;Parent=maker-sca…","""CDS_126165""","""maker-scaffold-ma6-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-12134""","""IL2RB""","""IL2RB""",6285,"""cvi-miR-199c-5p""","""miR-199c-5p""","""oha-miR-199c-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""CCCAGTGTTCAGACTACCTGTTC""","""TAACCCCAACCTGCTCCGTCGCCCCAGTGT…"
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-ma2""",37816209,37818042,1833,140.0,-11.71,140.0,-11.71,255442,""" 892""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_3829;Parent…","""three_prime_utr_3829""","""maker-scaffold-ma2-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13530""","""ASB14""","""ASB14""",308,"""cvi-miR-135-5p-1""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TATGGCTTTTTATTCCTATGTG""","""ACCATTATCCCATTGTCTTCTATGGCTTTT…"
"""Cluster_135""","""scaffold-ma1""",104465812,104465834,"""-""",22,"""scaffold-ma4""",75895410,75897713,2303,151.0,-16.22,151.0,-16.22,87200,""" 2020""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_7231;Parent…","""three_prime_utr_7231""","""maker-scaffold-ma4-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3799""","""ZNF654""","""ZNF654""",50,"""cvi-miR-153-3p""","""miR-153-3p""","""oha-miR-153-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TTGCATAGTCACAAAAGTGATC""","""ATTCTTCACAGCTGCCAGTGTCATTTTTGT…"
"""Cluster_573""","""scaffold-ma2""",132417730,132417752,"""+""",22,"""scaffold-ma6""",56410293,56411844,1551,161.0,-20.25,161.0,-20.25,340399,""" 11""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_8572;Parent…","""three_prime_utr_8572""","""maker-scaffold-ma6-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-12203""","""GNPTAB""","""GNPTAB""",71631,"""cvi-let-7f-5p""","""let-7f-5p""","""oha-let-7f-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGAGGTAGTAGATTGTATAGTT""","""GCTGTTCCAGGGCCCCCTGGGATGAGGTAG…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_866""","""scaffold-ma3""",146885107,146885130,"""-""",23,"""scaffold-Z""",22294643,22295973,1330,142.0,-17.64,142.0,-17.64,595081,""" 263""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_125;Parent=…","""three_prime_utr_125""","""maker-scaffold-Z-augustus-gene…","""CV1081_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1912""","""CACNB2""","""CACNB2""",93,"""cvi-miR-29c-3p""","""miR-29c-3p""","""xla-miR-29c-3p""","""Potential-Identity""",100.0,3.3600e-7,44.1,"""TAGCACCATTTGAAATCGGTTAT""","""AAACATCTCTTACACAGGCTGACCGATTTC…"
"""Cluster_1503""","""scaffold-mi2""",3110646,3110670,"""-""",24,"""scaffold-ma1""",174437680,174439820,2140,145.0,-9.93,145.0,-9.93,960182,""" 1205""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_2202;Parent…","""three_prime_utr_2202""","""maker-scaffold-ma1-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8689""","""TASP1""","""TASP1""",15508,"""cvi-miR-30e-5p""","""miR-30e-5p""","""oha-miR-30e-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""TGTAAACATCCTTGACTGGAAGCT""","""ATCTGGGCAGTTGTTGCCCCTGTAAACATC…"
"""Cluster_1864""","""scaffold-un31""",32389,32412,"""-""",23,"""scaffold-ma6""",18178138,18179175,1037,140.0,-12.61,140.0,-12.61,1217406,""" 135""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_8271;Parent…","""three_prime_utr_8271""","""maker-scaffold-ma6-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma6-augustus-ge…","""maker-scaffold-ma6-augustus-ge…","""crovir-transcript-11762""","""ASZ1""","""ASZ1.1""",387,"""cvi-miR-29b-3p""","""miR-29b-3p""","""oha-miR-29b-3p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TAGCACCATTTGAAATCAGTGTT""","""GTCTGGAGCTTCTTCAGGAATCTGGTTTCA…"
"""Cluster_925""","""scaffold-ma4""",17844204,17844227,"""+""",23,"""scaffold-ma7""",41108847,41108994,147,140.0,-13.72,140.0,-13.72,9803193,""" 46""","""Liftoff""","""CDS""","""+""","""ID=CDS_137934;Parent=maker-sca…","""CDS_137934""","""maker-scaffold-ma7-augustus-ge…","""CV1081_viridis""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4617""","""GLRA3""","""GLRA3.1""",1221,"""cvi-miR-20a-5p""","""miR-20a-5p""","""oha-miR-20a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TAAAGTGCTTATAGTGCAGGTAG""","""TCCTAAATGGCTCCTGTAGCACTAAAGTGC…"


#### CV1082

In [98]:
# Join keys: every column name that miRNA_info_df and CV1082_df have in common
intersect = list(set(miRNA_info_df.columns).intersection(set(CV1082_df.columns)))
print(intersect)

# Left join keeps every CV1082 target row and attaches the matching miRNA info;
# unique() then removes rows that are duplicated across all columns
CV1082_df = CV1082_df.join(miRNA_info_df, how='left', on=intersect).unique()

# Display the joined table as the cell output
CV1082_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-Z""",92915816,92919307,3491,143.0,-20.3,143.0,-20.3,3792728,""" 3292""","""Liftoff""","""CDS""","""-""","""ID=CDS_8853;Parent=augustus_ma…","""CDS_8853""","""augustus_masked-scaffold-Z-pro…","""CV1082_viridis""","""augustus_masked-scaffold-Z-pro…","""augustus_masked-scaffold-Z-pro…","""crovir-transcript-2703""","""PRR12""","""PRR12""",100,"""cvi-miR-135-5p-1""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TATGGCTTTTTATTCCTATGTG""","""ACCATTATCCCATTGTCTTCTATGGCTTTT…"
"""Cluster_794""","""scaffold-ma3""",83692475,83692497,"""+""",22,"""scaffold-ma2""",19059658,19060606,948,150.0,-22.02,150.0,-22.02,7767184,""" 586""","""Liftoff""","""CDS""","""+""","""ID=CDS_54822;Parent=augustus_m…","""CDS_54822""","""augustus_masked-scaffold-ma2-p…","""CV1082_viridis""","""augustus_masked-scaffold-ma2-p…","""augustus_masked-scaffold-ma2-p…","""crovir-transcript-13375""","""THSD1""","""THSD1""",210,"""cvi-miR-214-5p""","""miR-214-5p""","""pbv-miR-214-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGCCTGTCTACACTTGCTGTGC""","""CTGGATAGAGTTGTCACGTGTCTGCCTGTC…"
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-ma3""",137947597,137947715,118,140.0,-8.09,140.0,-8.09,3869334,""" 21""","""Liftoff""","""CDS""","""-""","""ID=CDS_92223;Parent=maker-scaf…","""CDS_92223""","""maker-scaffold-ma3-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-6180""","""KIF21B""","""KIF21B""",100,"""cvi-miR-135-5p-1""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TATGGCTTTTTATTCCTATGTG""","""ACCATTATCCCATTGTCTTCTATGGCTTTT…"
"""Cluster_1858""","""scaffold-un23""",21014,21036,"""-""",22,"""scaffold-ma2""",161705827,161706003,176,154.0,-14.67,154.0,-14.67,17926105,""" 124""","""Liftoff""","""CDS""","""+""","""ID=CDS_72963;Parent=maker-scaf…","""CDS_72963""","""maker-scaffold-ma2-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-15313""","""MUSK""","""MUSK""",203,"""Cluster_1858""","""miR-4773""","""hsa-miR-4773""","""De-Novo""",100.0,1.1,22.3,"""CGATCTTGTGCTCCTGTTCATC""","""CGGGAAGCAAGCAAGCGAGATGGGCAGGGG…"
"""Cluster_590""","""scaffold-ma2""",136587840,136587863,"""-""",23,"""scaffold-ma1""",29925288,29925734,446,146.0,-25.09,146.0,-25.09,258065,""" 180""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_809;Parent=m…","""five_prime_utr_809""","""maker-scaffold-ma1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-7123""","""PPP4R4""","""PPP4R4""",3757,"""cvi-miR-27a-5p""","""miR-27a-5p""","""pbv-miR-27a-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""AGGGCTTAGCTCACTTGTGAACA""","""TTCCTCAGACTGCATAGGGTAGGGCTTAGC…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_186""","""scaffold-ma1""",162091457,162091479,"""+""",22,"""scaffold-un42""",92945,94037,1092,152.0,-13.61,152.0,-13.61,1815283,""" 352""","""Liftoff""","""CDS""","""+""","""ID=CDS_149315;Parent=augustus_…","""CDS_149315""","""augustus_masked-scaffold-un42-…","""CV1082_viridis""","""augustus_masked-scaffold-un42-…","""augustus_masked-scaffold-un42-…","""NA""","""augustus_masked-scaffold-un42-…","""augustus_masked-scaffold-un42-…",6,"""Cluster_186""","""miR-5007-5p""","""hsa-miR-5007-5p""","""De-Novo""",100.0,4.5,20.3,"""TGAACATGGACTATCAGCCAGC""","""CCACTGACACAACTGCAGCCAGTGAACATG…"
"""Cluster_866""","""scaffold-ma3""",146885107,146885130,"""-""",23,"""scaffold-mi1""",18102718,18102868,150,141.0,-14.28,141.0,-14.28,9050411,""" 98""","""Liftoff""","""CDS""","""-""","""ID=CDS_48892;Parent=maker-scaf…","""CDS_48892""","""maker-scaffold-mi1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-mi1-augustus-ge…","""maker-scaffold-mi1-augustus-ge…","""crovir-transcript-12955""","""TMED4""","""TMED4""",82,"""cvi-miR-29c-3p""","""miR-29c-3p""","""xla-miR-29c-3p""","""Potential-Identity""",100.0,3.3600e-7,44.1,"""TAGCACCATTTGAAATCGGTTAT""","""AAACATCTCTTACACAGGCTGACCGATTTC…"
"""Cluster_4""","""scaffold-ma1""",3055559,3055581,"""-""",22,"""scaffold-ma1""",275970768,275971515,747,148.0,-15.5,148.0,-15.5,42101,""" 542""","""Liftoff""","""CDS""","""+""","""ID=CDS_42120;Parent=maker-scaf…","""CDS_42120""","""maker-scaffold-ma1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9626""","""PKDCC""","""PKDCC.1""",43,"""cvi-miR-9-5p""","""miR-9-5p""","""pbv-miR-9-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TCTTTGGTTATCTAGCTGTATG""","""ATCCACAGGGCCGGTTTTTCTCTTTGGTTA…"
"""Cluster_574""","""scaffold-ma2""",132418184,132418206,"""+""",22,"""scaffold-ma1""",247772916,247773963,1047,146.0,-16.24,146.0,-16.24,5184577,""" 833""","""Liftoff""","""CDS""","""+""","""ID=CDS_38526;Parent=maker-scaf…","""CDS_38526""","""maker-scaffold-ma1-augustus-ge…","""CV1082_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9342""","""SLC16A10""","""SLC16A10.1""",1175,"""cvi-miR-98-5p""","""miR-98-5p""","""oha-miR-98-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGAGGTAGTAAGTTGTATTGTT""","""GTGCTTCTCGCTCCTACCAGGGTGAGGTAG…"


#### CV1086

In [99]:
# Join keys: every column name that miRNA_info_df and CV1086_df have in common
intersect = list(set(miRNA_info_df.columns).intersection(set(CV1086_df.columns)))
print(intersect)

# Left join keeps every CV1086 target row and attaches the matching miRNA info;
# unique() then removes rows that are duplicated across all columns
CV1086_df = CV1086_df.join(miRNA_info_df, how='left', on=intersect).unique()

# Display the joined table as the cell output
CV1086_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_1693""","""scaffold-mi7""",559484,559506,"""+""",22,"""scaffold-mi7""",951195,951422,227,148.0,-20.42,148.0,-20.42,16028104,""" 84""","""Liftoff""","""CDS""","""-""","""ID=CDS_130113;Parent=maker-sca…","""CDS_130113""","""maker-scaffold-mi7-augustus-ge…","""CV1086_viridis""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1067""","""PUSL1""","""PUSL1""",84292,"""cvi-miR-200a""","""miR-200a""","""oha-miR-200a""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAACACTGTCTGGTAACGATGT""","""GATGATGGTCCTCTGTGGGCATCTTACTAG…"
"""Cluster_924""","""scaffold-ma4""",17843958,17843980,"""+""",22,"""scaffold-un4261""",523,886,363,140.0,-15.34,140.0,-15.34,9685439,""" 226""","""Liftoff""","""CDS""","""-""","""ID=CDS_151406;Parent=augustus_…","""CDS_151406""","""augustus_masked-scaffold-un426…","""CV1086_viridis""","""augustus_masked-scaffold-un426…","""augustus_masked-scaffold-un426…","""NA""","""augustus_masked-scaffold-un426…","""augustus_masked-scaffold-un426…",40,"""cvi-miR-18a-3p""","""miR-18a-3p""","""oha-miR-18a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""ACTGCCCTAAGTGCTCCTTCTG""","""ACAATAACTGCTTTTTGTTCTAAGGTGCAT…"
"""Cluster_1756""","""scaffold-mi8""",3814794,3814816,"""+""",22,"""scaffold-mi1""",21926212,21927600,1388,144.0,-9.49,144.0,-9.49,806165,""" 716""","""Liftoff""","""five_prime_utr""","""-""","""ID=five_prime_utr_2378;Parent=…","""five_prime_utr_2378""","""augustus_masked-scaffold-mi1-p…","""CV1086_viridis""","""augustus_masked-scaffold-mi1-p…","""augustus_masked-scaffold-mi1-p…","""crovir-transcript-13088""","""BUD13""","""BUD13""",7881,"""cvi-miR-16c-5p""","""miR-16c-5p""","""pbv-miR-16c-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAGCAGCACGTAAATACTGGAG""","""AGGTCTGCTGTCTGCTGTGCTTTAGCAGCA…"
"""Cluster_1326""","""scaffold-Z""",88183891,88183913,"""-""",22,"""scaffold-ma1""",36908445,36911419,2974,140.0,-13.03,140.0,-13.03,838184,""" 2525""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_1212;Parent…","""three_prime_utr_1212""","""augustus_masked-scaffold-ma1-p…","""CV1086_viridis""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-7181""","""STON2""","""STON2.1""",261,"""Cluster_1326""","""miR-6419""","""mmu-miR-6419""","""De-Novo""",100.0,4.5,20.3,"""TGAGTGTCAGATTATGGAAGAT""","""TGTCATTCCAACATGTCAGATGAGTGTCAG…"
"""Cluster_574""","""scaffold-ma2""",132418184,132418206,"""+""",22,"""scaffold-ma3""",88545533,88547082,1549,155.0,-17.02,155.0,-17.02,348321,""" 1047""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_5952;Parent…","""three_prime_utr_5952""","""maker-scaffold-ma3-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma3-augustus-ge…","""maker-scaffold-ma3-augustus-ge…","""crovir-transcript-5618""","""PHPT1""","""PHPT1.1""",4806,"""cvi-miR-98-5p""","""miR-98-5p""","""oha-miR-98-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGAGGTAGTAAGTTGTATTGTT""","""GTGCTTCTCGCTCCTACCAGGGTGAGGTAG…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_654""","""scaffold-ma2""",183760035,183760057,"""+""",22,"""scaffold-mi8""",4690152,4690344,192,140.0,-17.36,140.0,-17.36,328342,""" 33""","""Liftoff""","""five_prime_utr""","""+""","""ID=five_prime_utr_6729;Parent=…","""five_prime_utr_6729""","""maker-scaffold-mi8-augustus-ge…","""CV1086_viridis""","""maker-scaffold-mi8-augustus-ge…","""maker-scaffold-mi8-augustus-ge…","""crovir-transcript-16130""","""NLGN3""","""NLGN3""",1206,"""cvi-miR-24-3p""","""miR-24-3p""","""oha-miR-24-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGGCTCAGTTCAGCAGGAACAG""","""GTTGATGGACCCGTCCTCCGGTGCCTACTG…"
"""Cluster_1858""","""scaffold-un23""",21014,21036,"""-""",22,"""scaffold-ma1""",256507682,256510518,2836,140.0,-18.93,140.0,-18.93,1192614,""" 1323""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_2702;Parent…","""three_prime_utr_2702""","""augustus_masked-scaffold-ma1-p…","""CV1086_viridis""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-9444""","""SLC30A6""","""SLC30A6""",192,"""Cluster_1858""","""miR-4773""","""hsa-miR-4773""","""De-Novo""",100.0,1.1,22.3,"""CGATCTTGTGCTCCTGTTCATC""","""CGGGAAGCAAGCAAGCGAGATGGGCAGGGG…"
"""Cluster_1037""","""scaffold-ma5""",21172344,21172366,"""-""",22,"""scaffold-ma2""",22796933,22797060,127,148.0,-5.69,148.0,-5.69,11096759,""" 66""","""Liftoff""","""CDS""","""+""","""ID=CDS_54956;Parent=maker-scaf…","""CDS_54956""","""maker-scaffold-ma2-augustus-ge…","""CV1086_viridis""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13388""","""CNTN4""","""CNTN4""",362,"""Cluster_1037""","""miR-92a-5p""","""ami-miR-92a-5p""","""De-Novo""",100.0,4.5,20.3,"""ATTTAACTCTAGTTGCAATGAT""","""GAGATGCCTGAAGGTTGAATCGTTGCAGCT…"
"""Cluster_83""","""scaffold-ma1""",65768523,65768545,"""-""",22,"""scaffold-mi10""",2971483,2972214,731,144.0,-16.34,144.0,-16.34,28513,""" 85""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_9811;Parent…","""three_prime_utr_9811""","""maker-scaffold-mi10-augustus-g…","""CV1086_viridis""","""maker-scaffold-mi10-augustus-g…","""maker-scaffold-mi10-augustus-g…","""crovir-transcript-12463""","""DACT2""","""DACT2.1""",110,"""cvi-miR-129a-5p""","""miR-129a-5p""","""oha-miR-129a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""CTTTTTGCGGTCTGGGCTTGCT""","""AGACGTTGTCCTTCGCGAATCTTTTTGCGG…"


#### CV1087

In [100]:
# Join keys: every column name that miRNA_info_df and CV1087_df have in common
intersect = list(set(miRNA_info_df.columns).intersection(set(CV1087_df.columns)))
print(intersect)

# Left join keeps every CV1087 target row and attaches the matching miRNA info;
# unique() then removes rows that are duplicated across all columns
CV1087_df = CV1087_df.join(miRNA_info_df, how='left', on=intersect).unique()

# Display the joined table as the cell output
CV1087_df

['miRNA_length', 'sample_id', 'miRNA_cluster_original']


miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_822""","""scaffold-ma3""",107895321,107895343,"""-""",22,"""scaffold-ma1""",299075422,299077401,1979,153.0,-9.64,153.0,-9.64,537510,""" 1808""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_3058;Parent…","""three_prime_utr_3058""","""maker-scaffold-ma1-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9966""","""HSF5""","""HSF5""",5,"""cvi-miR-137a-3p""","""miR-137a-3p""","""pbv-miR-137a-3p""","""Probable-Identity""",100.0,3.0800e-7,44.1,"""TATTGCTTAAGAATACGCGTAG""","""CTTTCTGACTCTCTTCGGTGACGGGTATTC…"
"""Cluster_1428""","""scaffold-mi1""",14178023,14178045,"""-""",22,"""scaffold-ma7""",7263579,7263767,188,142.0,-14.81,142.0,-14.81,14038796,""" 58""","""Liftoff""","""CDS""","""-""","""ID=CDS_135214;Parent=maker-sca…","""CDS_135214""","""maker-scaffold-ma7-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4338""","""ABCG2""","""ABCG2.1""",101,"""Cluster_1428""","""miR-4796-5p""","""hsa-miR-4796-5p""","""De-Novo""",100.0,0.29,24.3,"""TCTGCTACTGTCACTTTACAAT""","""TCCCACCCATTCTCTTGGTCCGTAAAGATG…"
"""Cluster_591""","""scaffold-ma2""",136588087,136588109,"""-""",22,"""scaffold-mi8""",5928664,5929588,924,146.0,-30.02,146.0,-30.02,5743172,""" 790""","""Liftoff""","""CDS""","""-""","""ID=CDS_141333;Parent=maker-sca…","""CDS_141333""","""maker-scaffold-mi8-augustus-ge…","""CV1087_viridis""","""maker-scaffold-mi8-augustus-ge…","""maker-scaffold-mi8-augustus-ge…","""crovir-transcript-16199""","""HAS3""","""HAS3""",1659,"""cvi-miR-23a-5p""","""miR-23a-5p""","""oha-miR-23a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""GGGGTTCCTGGTGATGTGATTT""","""TCCTCCTGCTGCTGCCTGCTGGGGTTCCTG…"
"""Cluster_593""","""scaffold-ma2""",136791118,136791141,"""+""",23,"""scaffold-mi5""",3687336,3687428,92,145.0,-21.74,145.0,-21.74,6035970,""" 52""","""Liftoff""","""CDS""","""+""","""ID=CDS_111780;Parent=maker-sca…","""CDS_111780""","""maker-scaffold-mi5-augustus-ge…","""CV1087_viridis""","""maker-scaffold-mi5-augustus-ge…","""maker-scaffold-mi5-augustus-ge…","""crovir-transcript-391""","""ABCC6""","""ABCC6""",204,"""cvi-miR-181c-5p""","""miR-181c-5p""","""oha-miR-181c-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""AACATTCATTCTGTCGGTGGGTT""","""CCAAAACTAAAAAGTCACAATCAACATTCA…"
"""Cluster_924""","""scaffold-ma4""",17843958,17843980,"""+""",22,"""scaffold-ma4""",80627638,80628038,400,149.0,-22.85,149.0,-22.85,9631470,""" 42""","""Liftoff""","""CDS""","""-""","""ID=CDS_106455;Parent=maker-sca…","""CDS_106455""","""maker-scaffold-ma4-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3847""","""GPR161""","""GPR161""",125,"""cvi-miR-18a-3p""","""miR-18a-3p""","""oha-miR-18a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""ACTGCCCTAAGTGCTCCTTCTG""","""ACAATAACTGCTTTTTGTTCTAAGGTGCAT…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_1846""","""scaffold-un11""",475610,475632,"""+""",22,"""scaffold-mi4""",6104964,6105074,110,148.0,-14.22,148.0,-14.22,17687847,""" 46""","""Liftoff""","""CDS""","""+""","""ID=CDS_109322;Parent=maker-sca…","""CDS_109322""","""maker-scaffold-mi4-augustus-ge…","""CV1087_viridis""","""maker-scaffold-mi4-augustus-ge…","""maker-scaffold-mi4-augustus-ge…","""crovir-transcript-83""","""RAB6B""","""RAB6B""",40675,"""cvi-let-7a-5p-1""","""let-7a-5p""","""oha-let-7a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGAGGTAGTAGGTTGTATAGTT""","""CGATTTAGACTGTCCTTTGGGGTGAGGTAG…"
"""Cluster_654""","""scaffold-ma2""",183760035,183760057,"""+""",22,"""scaffold-ma1""",220662895,220663001,106,141.0,-18.09,141.0,-18.09,6846493,""" 67""","""Liftoff""","""CDS""","""-""","""ID=CDS_35537;Parent=maker-scaf…","""CDS_35537""","""maker-scaffold-ma1-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9027""","""SNX3""","""SNX3""",3931,"""cvi-miR-24-3p""","""miR-24-3p""","""oha-miR-24-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGGCTCAGTTCAGCAGGAACAG""","""GTTGATGGACCCGTCCTCCGGTGCCTACTG…"
"""Cluster_83""","""scaffold-ma1""",65768523,65768545,"""-""",22,"""scaffold-ma5""",62832946,62833098,152,141.0,-14.83,141.0,-14.83,407417,""" 27""","""Liftoff""","""CDS""","""+""","""ID=CDS_119572;Parent=maker-sca…","""CDS_119572""","""maker-scaffold-ma5-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10768""","""MFF""","""MFF""",37,"""cvi-miR-129a-5p""","""miR-129a-5p""","""oha-miR-129a-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""CTTTTTGCGGTCTGGGCTTGCT""","""AGACGTTGTCCTTCGCGAATCTTTTTGCGG…"
"""Cluster_235""","""scaffold-ma1""",204856224,204856247,"""-""",23,"""scaffold-ma7""",19173119,19173301,182,144.0,-15.74,144.0,-15.74,2385477,""" 106""","""Liftoff""","""CDS""","""+""","""ID=CDS_136230;Parent=maker-sca…","""CDS_136230""","""maker-scaffold-ma7-augustus-ge…","""CV1087_viridis""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4442""","""FBXW7""","""FBXW7.1""",47929,"""cvi-miR-30c-5p""","""miR-30c-5p""","""oha-miR-30c-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""TGTAAACATCCTACACTCTCAGC""","""TAAATCCAAGTGGTAGAGAGTGTAAACATC…"


### Save the sample data
Save each sample DataFrame as a parquet file.

In [101]:
# CV0857: write the sample's target-and-count table to disk as parquet.
# Parquet is used instead of CSV (write_csv) because the parquet file came out
# about an order of magnitude smaller than the equivalent CSV.
CV0857_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV0857_target_and_count_data.2025.01.22.parquet'
)


In [102]:
# CV0985: write the sample's target-and-count table to disk as parquet.
CV0985_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV0985_target_and_count_data.2025.01.22.parquet'
)

In [103]:
# CV0987: write the sample's target-and-count table to disk as parquet.
CV0987_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV0987_target_and_count_data.2025.01.22.parquet'
)

In [104]:
# CV1081: write the sample's target-and-count table to disk as parquet.
CV1081_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV1081_target_and_count_data.2025.01.22.parquet'
)

In [105]:
# CV1082: write the sample's target-and-count table to disk as parquet.
CV1082_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV1082_target_and_count_data.2025.01.22.parquet'
)

In [106]:
# CV1086: write the sample's target-and-count table to disk as parquet.
CV1086_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV1086_target_and_count_data.2025.01.22.parquet'
)

In [107]:
# CV1087: write the sample's target-and-count table to disk as parquet.
CV1087_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/CV1087_target_and_count_data.2025.01.22.parquet'
)

### Concatenate all 7 samples and save as a single parquet file

In [108]:
# Stack the seven per-sample tables into one DataFrame and remove any rows
# that are exact duplicates of each other.
all_miRNA_samples_df = pl.concat(
    [CV0857_df, CV0985_df, CV0987_df, CV1081_df, CV1082_df, CV1086_df, CV1087_df]
).unique()

# Write the combined table to disk as parquet.
# NOTE(review): this file name is dated 2025.01.20 while the per-sample files
# are dated 2025.01.22 - confirm the difference is intentional.
all_miRNA_samples_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Samples/all_samples_target_and_count_data.2025.01.20.parquet'
)
all_miRNA_samples_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,ID,transcript_id,sample_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_659""","""scaffold-ma2""",190152946,190152968,"""+""",22,"""scaffold-Z""",2378200,2378302,102,162.0,-16.91,162.0,-16.91,6962650,""" 15""","""Liftoff""","""CDS""","""-""","""ID=CDS_319;Parent=maker-scaffo…","""CDS_319""","""maker-scaffold-Z-augustus-gene…","""CV1082_viridis""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-1729""","""CTDSPL""","""CTDSPL""",5,"""Cluster_659""","""miR-99a-5p""","""oha-miR-99a-5p""","""De-Novo""",100.0,4.5,20.3,"""CGATCTTGTGTTACTTTAGGCC""","""TTTGTTACAGAGAACAGAGGCCTAAAGTAG…"
"""Cluster_451""","""scaffold-ma2""",46083770,46083792,"""-""",22,"""scaffold-ma2""",135542855,135543137,282,141.0,-8.65,141.0,-8.65,3846344,""" 1""","""Liftoff""","""CDS""","""-""","""ID=CDS_69393;Parent=maker-scaf…","""CDS_69393""","""maker-scaffold-ma2-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14911""","""CDC37""","""CDC37""",667,"""cvi-miR-135-5p-1""","""miR-135-5p""","""oha-miR-135-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TATGGCTTTTTATTCCTATGTG""","""ACCATTATCCCATTGTCTTCTATGGCTTTT…"
"""Cluster_196""","""scaffold-ma1""",168876324,168876346,"""+""",22,"""scaffold-ma1""",137582192,137582435,243,140.0,-21.14,140.0,-21.14,1995625,""" 32""","""Liftoff""","""CDS""","""+""","""ID=CDS_28114;Parent=maker-scaf…","""CDS_28114""","""maker-scaffold-ma1-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-8302""","""CDCA7""","""CDCA7""",182,"""Cluster_196""","""miR-955-3p""","""dme-miR-955-3p""","""De-Novo""",100.0,4.5,20.3,"""TGGGCTATTTCTGAACATAAGT""","""TACCAATGAGTTATCTGGCAGTTGGGCTAT…"
"""Cluster_794""","""scaffold-ma3""",83692475,83692497,"""+""",22,"""scaffold-ma1""",127062563,127062668,105,150.0,-25.75,150.0,-25.75,7745514,""" 24""","""Liftoff""","""CDS""","""-""","""ID=CDS_26775;Parent=augustus_m…","""CDS_26775""","""augustus_masked-scaffold-ma1-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma1-p…","""augustus_masked-scaffold-ma1-p…","""crovir-transcript-8266""","""NEB""","""NEB.1""",1147,"""cvi-miR-214-5p""","""miR-214-5p""","""pbv-miR-214-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGCCTGTCTACACTTGCTGTGC""","""CTGGATAGAGTTGTCACGTGTCTGCCTGTC…"
"""Cluster_193""","""scaffold-ma1""",168276927,168276949,"""-""",22,"""scaffold-ma6""",57533777,57538205,4428,283.0,-27.83,143.0,-15.02,128648,""" 1080 3276""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_8585;Parent…","""three_prime_utr_8585""","""augustus_masked-scaffold-ma6-p…","""CV0987_lutosus""","""augustus_masked-scaffold-ma6-p…","""augustus_masked-scaffold-ma6-p…","""crovir-transcript-12216""","""KDM7A""","""KDM7A""",6903,"""cvi-miR-206""","""miR-206""","""oha-miR-206""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGGAATGTAAGGAAGTGTGTGG""","""GAATTTTTCTTTTTGAGACAACACACTTCT…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_833""","""scaffold-ma3""",122669751,122669773,"""+""",22,"""scaffold-Z""",85024349,85025839,1490,303.0,-17.19,157.0,-9.55,545091,""" 1196 348""","""Liftoff""","""three_prime_utr""","""-""","""ID=three_prime_utr_555;Parent=…","""three_prime_utr_555""","""maker-scaffold-Z-augustus-gene…","""CV0987_lutosus""","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2594""","""YTHDC1""","""YTHDC1""",50734,"""cvi-miR-101a-3p""","""miR-101a-3p""","""oha-miR-101a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""GTACAGTACTGTGATAACTGAA""","""ATTGACAGGCTGCCCTGGCTCAGTTATCAC…"
"""Cluster_855""","""scaffold-ma3""",139201720,139201742,"""+""",22,"""scaffold-ma7""",47007849,47008856,1007,140.0,-5.22,140.0,-5.22,572679,""" 234""","""Liftoff""","""three_prime_utr""","""+""","""ID=three_prime_utr_9312;Parent…","""three_prime_utr_9312""","""maker-scaffold-ma7-augustus-ge…","""CV0985_concolor""","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4665""","""BOD1L1""","""BOD1L1.1""",61787,"""cvi-let-7a-2-3p""","""let-7a-2-3p""","""pbv-let-7a-2-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""CTATACAATCTACTGTCTTTCC""","""AACAATGCTGTCCTTGAGGCTGAGGTAGTA…"
"""Cluster_919""","""scaffold-ma4""",10560708,10560730,"""-""",22,"""scaffold-mi7""",1857296,1857535,239,140.0,-11.2,140.0,-11.2,9520609,""" 108""","""Liftoff""","""CDS""","""-""","""ID=CDS_130356;Parent=maker-sca…","""CDS_130356""","""maker-scaffold-mi7-augustus-ge…","""CV1082_viridis""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1115""","""WRAP73""","""WRAP73.1""",1,"""Cluster_919""","""miR-9572-5p""","""aca-miR-9572-5p""","""De-Novo""",100.0,4.5,20.3,"""ACAACCTGACAGACTGGAGATA""","""ATTAAAAAACGTCTTGTCTATCGTTCCCGT…"
"""Cluster_1339""","""scaffold-Z""",94331990,94332012,"""+""",22,"""scaffold-mi7""",3567469,3567575,106,148.0,-10.52,148.0,-10.52,13153239,""" 78""","""Liftoff""","""CDS""","""-""","""ID=CDS_131073;Parent=maker-sca…","""CDS_131073""","""maker-scaffold-mi7-augustus-ge…","""CV0987_lutosus""","""maker-scaffold-mi7-augustus-ge…","""maker-scaffold-mi7-augustus-ge…","""crovir-transcript-1212""","""SAE1""","""SAE1""",68,"""cvi-miR-129b-5p""","""miR-129b-5p""","""oha-miR-129b-5p""","""Potential-Identity""",100.0,0.000001,42.1,"""CTTTTTGCGGTCTGGGCTTTCT""","""CAATGACAGGTCTTCTCTGGGTCTTTTTGC…"


### Count the number of miRNAs for each sample, gene, and feature type

In [109]:
# Count the miRNA rows in each (sample, target chromosome, gene, feature type)
# group. The result goes into a new DataFrame, so all_miRNA_samples_df itself
# is left unchanged.
# NOTE(review): count() tallies rows of miRNA_cluster_original per group, not
# distinct clusters - confirm that is the intended meaning of number_of_miRNAs.
all_miRNA_samples_df2 = all_miRNA_samples_df.group_by(
    'sample_id', 'miRNA_target_chrom', 'genes', 'feature_type'
).agg(
    number_of_miRNAs=pl.col('miRNA_cluster_original').count()
)
# Note the number of rows should be 219,179

# Write the per-sample/gene/feature-type counts to disk as parquet.
all_miRNA_samples_df2.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Number_of_miRNAs_per_sample/all_miRNA_numbers_per_sample-gene-feature_type.2025.01.22.parquet'
)
all_miRNA_samples_df2

sample_id,miRNA_target_chrom,genes,feature_type,number_of_miRNAs
str,str,str,str,u32
"""CV0857_viridis""","""scaffold-ma5""","""NPR1.1""","""CDS""",64
"""CV1087_viridis""","""scaffold-ma1""","""trnascan-scaffold-ma1-noncodin…","""CDS""",2
"""CV1081_viridis""","""scaffold-ma3""","""MBOAT1""","""CDS""",16
"""CV0985_concolor""","""scaffold-ma2""","""SGCD""","""CDS""",6
"""CV0985_concolor""","""scaffold-Z""","""TGFBR2""","""five_prime_utr""",3
…,…,…,…,…
"""CV0857_viridis""","""scaffold-Z""","""LOC102723407""","""CDS""",7
"""CV1086_viridis""","""scaffold-ma1""","""FMO3""","""CDS""",21
"""CV1087_viridis""","""scaffold-un13""","""maker-scaffold-un13-augustus-g…","""CDS""",19
"""CV1082_viridis""","""scaffold-mi3""","""trnascan-scaffold-mi3-noncodin…","""CDS""",2


#### Create a version that excludes feature type as a grouping variable

In [None]:
# Count the miRNA rows in each (sample, target chromosome, gene) group,
# pooling all feature types together. The result goes into a new DataFrame,
# so all_miRNA_samples_df itself is left unchanged.
# No columns are dropped beforehand: group_by(...).agg(...) only returns the
# grouping keys plus the aggregated column, so feature_type and the
# score/energy/position columns never reach the output anyway.
no_feature_type_df = (
    all_miRNA_samples_df
    .group_by(['sample_id', 'miRNA_target_chrom', 'genes'])
    .agg(pl.col('miRNA_cluster_original').count().alias('number_of_miRNAs'))
)

# Save as parquet
no_feature_type_df.write_parquet('Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Number_of_miRNAs_per_sample/all_miRNA_numbers_per_sample-gene.2025.01.22.parquet')
no_feature_type_df

sample_id,miRNA_target_chrom,genes,number_of_miRNAs
str,str,str,u32
"""CV1086_viridis""","""scaffold-ma2""","""TRNT1""",37
"""CV1087_viridis""","""scaffold-mi9""","""GSTT1.2""",22
"""CV1086_viridis""","""scaffold-ma2""","""SHQ1""",37
"""CV1081_viridis""","""scaffold-ma2""","""MAP2K6""",19
"""CV0987_lutosus""","""scaffold-ma5""","""RYK""",47
…,…,…,…
"""CV1086_viridis""","""scaffold-ma4""","""ST3GAL5""",17
"""CV1081_viridis""","""scaffold-ma1""","""maker-scaffold-ma1-augustus-ge…",33
"""CV0987_lutosus""","""scaffold-ma5""","""maker-scaffold-ma5-augustus-ge…",82
"""CV1082_viridis""","""scaffold-mi7""","""GTF3C5""",27


### Create a slightly more conservative data set for the number of miRNAs per sample (total_score >= 155 and total_energy <= -7)

In [111]:
# Keep only the stronger predicted interactions (total_score >= 155 and
# total_energy <= -7), then count the miRNA rows in each
# (sample, target chromosome, gene, feature type) group.
# The result is stored under its own name: this table IS grouped by
# feature_type, so it no longer borrows the name no_feature_type_df2, which
# the following cell assigns to the table without feature_type.
filtered_per_feature_type_df = (
    all_miRNA_samples_df
    .filter((pl.col('total_score') >= 155) & (pl.col('total_energy') <= -7))
    .group_by(['sample_id', 'miRNA_target_chrom', 'genes', 'feature_type'])
    .agg(pl.col('miRNA_cluster_original').count().alias('number_of_miRNAs'))
)
# Note the number of rows should be 159315

# Save as parquet
filtered_per_feature_type_df.write_parquet('Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Number_of_miRNAs_per_sample/filtered_miRNA_numbers_per_sample-gene-feature_type.2025.01.22.parquet')
filtered_per_feature_type_df

sample_id,miRNA_target_chrom,genes,feature_type,number_of_miRNAs
str,str,str,str,u32
"""CV1082_viridis""","""scaffold-ma4""","""CRYL1""","""CDS""",1
"""CV0857_viridis""","""scaffold-ma1""","""CSTF3""","""three_prime_utr""",4
"""CV1086_viridis""","""scaffold-mi9""","""DDX51""","""three_prime_utr""",3
"""CV1086_viridis""","""scaffold-ma1""","""GYPC""","""three_prime_utr""",10
"""CV1081_viridis""","""scaffold-ma3""","""E2F5""","""three_prime_utr""",21
…,…,…,…,…
"""CV1081_viridis""","""scaffold-mi7""","""PPP5C""","""CDS""",1
"""CV1082_viridis""","""scaffold-ma2""","""TNFSF14""","""three_prime_utr""",11
"""CV1082_viridis""","""scaffold-ma3""","""CLPS""","""three_prime_utr""",10
"""CV0985_concolor""","""scaffold-ma3""","""PTPN7""","""five_prime_utr""",1


#### Create a version that excludes feature type as a grouping variable

In [112]:
# Keep only the stronger predicted interactions (total_score >= 155 and
# total_energy <= -7), then count the miRNA rows in each
# (sample, target chromosome, gene) group, pooling all feature types together.
# The filter must run before aggregation because it reads total_score and
# total_energy. No explicit drop is needed afterwards: group_by(...).agg(...)
# only returns the grouping keys plus the aggregated column.
no_feature_type_df2 = (
    all_miRNA_samples_df
    .filter((pl.col('total_score') >= 155) & (pl.col('total_energy') <= -7))
    .group_by(['sample_id', 'miRNA_target_chrom', 'genes'])
    .agg(pl.col('miRNA_cluster_original').count().alias('number_of_miRNAs'))
)

# Save as parquet
no_feature_type_df2.write_parquet('Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/Number_of_miRNAs_per_sample/filtered_miRNA_numbers_per_sample-gene.2025.01.22.parquet')
no_feature_type_df2

sample_id,miRNA_target_chrom,genes,number_of_miRNAs
str,str,str,u32
"""CV1087_viridis""","""scaffold-un540""","""maker-scaffold-un540-augustus-…",7
"""CV0857_viridis""","""scaffold-ma3""","""PLPPR5""",2
"""CV0857_viridis""","""scaffold-ma3""","""ELOC""",6
"""CV1082_viridis""","""scaffold-ma7""","""PLRG1""",10
"""CV0985_concolor""","""scaffold-ma6""","""PLBD1""",6
…,…,…,…
"""CV1081_viridis""","""scaffold-Z""","""MPLKIP""",1
"""CV0857_viridis""","""scaffold-Z""","""RPL13A""",1
"""CV0857_viridis""","""scaffold-Z""","""PRKAG2""",1
"""CV0857_viridis""","""scaffold-mi7""","""SNAP25.2""",3


## Format miRNA target data for the reference genome

### Format the miRNA target data for the reference with the second version of the function

In [113]:
# Use the processing function
reference_df = proccess_bedtools_and_miRanda2(
    miranda_paths=ref_miranda_tabs,
    bedtools_paths=ref_bed_files,
    converion_dataframe=conversion_table,
    genome_type='reference'
)
reference_df

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,transcript_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str
"""Cluster_1105""","""scaffold-ma5""",86969921,86969943,"""-""",22,"""scaffold-ma1""",11793483,11795320,1837,598.0,-41.84,155.0,-12.03,11523007,""" 104 337 1006 1743""","""maker""","""CDS""","""+""","""Parent=maker-scaffold-ma1-augu…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-6926""","""FANCM""","""FANCM"""
"""Cluster_1135""","""scaffold-ma6""",43370039,43370061,"""+""",22,"""scaffold-ma2""",132216793,132216907,114,141.0,-11.65,141.0,-11.65,11728830,""" 29""","""maker""","""CDS""","""+""","""Parent=maker-scaffold-ma2-augu…","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-14801""","""TSR2""","""TSR2"""
"""Cluster_1292""","""scaffold-Z""",57906901,57906923,"""-""",22,"""scaffold-un76""",78291,78524,233,152.0,-22.74,152.0,-22.74,12415970,""" 152""","""maker""","""CDS""","""-""","""Parent=maker-scaffold-un76-aug…","""maker-scaffold-un76-augustus-g…","""maker-scaffold-un76-augustus-g…","""maker-scaffold-un76-augustus-g…","""NA""","""maker-scaffold-un76-augustus-g…","""maker-scaffold-un76-augustus-g…"
"""Cluster_476""","""scaffold-ma2""",74469087,74469110,"""+""",23,"""scaffold-ma5""",73596269,73596354,85,147.0,-16.19,147.0,-16.19,4209383,""" 48""","""maker""","""CDS""","""-""","""Parent=maker-scaffold-ma5-augu…","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10881""","""XP_006716231""","""XP_006716231"""
"""Cluster_593""","""scaffold-ma2""",136791118,136791141,"""+""",23,"""scaffold-ma5""",41852556,41852723,167,151.0,-10.06,151.0,-10.06,6022674,""" 1""","""maker""","""CDS""","""+""","""Parent=maker-scaffold-ma5-augu…","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""maker-scaffold-ma5-augustus-ge…","""crovir-transcript-10494""","""NPR1""","""NPR1.1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_1863""","""scaffold-un31""",31144,31166,"""-""",22,"""scaffold-Z""",75556560,75556916,356,155.0,-11.91,155.0,-11.91,1200433,""" 169""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-Z-august…","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2431""","""IMMT""","""IMMT"""
"""Cluster_1310""","""scaffold-Z""",69496303,69496324,"""-""",21,"""scaffold-un12""",137600,139276,1676,160.0,-15.14,160.0,-15.14,836799,""" 769""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-un12-aug…","""maker-scaffold-un12-augustus-g…","""maker-scaffold-un12-augustus-g…","""maker-scaffold-un12-augustus-g…","""NA""","""maker-scaffold-un12-augustus-g…","""maker-scaffold-un12-augustus-g…"
"""Cluster_1337""","""scaffold-Z""",93572587,93572609,"""+""",22,"""scaffold-ma2""",53560663,53561251,588,145.0,-18.34,145.0,-18.34,861141,""" 555""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-ma2-augu…","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""maker-scaffold-ma2-augustus-ge…","""crovir-transcript-13763""","""FAF2""","""FAF2"""
"""Cluster_440""","""scaffold-ma2""",40590314,40590336,"""+""",22,"""scaffold-ma2""",84187106,84189544,2438,284.0,-34.36,144.0,-17.4,236145,""" 1942 1738""","""maker""","""three_prime_utr""","""+""","""Parent=augustus_masked-scaffol…","""augustus_masked-scaffold-ma2-p…","""augustus_masked-scaffold-ma2-p…","""augustus_masked-scaffold-ma2-p…","""crovir-transcript-14078""","""TECR""","""TECR"""


In [114]:
# Create an intersection between the count and reference target data
intersect = list(set(miRNA_info_df.columns) & set(reference_df.columns))
print(intersect)

# Join
reference_df = (
    reference_df
    .join(miRNA_info_df, on = intersect, how='left')
    .unique()
)
print(reference_df.schema)
reference_df

['miRNA_length', 'miRNA_cluster_original']
Schema({'miRNA_cluster_original': String, 'miRNA_sequence_chrom': String, 'miRNA_start': Int64, 'miRNA_end': Int64, 'miRNA_strandedness': String, 'miRNA_length': Int64, 'miRNA_target_chrom': String, 'miRNA_target_start': Int64, 'miRNA_target_end': Int64, 'miRNA_target_length': Int64, 'total_score': Float64, 'total_energy': Float64, 'max_score': Float64, 'max_energy': Float64, 'strand': Int64, 'positions': String, 'assembler': String, 'feature_type': String, 'miRNA_target_strandedness': String, 'gff_id_info': String, 'transcript_id': String, 'gtf_gene': String, 'gtf_gene_trimmed': String, 'crovir_transcript': String, 'converted_id': String, 'genes': String, 'sample_id': String, 'miRNA_counts': Int64, 'miRNA_cluster': String, 'base_miRNA_name': String, 'best_miRNA_ortholog': String, 'miRNA_name_probability': String, 'blast_percent_identity': Float64, 'E.value': Float64, 'bit_score': Float64, 'miRNA_sequence': String, 'hairpin_sequence': String})

miRNA_cluster_original,miRNA_sequence_chrom,miRNA_start,miRNA_end,miRNA_strandedness,miRNA_length,miRNA_target_chrom,miRNA_target_start,miRNA_target_end,miRNA_target_length,total_score,total_energy,max_score,max_energy,strand,positions,assembler,feature_type,miRNA_target_strandedness,gff_id_info,transcript_id,gtf_gene,gtf_gene_trimmed,crovir_transcript,converted_id,genes,sample_id,miRNA_counts,miRNA_cluster,base_miRNA_name,best_miRNA_ortholog,miRNA_name_probability,blast_percent_identity,E.value,bit_score,miRNA_sequence,hairpin_sequence
str,str,i64,i64,str,i64,str,i64,i64,i64,f64,f64,f64,f64,i64,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,f64,f64,f64,str,str
"""Cluster_1911""","""scaffold-un619""",5287,5308,"""+""",21,"""scaffold-Z""",56485349,56486593,1244,140.0,-14.69,140.0,-14.69,18480156,""" 69""","""maker""","""CDS""","""+""","""Parent=maker-scaffold-Z-august…","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""maker-scaffold-Z-augustus-gene…","""crovir-transcript-2249""","""CDK13""","""CDK13""","""CV1086_viridis""",177,"""cvi-miR-129a-5p-1""","""miR-129a-5p""","""oha-miR-129a-5p""","""Very-Probable-Identity""",100.0,0.000001,42.1,"""CTTTTTGCGGTCTGGGCTTGC""","""CGGGAGATTTTCTCTCTCGGATCTTTTTGC…"
"""Cluster_406""","""scaffold-ma2""",7365313,7365336,"""+""",23,"""scaffold-ma4""",81665745,81666450,705,142.0,-7.75,142.0,-7.75,3135398,""" 53""","""maker""","""CDS""","""+""","""Parent=augustus_masked-scaffol…","""augustus_masked-scaffold-ma4-p…","""augustus_masked-scaffold-ma4-p…","""augustus_masked-scaffold-ma4-p…","""crovir-transcript-3874""","""ZDHHC23""","""ZDHHC23""","""CV1086_viridis""",15363,"""cvi-miR-191-5p""","""miR-191-5p""","""oha-miR-191-5p""","""Very-Probable-Identity""",100.0,8.5100e-8,46.1,"""CAACGGAATCCCAAAAGCAGCTG""","""TCTGCAAAGGCTTAAGAATGGGCAACGGAA…"
"""Cluster_919""","""scaffold-ma4""",10560708,10560730,"""-""",22,"""scaffold-ma1""",252237108,252238153,1045,140.0,-13.88,140.0,-13.88,627872,""" 105""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-ma1-augu…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-9432""","""GNG4""","""GNG4""","""CV0987_lutosus""",970,"""Cluster_919""","""miR-9572-5p""","""aca-miR-9572-5p""","""De-Novo""",100.0,4.5,20.3,"""ACAACCTGACAGACTGGAGATA""","""ATTAAAAAACGTCTTGTCTATCGTTCCCGT…"
"""Cluster_544""","""scaffold-ma2""",115351804,115351826,"""+""",22,"""scaffold-mi6""",9775321,9777649,2328,142.0,-14.86,142.0,-14.86,301191,""" 692""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-mi6-augu…","""maker-scaffold-mi6-augustus-ge…","""maker-scaffold-mi6-augustus-ge…","""maker-scaffold-mi6-augustus-ge…","""crovir-transcript-11575""","""CMTM3""","""CMTM3""","""CV1082_viridis""",45,"""cvi-miR-193-5p""","""miR-193-5p""","""oha-miR-193-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TGGGTCTTTGCGGGCGAGATGA""","""CGAGCAGCTGGGAGTTTGGGGCTGGGTCTT…"
"""Cluster_1756""","""scaffold-mi8""",3814794,3814816,"""+""",22,"""scaffold-ma4""",57826407,57829229,2822,147.0,-11.1,147.0,-11.1,1136521,""" 821""","""maker""","""three_prime_utr""","""-""","""Parent=maker-scaffold-ma4-augu…","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""maker-scaffold-ma4-augustus-ge…","""crovir-transcript-3625""","""TRAPPC10""","""TRAPPC10""","""CV0985_concolor""",13288,"""cvi-miR-16c-5p""","""miR-16c-5p""","""pbv-miR-16c-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAGCAGCACGTAAATACTGGAG""","""AGGTCTGCTGTCTGCTGTGCTTTAGCAGCA…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cluster_807""","""scaffold-ma3""",90552069,90552090,"""-""",21,"""scaffold-ma1""",82538679,82541415,2736,143.0,-16.08,143.0,-16.08,525901,""" 1297""","""maker""","""three_prime_utr""","""+""","""Parent=maker-scaffold-ma1-augu…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""maker-scaffold-ma1-augustus-ge…","""crovir-transcript-7651""","""SLC25A16""","""SLC25A16""","""CV0857_viridis""",22,"""Cluster_807""","""miR-147-5p""","""oha-miR-147-5p""","""De-Novo""",100.0,1.0,22.3,"""TCCGAGCTTCATTTCTGCAAC""","""GGGTTCAGCTCCTGCTGGGCTGCAGAAACA…"
"""Cluster_545""","""scaffold-ma2""",115401565,115401587,"""+""",22,"""scaffold-un16""",530768,530874,106,151.0,-16.67,151.0,-16.67,4691266,""" 1""","""maker""","""CDS""","""-""","""Parent=augustus_masked-scaffol…","""augustus_masked-scaffold-un16-…","""augustus_masked-scaffold-un16-…","""augustus_masked-scaffold-un16-…","""NA""","""augustus_masked-scaffold-un16-…","""augustus_masked-scaffold-un16-…","""CV1086_viridis""",79,"""cvi-miR-365a-3p""","""miR-365a-3p""","""oha-miR-365a-3p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TAATGCCCCTAAAAATCCTTAT""","""TGTGAGACAGCAAGAAAAATGAGGGACTTT…"
"""Cluster_1595""","""scaffold-mi3""",8030236,8030260,"""+""",24,"""scaffold-mi8""",4349138,4349262,124,140.0,-13.01,140.0,-13.01,15284097,""" 4""","""maker""","""CDS""","""-""","""Parent=maker-scaffold-mi8-augu…","""maker-scaffold-mi8-augustus-ge…","""maker-scaffold-mi8-augustus-ge…","""maker-scaffold-mi8-augustus-ge…","""crovir-transcript-16120""","""PLS3""","""PLS3""","""CV0857_viridis""",74,"""cvi-miR-147-5p""","""miR-147-5p""","""oha-miR-147-5p""","""Very-Probable-Identity""",100.0,2.3300e-8,48.1,"""TGGAATCATTTCTGCACAAACTAG""","""AAACGACACTCTCTGAATCTAGTGGAATCA…"
"""Cluster_131""","""scaffold-ma1""",103174546,103174568,"""+""",22,"""scaffold-ma7""",33422730,33422828,98,142.0,-10.19,142.0,-10.19,894524,""" 26""","""maker""","""CDS""","""+""","""Parent=maker-scaffold-ma7-augu…","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""maker-scaffold-ma7-augustus-ge…","""crovir-transcript-4556""","""VPS37A""","""VPS37A""","""CV0985_concolor""",174477,"""cvi-miR-26-5p-1""","""miR-26-5p""","""oha-miR-26-5p""","""Very-Probable-Identity""",100.0,3.0800e-7,44.1,"""TTCAAGTAATCCAGGATAGGCT""","""ATTTACAAAGGCTGTGGCTAGGTTCAAGTA…"


In [115]:
# Save the data as a parquet
reference_df.write_parquet(
    'Results/miRanda/miRanda_2025-01-12/Full_miRNA_data/miRNA_counts_for_reference/reference_target_and_count_data.2025.01.22.parquet'
)