Step 1
Date: 2024/12/19
The purpose of this script is to take the GFF created by gffread with the following command:

gffread -E /home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/Genome_files/CroVir_rnd1.all.maker.final.homologIDs.updatedNov2019_with_myos_geneidmod_edited_with_BPP.gtf -o /home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/Genome_files/Crotalus_viridis_annotation_with_BPP_and_myotoxin_2024.12.18.gff

and re-add the three_prime_utr and five_prime_utr features to it.

Next step: BCFtools_get_fasta_file_three_prime_utr_2024.12.20.sh (/home/administrator/Documents/Kaas/Venom_ncRNA_project/Scripts/BCFtools/BCFtools_get_fasta_file_three_prime_utr_2024.12.20.sh)
Previous step: None

In [2]:
# Import needed packages
import pandas as pd
import polars as pl
import os

In [3]:
# Both annotation files are read with the nine tab-separated GFF/GTF columns.
# The GTF copy uses 'type2' / 'attributes2' so those two columns stay distinct
# from the GFF's 'type' / 'attributes' once the frames are merged.
gtf_columns = ['seqid', 'source', 'type2', 'start', 'end', 'score', 'strand', 'phase', 'attributes2']
gff_columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

# Path of the original GTF (the file that was given to gffread)
gtf = '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/Genome_files/CroVir_rnd1.all.maker.final.homologIDs.updatedNov2019_with_myos_geneidmod_edited_with_BPP.gtf'

# Read the GTF: no header row, lines starting with '#' are skipped
gtf_df = pd.read_csv(gtf, sep='\t', comment='#', header=None, names=gtf_columns)

# Path of the GFF written by gffread
gff = '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/Genome_files/Crotalus_viridis_annotation_with_BPP_and_myotoxin_2024.12.18.gff'

# Read the GFF the same way
gff_df = pd.read_csv(gff, sep='\t', comment='#', header=None, names=gff_columns)

In [4]:
# Column names that exist in both frames, kept in the GFF's column order;
# these are used as the join keys when the two tables are merged
shared_cols = [column for column in gff_df.columns if column in gtf_df.columns]
print(shared_cols)

['seqid', 'source', 'start', 'end', 'score', 'strand', 'phase']


In [5]:
# Outer-join the GFF and GTF rows on their shared columns so that rows found
# in only one of the two files (e.g. the GTF's UTR rows) are kept as well
gxf_df = gff_df.merge(gtf_df, how='outer', on=shared_cols)
gxf_df

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,type2,attributes2
0,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1,gene,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m..."
1,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1,transcript,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m..."
2,PE-reconstructed-10x-myo,.,exon,1,746,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m..."
3,PE-reconstructed-10x-myo,.,exon,1640,1765,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m..."
4,PE-reconstructed-10x-myo,.,exon,1911,2076,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m..."
...,...,...,...,...,...,...,...,...,...,...,...
372379,scaffold-un945,maker,,3236,3291,.,+,.,,three_prime_utr,"gene_id ""maker-scaffold-un945-augustus-gene-0...."
372380,scaffold-un1500,maker,,778,780,.,-,.,,three_prime_utr,"gene_id ""augustus_masked-scaffold-un1500-proce..."
372381,scaffold-un2620,maker,,307,837,.,-,.,,three_prime_utr,"gene_id ""maker-scaffold-un2620-augustus-gene-0..."
372382,scaffold-un3951,maker,,1109,1281,.,-,.,,five_prime_utr,"gene_id ""maker-scaffold-un3951-augustus-gene-0..."


In [6]:
# Prepare the merged table for the UTR work:
#   1. drop rows whose GTF feature is 'gene' to reduce data complexity
#   2. remove exact duplicate rows (equivalent to distinct() in R)
#   3. extract gene_id from the GTF attribute string into its own column
#   4. give rows that have no GFF attributes (the GTF-only rows, such as the
#      UTRs) the GFF attributes of an exon that shares their gene_id
# 'type2' and 'attributes2' are kept on purpose: later cells filter on 'type2'.
gxf_df2 = (
    gxf_df
        .query('type2 != "gene"')
        .drop_duplicates()
        .assign(
            gene_id = lambda x: x['attributes2'].str.extract(r'gene_id "([^"]+)"', expand=False)
        )
)

# gene_id -> GFF attributes of an exon row of that gene.
# Exon rows without GFF attributes are dropped first so that a missing value
# can never replace a usable one in the lookup; when a gene_id has several
# exon rows the last one wins.
# NOTE(review): the lookup is per gene_id, so if one gene_id can cover several
# transcripts every filled row gets the same Parent= value -- confirm gene_id
# is transcript-specific in this annotation.
exon_attributes = (
    gxf_df2[gxf_df2['type2'] == 'exon']
        .dropna(subset=['attributes'])
        .set_index('gene_id')['attributes']
        .to_dict()
)

# Vectorised fill: only missing 'attributes' are touched, and a gene_id that is
# absent from the lookup leaves the value missing
gxf_df2['attributes'] = gxf_df2['attributes'].fillna(gxf_df2['gene_id'].map(exon_attributes))

In [7]:
gxf_df2

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,type2,attributes2,gene_id
1,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1,transcript,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
2,PE-reconstructed-10x-myo,.,exon,1,746,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
3,PE-reconstructed-10x-myo,.,exon,1640,1765,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
4,PE-reconstructed-10x-myo,.,exon,1911,2076,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
6,scaffold-Z,maker,transcript,5240,9278,.,-,.,ID=maker-scaffold-Z-augustus-gene-0.12-mRNA-1;...,transcript,"gene_id ""maker-scaffold-Z-augustus-gene-0.12_c...",maker-scaffold-Z-augustus-gene-0.12_crovir-tra...
...,...,...,...,...,...,...,...,...,...,...,...,...
372379,scaffold-un945,maker,,3236,3291,.,+,.,Parent=maker-scaffold-un945-augustus-gene-0.1-...,three_prime_utr,"gene_id ""maker-scaffold-un945-augustus-gene-0....",maker-scaffold-un945-augustus-gene-0.1
372380,scaffold-un1500,maker,,778,780,.,-,.,Parent=augustus_masked-scaffold-un1500-process...,three_prime_utr,"gene_id ""augustus_masked-scaffold-un1500-proce...",augustus_masked-scaffold-un1500-processed-gene...
372381,scaffold-un2620,maker,,307,837,.,-,.,Parent=maker-scaffold-un2620-augustus-gene-0.1...,three_prime_utr,"gene_id ""maker-scaffold-un2620-augustus-gene-0...",maker-scaffold-un2620-augustus-gene-0.1
372382,scaffold-un3951,maker,,1109,1281,.,-,.,Parent=maker-scaffold-un3951-augustus-gene-0.1...,five_prime_utr,"gene_id ""maker-scaffold-un3951-augustus-gene-0...",maker-scaffold-un3951-augustus-gene-0.1


In [8]:
# Define a function that takes a data frame and re-calculates ranges for the exon
def range_recalc(df1, df2):
    """Trim exon ranges in ``df1`` so they stop where a matching UTR in ``df2`` begins.

    An exon row (``type == 'exon'``) is paired with every UTR row (``type2`` of
    ``'three_prime_utr'`` or ``'five_prime_utr'``) that has the same ``seqid``,
    ``gene_id`` and ``source``. For each pair, using the exon's strand:

    * UTR on the high-coordinate side (3' UTR on '+', 5' UTR on '-'): when the
      exon starts before the UTR and both end on the same base, the exon's
      ``end`` is set to ``utr start - 1``.
    * UTR on the low-coordinate side (5' UTR on '+', 3' UTR on '-'): when both
      start on the same base and the exon ends after the UTR, the exon's
      ``start`` is set to ``utr end + 1``.

    Every comparison uses the exon coordinates as they were when the exon row
    was read, so an exon touching both a 5' and a 3' UTR is trimmed on both
    sides. Progress messages are printed for every matched pair.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Table holding the exon rows. Needs the columns ``seqid``, ``source``,
        ``type``, ``start``, ``end``, ``strand`` and ``gene_id``. It is
        modified in place.
    df2 : pandas.DataFrame
        Table holding the UTR rows. Needs ``seqid``, ``source``, ``type2``,
        ``start``, ``end`` and ``gene_id``. It is read once, up front.

    Returns
    -------
    pandas.DataFrame
        ``df1`` itself (the same object), with the adjusted ranges.
    """
    utr_types = ('three_prime_utr', 'five_prime_utr')

    # Which exon coordinate a UTR of a given kind sits on, per exon strand
    trimmed_coord = {
        ('three_prime_utr', '+'): 'end',
        ('three_prime_utr', '-'): 'start',
        ('five_prime_utr', '+'): 'start',
        ('five_prime_utr', '-'): 'end',
    }

    # Index the UTR rows once by (seqid, gene_id, source), keeping df2's row
    # order inside each bucket, instead of rescanning all of df2 per exon.
    utrs_by_key = {}
    for _, utr in df2.iterrows():
        if utr['type2'] not in utr_types:
            continue
        key = (utr['seqid'], utr['gene_id'], utr['source'])
        # A missing key part never compares equal to anything, so such rows
        # can never be matched to an exon
        if any(pd.isna(part) for part in key):
            continue
        utrs_by_key.setdefault(key, []).append(utr)

    for i, exon in df1.iterrows():
        if exon['type'] != 'exon':
            continue

        key = (exon['seqid'], exon['gene_id'], exon['source'])
        for utr in utrs_by_key.get(key, ()):
            print(f"Matching seqid found: {exon['seqid']}")

            coord = trimmed_coord.get((utr['type2'], exon['strand']))

            if coord == 'end':
                # UTR covers the last bases of the exon
                if exon['start'] < utr['start'] and exon['end'] == utr['end']:
                    df1.at[i, 'end'] = utr['start'] - 1
                    print(f"Adjusted end for row {i}: {df1.at[i, 'end']}")
                else:
                    print(f"Ranges do not overlap for row {i}")

            elif coord == 'start':
                # UTR covers the first bases of the exon
                if exon['start'] == utr['start'] and exon['end'] > utr['end']:
                    df1.at[i, 'start'] = utr['end'] + 1
                    print(f"Adjusted start for row {i}: {df1.at[i, 'start']}")
                else:
                    print(f"Ranges do not overlap for row {i}")

            # Any strand other than '+' or '-': nothing to adjust

    return df1

In [9]:
# # Filter out everything but myotoxin from gxf_df2
# gxf_myo_df = gxf_df2.query("gene_id == 'myotoxin1'")
# gxf_myo_df

# # Separate 3' UTRs from everything else
# three_utr_df = gxf_myo_df.query("type2 == 'three_prime_utr'").drop_duplicates() 
# # Separate 5' UTRs from everything else
# five_utr_df = gxf_myo_df.query("type2 == 'five_prime_utr'").drop_duplicates()

# # Get 3' and 5' UTRs
# three_five_utr_df = gxf_myo_df.query("type2 == 'three_prime_utr' or type2 == 'five_prime_utr'").drop_duplicates()

In [None]:
# # Use the function to re-calculate the start or end for the CDS
# three_five_utr_recalc_df = range_recalc(gxf_myo_df, three_five_utr_df)
# # print(type(three_five_utr_recalc_df))

In [None]:
# three_five_utr_df

In [None]:
# three_five_utr_recalc_df

In [None]:
# Rank every gene_id by the index label of the row where it first appears
first_rows = gxf_df2['gene_id'].drop_duplicates()
gene_order = first_rows.reset_index().set_index('gene_id')['index']
print(gene_order)

# Give each row the rank of its gene_id
row_order = gxf_df2['gene_id'].map(gene_order)

# Reorder the rows by that rank; the stable sort keeps rows of the same gene
# in their existing relative order
stable_positions = row_order.argsort(kind='stable')
gxf_df3 = gxf_df2.iloc[stable_positions]
gxf_df3


gene_id
myotoxin1                                                                    1
maker-scaffold-Z-augustus-gene-0.12_crovir-transcript-1688                   6
maker-scaffold-Z-augustus-gene-0.8_crovir-transcript-1686                   14
augustus_masked-scaffold-Z-processed-gene-0.1_crovir-transcript-1684        24
augustus_masked-scaffold-Z-processed-gene-0.2_crovir-transcript-1685        31
                                                                         ...  
augustus_masked-scaffold-un6484-processed-gene-0.0                      355120
augustus_masked-scaffold-un6633-processed-gene-0.0                      355127
augustus_masked-scaffold-un7034-processed-gene-0.0                      355134
myotoxin3                                                               355141
myotoxin2                                                               355147
Name: index, Length: 18585, dtype: int64
10563


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,type2,attributes2,gene_id
1,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1,transcript,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
2,PE-reconstructed-10x-myo,.,exon,1,746,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
3,PE-reconstructed-10x-myo,.,exon,1640,1765,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
4,PE-reconstructed-10x-myo,.,exon,1911,2076,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
355152,PE-reconstructed-10x-myo,.,,1,689,.,+,.,Parent=myotoxin_model_1,five_prime_utr,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
...,...,...,...,...,...,...,...,...,...,...,...,...
10625,scaffold-Z,maker,exon,50089151,50089234,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10626,scaffold-Z,maker,exon,50091196,50091332,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10627,scaffold-Z,maker,exon,50098875,50099096,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10628,scaffold-Z,maker,exon,50108507,50108665,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...


In [None]:

# Work on a leading slice of the gene-ordered table so the range recalculation
# can be checked in a reasonable amount of time.
# NOTE(review): the later cells build utrs_df and the saved GFF from gxf_df4,
# so the output file only contains this slice -- confirm that is intended.
percent = 3 # percent of rows to be taken from the data

# Number of rows that make up that percentage (rounded down)
fraction = percent / 100
num_rows = int(len(gxf_df3) * fraction)
print(num_rows)

# Keep the first num_rows rows
gxf_df4 = gxf_df3.iloc[:num_rows]
gxf_df4

In [28]:
# Collect the 3' and 5' UTR rows into their own data frame; this is the second
# argument handed to range_recalc.
# Note: the rows are taken from gxf_df4 (the slice), not from the full table.
utr_rows = gxf_df4.query("type2 in ['three_prime_utr', 'five_prime_utr']")
utrs_df = utr_rows.drop_duplicates()
utrs_df


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,type2,attributes2,gene_id
355152,PE-reconstructed-10x-myo,.,,1,689,.,+,.,Parent=myotoxin_model_1,five_prime_utr,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
355153,PE-reconstructed-10x-myo,.,,1923,2076,.,+,.,Parent=myotoxin_model_1,three_prime_utr,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
355154,scaffold-Z,maker,,5240,5320,.,-,.,Parent=maker-scaffold-Z-augustus-gene-0.12-mRNA-1,three_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-0.12_c...",maker-scaffold-Z-augustus-gene-0.12_crovir-tra...
355155,scaffold-Z,maker,,195400,195678,.,-,.,Parent=maker-scaffold-Z-augustus-gene-0.15-mRNA-1,three_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-0.15_c...",maker-scaffold-Z-augustus-gene-0.15_crovir-tra...
355156,scaffold-Z,maker,,399852,400362,.,+,.,Parent=maker-scaffold-Z-augustus-gene-1.4-mRNA-1,three_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-1.4_cr...",maker-scaffold-Z-augustus-gene-1.4_crovir-tran...
...,...,...,...,...,...,...,...,...,...,...,...,...
355639,scaffold-Z,maker,,49814289,49815107,.,+,.,Parent=maker-scaffold-Z-augustus-gene-166.5-mR...,five_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-166.5_...",maker-scaffold-Z-augustus-gene-166.5_crovir-tr...
355640,scaffold-Z,maker,,49882494,49885680,.,+,.,Parent=maker-scaffold-Z-augustus-gene-166.5-mR...,three_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-166.5_...",maker-scaffold-Z-augustus-gene-166.5_crovir-tr...
355641,scaffold-Z,maker,,49894593,49895219,.,-,.,Parent=maker-scaffold-Z-augustus-gene-166.6-mR...,three_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-166.6_...",maker-scaffold-Z-augustus-gene-166.6_crovir-tr...
355642,scaffold-Z,maker,,50028720,50028977,.,-,.,Parent=maker-scaffold-Z-augustus-gene-166.7-mR...,five_prime_utr,"gene_id ""maker-scaffold-Z-augustus-gene-166.7_...",maker-scaffold-Z-augustus-gene-166.7_crovir-tr...


In [25]:
# Trim the exon ranges in gxf_df4 against the UTR rows collected in utrs_df
recscaled_exons_df = range_recalc(df1=gxf_df4, df2=utrs_df)
recscaled_exons_df

Matching seqid found: PE-reconstructed-10x-myo
Adjusted start for row 2: 690
Matching seqid found: PE-reconstructed-10x-myo
Ranges do not overlap for row 2
Matching seqid found: PE-reconstructed-10x-myo
Ranges do not overlap for row 3
Matching seqid found: PE-reconstructed-10x-myo
Ranges do not overlap for row 3
Matching seqid found: PE-reconstructed-10x-myo
Ranges do not overlap for row 4
Matching seqid found: PE-reconstructed-10x-myo
Adjusted end for row 4: 1922
Matching seqid found: scaffold-Z
Adjusted start for row 7: 5321
Matching seqid found: scaffold-Z
Ranges do not overlap for row 8
Matching seqid found: scaffold-Z
Ranges do not overlap for row 9
Matching seqid found: scaffold-Z
Adjusted start for row 92: 195679
Matching seqid found: scaffold-Z
Ranges do not overlap for row 93
Matching seqid found: scaffold-Z
Ranges do not overlap for row 202
Matching seqid found: scaffold-Z
Ranges do not overlap for row 203
Matching seqid found: scaffold-Z
Adjusted end for row 204: 399851
Matc

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,type2,attributes2,gene_id
1,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1,transcript,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
2,PE-reconstructed-10x-myo,.,exon,690,746,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
3,PE-reconstructed-10x-myo,.,exon,1640,1765,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
4,PE-reconstructed-10x-myo,.,exon,1911,1922,.,+,.,Parent=myotoxin_model_1,exon,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
355152,PE-reconstructed-10x-myo,.,,1,689,.,+,.,Parent=myotoxin_model_1,five_prime_utr,"gene_id ""myotoxin1""; transcript_id ""myotoxin_m...",myotoxin1
...,...,...,...,...,...,...,...,...,...,...,...,...
10625,scaffold-Z,maker,exon,50089151,50089234,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10626,scaffold-Z,maker,exon,50091196,50091332,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10627,scaffold-Z,maker,exon,50098875,50099096,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...
10628,scaffold-Z,maker,exon,50108507,50108665,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...,exon,"gene_id ""maker-scaffold-Z-augustus-gene-167.13...",maker-scaffold-Z-augustus-gene-167.13_crovir-t...


In [37]:
# Turn the recalculated table into the final nine-column layout.
# Sort keys: the gene's first-occurrence rank, then a 0/1 flag that places the
# 'transcript' row ahead of the gene's other features.
is_not_transcript = (recscaled_exons_df['type2'] != 'transcript').astype(int)
sorted_features = (
    recscaled_exons_df
        .assign(order=row_order, type_priority=is_not_transcript)
        .sort_values(by=['order', 'type_priority'])
)

# Keep only the output columns; the GTF feature name ('type2') becomes 'type'
output_columns = ['seqid', 'source', 'type2', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
formated_recscaled_exons_df = (
    sorted_features
        .get(output_columns)
        .rename(columns={'type2': 'type'})
        .reset_index(drop=True)
)
formated_recscaled_exons_df


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,PE-reconstructed-10x-myo,.,transcript,1,2076,.,+,.,ID=myotoxin_model_1;geneID=myotoxin1
1,PE-reconstructed-10x-myo,.,exon,690,746,.,+,.,Parent=myotoxin_model_1
2,PE-reconstructed-10x-myo,.,exon,1640,1765,.,+,.,Parent=myotoxin_model_1
3,PE-reconstructed-10x-myo,.,exon,1911,1922,.,+,.,Parent=myotoxin_model_1
4,PE-reconstructed-10x-myo,.,five_prime_utr,1,689,.,+,.,Parent=myotoxin_model_1
...,...,...,...,...,...,...,...,...,...
10558,scaffold-Z,maker,exon,50089151,50089234,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...
10559,scaffold-Z,maker,exon,50091196,50091332,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...
10560,scaffold-Z,maker,exon,50098875,50099096,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...
10561,scaffold-Z,maker,exon,50108507,50108665,.,-,.,Parent=maker-scaffold-Z-augustus-gene-167.13-m...


In [30]:
# Write the table as a tab-separated file without header row or index column
output_gff = '/home/administrator/Documents/Kaas/Venom_ncRNA_project/Usable_data/Genome_files/Crotalus_viridis_annotation_with_BPP_and_myotoxin_with_three_and_five_prime_utrs_2024.12.18.gff'
formated_recscaled_exons_df.to_csv(output_gff, sep='\t', index=False, header=False)