In [None]:
# From https://www.phind.com/search?cache=oxjhksj83gvyo4mhm1vvwzrv

import pandas as pd

# Assuming genome_seq is a dictionary with chromosome names as keys and sequences as values
genome_seq = {
    'chr1': 'ACTG...',
    # Add other chromosomes here
}

# Load GTF file
gtf_file = 'path_to_your_gtf_file.gtf'
# GTF files have no header row; lines starting with '#' are metadata.
gtf_df = pd.read_csv(
    gtf_file,
    sep='\t',
    comment='#',
    header=None,
    names=[
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ],
    # Keep chromosome names as strings: without this, purely numeric names
    # (e.g. "1", "2") are parsed as integers and no longer match the string
    # keys of genome_seq.
    dtype={'seqname': str},
)

# Filter for CDS features
# .copy() makes cds_df an independent frame, so columns can be added to it
# without pandas raising SettingWithCopyWarning.
cds_df = gtf_df[gtf_df['feature'] == 'CDS'].copy()

# Function to calculate GC content for third codon positions
def calculate_gc3(sequence):
    """Return the GC fraction at the third codon positions of ``sequence``.

    ``sequence`` is read as consecutive codons starting at its first base,
    so it should already be in frame and in coding-strand orientation.
    Trailing bases that do not complete a codon are ignored. Matching is
    case-insensitive; any character other than G or C counts as non-GC but
    is still included in the denominator.

    Returns 0 when ``sequence`` contains no complete codon.
    """
    # Index 2 is the third base of the first codon; a step of 3 then lands on
    # the third base of every following codon.
    third_bases = sequence[2::3].upper()

    if not third_bases:
        return 0  # Avoid division by zero

    gc_count = third_bases.count('G') + third_bases.count('C')
    return gc_count / len(third_bases)

# Calculate GC3 values for each CDS
_COMPLEMENT = str.maketrans('ACGTacgt', 'TGCAtgca')


def _coding_sequence(seqname, start, end, strand, frame):
    """Return one CDS segment in coding orientation, trimmed to a codon start."""
    # GTF coordinates are 1-based and inclusive.
    segment = genome_seq[seqname][start - 1:end]

    # Minus-strand features are transcribed from the opposite strand, so the
    # coding sequence is the reverse complement of the genomic slice.
    if strand == '-':
        segment = segment.translate(_COMPLEMENT)[::-1]

    # The frame column gives the number of leading bases (0, 1 or 2) that
    # belong to a codon started in the previous segment. A missing value
    # ('.') is treated as 0.
    try:
        offset = int(frame)
    except (TypeError, ValueError):
        offset = 0
    if offset not in (0, 1, 2):
        offset = 0

    return segment[offset:]


# Work on an independent frame so the new column is not written to a slice.
cds_df = cds_df.copy()
# A list comprehension (rather than DataFrame.apply) also works when there are
# no CDS rows at all.
cds_df['gc3'] = [
    calculate_gc3(_coding_sequence(seqname, start, end, strand, frame))
    for seqname, start, end, strand, frame in zip(
        cds_df['seqname'],
        cds_df['start'],
        cds_df['end'],
        cds_df['strand'],
        cds_df['frame'],
    )
]

# Display results
print(cds_df[['seqname', 'start', 'end', 'gc3']])