In [1]:
import csv
from collections import defaultdict

def parse_gtf(file_path):
    """Collect exon coordinates per gene from a GTF file.

    Only records whose feature column (third column) equals ``"exon"`` are
    used. Lines starting with ``#``, blank lines, and lines with fewer than
    the nine tab-separated GTF columns are skipped instead of raising
    ``IndexError``.

    Args:
        file_path: Path of the GTF file to read.

    Returns:
        A ``defaultdict(list)`` mapping each ``gene_id`` to a list of
        ``(start, end)`` integer tuples, taken from columns four and five,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an exon record has a non-integer start or end.
    """
    genes = defaultdict(list)

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("#"):
                continue  # Skip header/comment lines

            # Strip only the line terminator so that empty columns are kept.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 9 or fields[2] != "exon":
                continue  # Blank, truncated, or non-exon record

            gene_id = None
            for attr in fields[8].split(';'):
                # Split on any run of whitespace so that extra spaces between
                # key and value do not yield an empty id, and compare the key
                # exactly so that look-alike keys (e.g. "gene_idx") are not
                # mistaken for gene_id.
                parts = attr.strip().split(None, 1)
                if len(parts) == 2 and parts[0] == 'gene_id':
                    gene_id = parts[1].strip().replace('"', '')
                    break

            if gene_id:
                genes[gene_id].append((int(fields[3]), int(fields[4])))

    return genes

def merge_intervals(intervals):
    """Collapse overlapping intervals into a sorted list of disjoint ones.

    Intervals are indexable pairs ``(start, end)``. Two intervals are fused
    when the later one starts at or before the end of the earlier one;
    intervals that merely abut, such as ``(1, 5)`` and ``(6, 10)``, are kept
    separate. The input is not modified.
    """
    result = []

    for current in sorted(intervals):
        if not result:
            result.append(current)
            continue

        last_start, last_end = result[-1][0], result[-1][1]
        if last_end < current[0]:
            # Gap between the previous interval and this one: start a new run.
            result.append(current)
        else:
            # Overlap: extend the previous interval if this one reaches further.
            result[-1] = (last_start, max(last_end, current[1]))

    return result

def calculate_gene_lengths(genes):
    """Return the merged exon length of every gene.

    For each ``gene_id`` in ``genes`` the exon intervals are merged with
    ``merge_intervals`` and the inclusive lengths (``end - start + 1``) of
    the merged intervals are summed, so overlapping exons are counted once.

    Args:
        genes: Mapping of gene_id to an iterable of ``(start, end)`` pairs.

    Returns:
        A dict mapping gene_id to its total length.
    """
    return {
        gene_id: sum(stop - begin + 1 for begin, stop in merge_intervals(exon_list))
        for gene_id, exon_list in genes.items()
    }

def main(gtf_file_path, output_file_path, delimiter=None):
    """Compute per-gene merged exon lengths from a GTF and write them out.

    Args:
        gtf_file_path: Path of the GTF file to read.
        output_file_path: Path of the table to write. An existing file is
            overwritten.
        delimiter: Column separator for the output. When ``None`` (the
            default) it is chosen from the output extension: a tab for
            ``.tsv``/``.tab`` and a comma for everything else, so the file
            content matches what its name advertises.

    The output has a ``gene_id,length`` header row followed by one row per
    gene.
    """
    genes = parse_gtf(gtf_file_path)
    gene_lengths = calculate_gene_lengths(genes)

    if delimiter is None:
        # Previously the separator was always a comma, even for ".tsv" paths.
        wants_tabs = str(output_file_path).lower().endswith(('.tsv', '.tab'))
        delimiter = '\t' if wants_tabs else ','

    # newline='' lets the csv module control line endings itself.
    with open(output_file_path, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=delimiter)
        writer.writerow(['gene_id', 'length'])
        writer.writerows(gene_lengths.items())

# Example usage: run the gene-length pipeline on the annotation GTF.
# Both paths are relative, so they resolve against the kernel's working
# directory; the result table is written next to the input file.
input_gtf_path = "../A_annotation/carcar_annotation_v5.gtf"
output_file_path = "../A_annotation/carcar_annotation_v5_gene_lengths.csv"
main(input_gtf_path, output_file_path)


In [2]:
import csv
from collections import defaultdict

def parse_gtf(file_path):
    """Collect exon coordinates per gene from a GTF file.

    Only records whose feature column (third column) equals ``"exon"`` are
    used. Lines starting with ``#``, blank lines, and lines with fewer than
    the nine tab-separated GTF columns are skipped instead of raising
    ``IndexError``.

    Args:
        file_path: Path of the GTF file to read.

    Returns:
        A ``defaultdict(list)`` mapping each ``gene_id`` to a list of
        ``(start, end)`` integer tuples, taken from columns four and five,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an exon record has a non-integer start or end.
    """
    genes = defaultdict(list)

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("#"):
                continue  # Skip header/comment lines

            # Strip only the line terminator so that empty columns are kept.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 9 or fields[2] != "exon":
                continue  # Blank, truncated, or non-exon record

            gene_id = None
            for attr in fields[8].split(';'):
                # Split on any run of whitespace so that extra spaces between
                # key and value do not yield an empty id, and compare the key
                # exactly so that look-alike keys (e.g. "gene_idx") are not
                # mistaken for gene_id.
                parts = attr.strip().split(None, 1)
                if len(parts) == 2 and parts[0] == 'gene_id':
                    gene_id = parts[1].strip().replace('"', '')
                    break

            if gene_id:
                genes[gene_id].append((int(fields[3]), int(fields[4])))

    return genes

def merge_intervals(intervals):
    """Collapse overlapping intervals into a sorted list of disjoint ones.

    Intervals are indexable pairs ``(start, end)``. Two intervals are fused
    when the later one starts at or before the end of the earlier one;
    intervals that merely abut, such as ``(1, 5)`` and ``(6, 10)``, are kept
    separate. The input is not modified.
    """
    result = []

    for current in sorted(intervals):
        if not result:
            result.append(current)
            continue

        last_start, last_end = result[-1][0], result[-1][1]
        if last_end < current[0]:
            # Gap between the previous interval and this one: start a new run.
            result.append(current)
        else:
            # Overlap: extend the previous interval if this one reaches further.
            result[-1] = (last_start, max(last_end, current[1]))

    return result

def calculate_gene_lengths(genes):
    """Return the merged exon length of every gene.

    For each ``gene_id`` in ``genes`` the exon intervals are merged with
    ``merge_intervals`` and the inclusive lengths (``end - start + 1``) of
    the merged intervals are summed, so overlapping exons are counted once.

    Args:
        genes: Mapping of gene_id to an iterable of ``(start, end)`` pairs.

    Returns:
        A dict mapping gene_id to its total length.
    """
    return {
        gene_id: sum(stop - begin + 1 for begin, stop in merge_intervals(exon_list))
        for gene_id, exon_list in genes.items()
    }

def main(gtf_file_path, output_file_path, delimiter=None):
    """Compute per-gene merged exon lengths from a GTF and write them out.

    Args:
        gtf_file_path: Path of the GTF file to read.
        output_file_path: Path of the table to write. An existing file is
            overwritten.
        delimiter: Column separator for the output. When ``None`` (the
            default) it is chosen from the output extension: a tab for
            ``.tsv``/``.tab`` and a comma for everything else, so the file
            content matches what its name advertises.

    The output has a ``gene_id,length`` header row followed by one row per
    gene.
    """
    genes = parse_gtf(gtf_file_path)
    gene_lengths = calculate_gene_lengths(genes)

    if delimiter is None:
        # Previously the separator was always a comma, even for ".tsv" paths.
        wants_tabs = str(output_file_path).lower().endswith(('.tsv', '.tab'))
        delimiter = '\t' if wants_tabs else ','

    # newline='' lets the csv module control line endings itself.
    with open(output_file_path, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=delimiter)
        writer.writerow(['gene_id', 'length'])
        writer.writerows(gene_lengths.items())

# Example usage: run the same gene-length pipeline on the transcriptome GTF.
# This rebinds the notebook-global names input_gtf_path / output_file_path.
# NOTE(review): the output path ends in ".tsv" — confirm that the separator
# main() writes matches what downstream readers of this file expect.
input_gtf_path = "../B_transcriptome/cc_transcriptome_all.gtf"
output_file_path = "../B_transcriptome/transcriptome_gene_lengths.tsv"
main(input_gtf_path, output_file_path)