In [3]:
# Load packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns 

In [4]:
import csv
from collections import defaultdict

def parse_gtf(file_path):
    """Collect exon coordinates per transcript from a GTF file.

    Lines starting with "#" are skipped, as are blank lines and records with
    fewer than nine tab-separated columns (such records have no attribute
    column and therefore no transcript_id). Only records whose third column
    is "exon" are used, and exons without a non-empty transcript_id
    attribute are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the GTF file to read.

    Returns
    -------
    collections.defaultdict
        Maps each transcript_id to a list of ``(start, end)`` integer tuples
        read from columns 4 and 5, in the order the exons appear in the file.

    Raises
    ------
    ValueError
        If column 4 or 5 of a used exon record is not an integer.
    """
    transcripts = defaultdict(list)

    with open(file_path, 'r') as gtf_file:
        for line in gtf_file:
            if line.startswith("#"):
                continue  # Skip header lines

            fields = line.strip().split('\t')
            # Blank or truncated lines cannot describe an exon with attributes.
            if len(fields) < 9 or fields[2] != "exon":
                continue

            # Extract transcript_id from the ';'-separated attribute column.
            transcript_id = None
            for attr in fields[8].split(';'):
                # Whitespace split tolerates repeated spaces between key and value.
                parts = attr.split()
                if len(parts) >= 2 and parts[0] == 'transcript_id':
                    transcript_id = parts[1].replace('"', '')
                    break

            if transcript_id:
                transcripts[transcript_id].append((int(fields[3]), int(fields[4])))

    return transcripts

def merge_intervals(intervals):
    """Return the intervals sorted and with overlapping ones fused.

    Two intervals are fused when the later one starts at or before the end
    of the earlier one; intervals that merely sit next to each other
    (e.g. ``(1, 5)`` and ``(6, 8)``) are kept separate.

    Parameters
    ----------
    intervals : iterable of (start, end)
        Intervals in any order.

    Returns
    -------
    list
        Non-overlapping intervals in ascending order.
    """
    ordered = list(intervals)
    ordered.sort()

    result = []
    for span in ordered:
        if result and span[0] <= result[-1][1]:
            # Overlaps the last kept interval: extend it instead of appending.
            last = result[-1]
            result[-1] = (last[0], max(last[1], span[1]))
        else:
            result.append(span)

    return result

def calculate_transcript_lengths(transcripts):
    """Compute the total exonic length of every transcript.

    Each transcript's exons are first passed through ``merge_intervals`` so
    overlapping exons are not counted twice; every merged interval then
    contributes ``end - start + 1`` bases (coordinates treated as inclusive).

    Parameters
    ----------
    transcripts : mapping
        transcript_id -> list of ``(start, end)`` exon tuples.

    Returns
    -------
    dict
        transcript_id -> summed length of its merged exons.
    """
    return {
        tid: sum(stop - begin + 1 for begin, stop in merge_intervals(exon_list))
        for tid, exon_list in transcripts.items()
    }

def main(gtf_file_path, output_file_path):
    """Write a tab-separated table of transcript lengths for a GTF file.

    The GTF is read with ``parse_gtf``, lengths are computed with
    ``calculate_transcript_lengths``, and the result is written with a
    ``Transcript_ID`` / ``Length`` header followed by one row per transcript.

    Parameters
    ----------
    gtf_file_path : str or path-like
        GTF file to read.
    output_file_path : str or path-like
        Destination TSV file; it is overwritten if it already exists.

    Returns
    -------
    None
    """
    transcripts = parse_gtf(gtf_file_path)
    transcript_lengths = calculate_transcript_lengths(transcripts)

    # newline='' lets the csv writer control row endings itself; without it,
    # platforms that translate "\n" on write would emit "\r\r\n".
    with open(output_file_path, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='\t')
        writer.writerow(['Transcript_ID', 'Length'])
        writer.writerows(transcript_lengths.items())

# Compute transcript lengths for the transcriptome GTF and save them as TSV.
input_gtf_path = "../B_transcriptome/cc_transcriptome_all.gtf"
output_file_path = "../B_transcriptome/transcriptome_transcript_lengths.tsv"
main(gtf_file_path=input_gtf_path, output_file_path=output_file_path)


In [5]:
import csv
from collections import defaultdict

def parse_gtf(file_path):
    """Collect exon coordinates per transcript from a GTF file.

    Lines starting with "#" are skipped, as are blank lines and records with
    fewer than nine tab-separated columns (such records have no attribute
    column and therefore no transcript_id). Only records whose third column
    is "exon" are used, and exons without a non-empty transcript_id
    attribute are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the GTF file to read.

    Returns
    -------
    collections.defaultdict
        Maps each transcript_id to a list of ``(start, end)`` integer tuples
        read from columns 4 and 5, in the order the exons appear in the file.

    Raises
    ------
    ValueError
        If column 4 or 5 of a used exon record is not an integer.
    """
    # NOTE(review): parse_gtf is also defined in an earlier cell of this
    # notebook; this later definition shadows it. Consider defining it once.
    transcripts = defaultdict(list)

    with open(file_path, 'r') as gtf_file:
        for line in gtf_file:
            if line.startswith("#"):
                continue  # Skip header lines

            fields = line.strip().split('\t')
            # Blank or truncated lines cannot describe an exon with attributes.
            if len(fields) < 9 or fields[2] != "exon":
                continue

            # Extract transcript_id from the ';'-separated attribute column.
            transcript_id = None
            for attr in fields[8].split(';'):
                # Whitespace split tolerates repeated spaces between key and value.
                parts = attr.split()
                if len(parts) >= 2 and parts[0] == 'transcript_id':
                    transcript_id = parts[1].replace('"', '')
                    break

            if transcript_id:
                transcripts[transcript_id].append((int(fields[3]), int(fields[4])))

    return transcripts

def merge_intervals(intervals):
    """Sort the intervals and collapse any that overlap.

    An interval is folded into the previously kept one unless it starts
    strictly after that one ends; the kept interval's end becomes the larger
    of the two ends.

    Parameters
    ----------
    intervals : iterable of (start, end)
        Intervals in any order.

    Returns
    -------
    list
        Non-overlapping intervals in ascending order.
    """
    result = []

    for candidate in sorted(intervals):
        if result:
            tail = result[-1]
            if not tail[1] < candidate[0]:
                # Candidate begins inside (or at the end of) the tail interval.
                result[-1] = (tail[0], max(tail[1], candidate[1]))
                continue
        result.append(candidate)

    return result

def calculate_transcript_lengths(transcripts):
    """Sum the merged exon lengths for each transcript.

    Exons are merged with ``merge_intervals`` before summing, and each merged
    interval counts ``end - start + 1`` bases (coordinates treated as
    inclusive).

    Parameters
    ----------
    transcripts : mapping
        transcript_id -> list of ``(start, end)`` exon tuples.

    Returns
    -------
    dict
        transcript_id -> total length of its merged exons.
    """
    lengths = {}

    for tid in transcripts:
        total = 0
        for first, last in merge_intervals(transcripts[tid]):
            total += last - first + 1
        lengths[tid] = total

    return lengths

def main(gtf_file_path, output_file_path):
    """Write a tab-separated table of transcript lengths for a GTF file.

    The GTF is read with ``parse_gtf``, lengths are computed with
    ``calculate_transcript_lengths``, and the result is written with a
    ``Transcript_ID`` / ``Length`` header followed by one row per transcript.

    Parameters
    ----------
    gtf_file_path : str or path-like
        GTF file to read.
    output_file_path : str or path-like
        Destination TSV file; it is overwritten if it already exists.

    Returns
    -------
    None
    """
    # NOTE(review): main is also defined in an earlier cell of this notebook;
    # this later definition shadows it. Consider defining it once.
    transcripts = parse_gtf(gtf_file_path)
    transcript_lengths = calculate_transcript_lengths(transcripts)

    # newline='' lets the csv writer control row endings itself; without it,
    # platforms that translate "\n" on write would emit "\r\r\n".
    with open(output_file_path, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='\t')
        writer.writerow(['Transcript_ID', 'Length'])
        writer.writerows(transcript_lengths.items())

# Compute transcript lengths for the v5 annotation GTF and save them as TSV.
input_gtf_path = "../A_annotation/carcar_annotation_v5.gtf"
output_file_path = "../A_annotation/carcar_annotation_v5_transcript_lengths.tsv"
main(gtf_file_path=input_gtf_path, output_file_path=output_file_path)
