In [7]:
from Bio import SeqIO
import pandas as pd
import re
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; assignments onto filtered frames will therefore be silent.
pd.options.mode.chained_assignment = None 

In [26]:
# Parse '5utr.fasta' once, collecting each record's id and its sequence (as a
# plain string) into two parallel lists.
with open('5utr.fasta') as fasta_file:  # context manager closes the handle
    identifiers_5utr = []
    sequences_5utr = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # lazily yields records
        identifiers_5utr.append(seq_record.id)
        sequences_5utr.append(str(seq_record.seq))

# Keep the text from "ENST" up to (but excluding) the last "." in each id.
# The pattern is a raw string so "\." reaches the regex engine as an escaped dot
# rather than being an invalid Python string escape.
# NOTE: an id that does not match makes re.search return None, and the [1]
# subscript then raises TypeError.
ident_cleaned_5utr = [re.search(r"(ENST.*)\.", x)[1] for x in identifiers_5utr]

In [27]:
# Parse '3utr.fasta' once, collecting each record's id and its sequence (as a
# plain string) into two parallel lists.
with open('3utr.fasta') as fasta_file:  # context manager closes the handle
    identifiers_3utr = []
    sequences_3utr = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # lazily yields records
        identifiers_3utr.append(seq_record.id)
        sequences_3utr.append(str(seq_record.seq))

# Keep the text from "ENST" up to (but excluding) the last "." in each id.
# The pattern is a raw string so "\." reaches the regex engine as an escaped dot
# rather than being an invalid Python string escape.
# NOTE: an id that does not match makes re.search return None, and the [1]
# subscript then raises TypeError.
ident_cleaned_3utr = [re.search(r"(ENST.*)\.", x)[1] for x in identifiers_3utr]

In [28]:
# Parse 'cds.fasta' once, collecting each record's id and its sequence (as a
# plain string) into two parallel lists.
with open('cds.fasta') as fasta_file:  # context manager closes the handle
    identifiers_cds = []
    sequences_cds = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # lazily yields records
        identifiers_cds.append(seq_record.id)
        sequences_cds.append(str(seq_record.seq))

# Keep the text from "ENST" up to (but excluding) the last "." in each id.
# The pattern is a raw string so "\." reaches the regex engine as an escaped dot
# rather than being an invalid Python string escape.
# NOTE: an id that does not match makes re.search return None, and the [1]
# subscript then raises TypeError.
ident_cleaned_cds = [re.search(r"(ENST.*)\.", x)[1] for x in identifiers_cds]

In [29]:
# One two-column frame per region: the cleaned transcript id plus that region's
# sequence.
# Each variable name now matches the lists it is built from: cds_df holds the
# CDS ids/sequences and utr3_df holds the 3utr ids/sequences (the two names
# were previously attached to each other's data).  Code that joins cds_df
# before utr3_df therefore yields the "cds" column ahead of "3utr".
utr5_df = pd.DataFrame({"EnsemblTranscriptID": ident_cleaned_5utr, "utr": sequences_5utr})
cds_df = pd.DataFrame({"EnsemblTranscriptID": ident_cleaned_cds, "cds": sequences_cds})
utr3_df = pd.DataFrame({"EnsemblTranscriptID": ident_cleaned_3utr, "3utr": sequences_3utr})

In [30]:
# Join the three frames on the transcript id, two at a time.  The default join
# type is inner, so only ids present in every frame remain.
merged_df = pd.merge(
    pd.merge(utr5_df, cds_df, on="EnsemblTranscriptID"),
    utr3_df,
    on="EnsemblTranscriptID",
)

In [35]:
# Persist the joined table one directory up; default options are used, so the
# row index is written as the first (unnamed) column.
merged_df.to_csv(path_or_buf="../gencodev19_seq.csv")

### Complete 5utr sequence file with coordinates

In [27]:
# Parse '5utr.fasta' again, this time also keeping each record's full
# description line, which is mined below for "range=" and "strand=" fields.
with open('5utr.fasta') as fasta_file:  # context manager closes the handle
    identifiers_5utr = []
    sequences_5utr = []
    descript_5utr = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # lazily yields records
        identifiers_5utr.append(seq_record.id)
        sequences_5utr.append(str(seq_record.seq))
        descript_5utr.append(seq_record.description)

# All patterns are raw strings so "\." and "\S" reach the regex engine intact
# instead of being invalid Python string escapes.
# NOTE: a record that lacks any of these fields makes re.search return None and
# the [1] subscript raise TypeError.

# Text from "ENST" up to (but excluding) the last "." of the id.
ident = [re.search(r"(ENST.*)\.", x)[1] for x in identifiers_5utr]
# Part of the "range=" token before its last ":".
chroms = [re.search(r"range=(\S*):", x)[1] for x in descript_5utr]
# Part of the "range=" token after its last ":", split on "-".  "\S*" keeps the
# match inside the range token; the previous ".*" could run on to a ":" that
# appears later in the description and return the wrong text.
ranges = [re.search(r"range=\S*:(\S*)", x)[1].split("-") for x in descript_5utr]
starts = [r[0] for r in ranges]  # kept as strings, exactly as parsed
stops = [r[1] for r in ranges]
# Single non-whitespace character following "strand=".
strands = [re.search(r"strand=(\S)", x)[1] for x in descript_5utr]

In [30]:
# Assemble the per-record lists into one table; every list must have the same
# length, one entry per FASTA record.
utr5_df = pd.DataFrame(
    data={
        "EnsemblTranscriptID": ident,
        "utr": sequences_5utr,
        "chr": chroms,
        "start": starts,
        "stop": stops,
        "strand": strands,
    }
)

In [31]:
# Persist the table one directory up; default options are used, so the row
# index is written as the first (unnamed) column.
utr5_df.to_csv(path_or_buf="../gencodev19_5utr_seq.csv")

### 5utr coordinates BED

In [2]:
# Load the tab-separated BED file; column names are supplied here, so the first
# line of the file is read as data.  The fifth column is given the name "?".
utr5_bed = pd.read_csv(
    "5utr.bed",
    sep="\t",
    names=[
        "chr",
        "start",
        "stop",
        "EnsemblTranscriptID",
        "?",
        "strand",
    ],
)

In [3]:
# Replace each name with the text from "ENST" up to (but excluding) its last ".".
# Raw-string pattern: "\." is an escaped dot for the regex engine, not an
# invalid Python string escape.
# NOTE: a name that does not match makes re.search return None and the [1]
# subscript raise TypeError.
utr5_bed["EnsemblTranscriptID"] = [
    re.search(r"(ENST.*)\.", name)[1] for name in utr5_bed["EnsemblTranscriptID"]
]

In [5]:
# Keep only rows whose "chr" is one of chr1..chr22, chrX or chrY.
# .copy() makes the result an independent frame, so adding columns to it later
# is a plain assignment rather than a write onto a slice of the unfiltered data.
utr5_bed = utr5_bed[
    utr5_bed.chr.isin([f"chr{n}" for n in range(1, 23)] + ["chrX", "chrY"])
].copy()

In [8]:
# (start, stop) pair for every row.
tuples = [(begin, end) for begin, end in zip(utr5_bed.start, utr5_bed.stop)]
# Wrap each pair in its own one-element list so that per-group lists can be
# concatenated afterwards.
utr5_bed["pos"] = [[pair] for pair in tuples]
# Composite grouping key: (transcript id, chromosome, strand).
utr5_bed["id_chr"] = [
    (transcript, chrom, strand)
    for transcript, chrom, strand in zip(
        utr5_bed.EnsemblTranscriptID, utr5_bed.chr, utr5_bed.strand
    )
]

In [10]:
# Group rows by the composite key and "sum" the pos column; since each pos value
# is a list, the sum joins them into one list of (start, stop) pairs per key.
utr5_bed_posaggreg = (
    utr5_bed.loc[:, ["pos", "id_chr"]]
    .groupby(by="id_chr")
    .agg({'pos': 'sum'})
)

In [11]:
# Split the grouped index entries back into separate columns by position:
# element 0 -> transcript id, element 1 -> chromosome, element 2 -> strand.
utr5_bed_posaggreg["EnsemblTranscriptID"] = [
    key[0] for key in utr5_bed_posaggreg.index
]
utr5_bed_posaggreg["chr"] = [
    key[1] for key in utr5_bed_posaggreg.index
]
utr5_bed_posaggreg["strand"] = [
    key[2] for key in utr5_bed_posaggreg.index
]

In [12]:
# Move the grouping key out of the index, then keep just the four output
# columns in their final order (the key column itself is dropped).
utr5_bed_posaggreg = utr5_bed_posaggreg.reset_index().loc[
    :, ["EnsemblTranscriptID", "chr", "pos", "strand"]
]

In [14]:
# Persist the aggregated positions one directory up; default options are used,
# so the row index is written as the first (unnamed) column.
utr5_bed_posaggreg.to_csv(path_or_buf="../gencodev19_5utr_pos.csv")

### Loss of function intolerant genes

In [119]:
# Load the tab-separated per-transcript metrics table; the header row supplies
# the column names.
gnomad_lof = pd.read_csv(
    filepath_or_buffer="gnomad.v2.1.1.lof_metrics.by_transcript.tsv",
    sep="\t",
)

In [122]:
# Narrow the table to the transcript id and the two metric columns used below.
gnomad_lof = gnomad_lof.loc[:, ["transcript", "oe_lof_upper", "pLI"]]

In [125]:
# Keep rows whose oe_lof_upper is strictly below 0.35 (rows with a missing
# value compare False and are dropped).
# NOTE(review): 0.35 is presumably the cutoff for "loss of function intolerant"
# named in the section heading; confirm the source of this threshold.
gnomad_lof = gnomad_lof.loc[gnomad_lof["oe_lof_upper"].lt(0.35)]

In [128]:
# Give the id column the same name the BED-derived frames use, so the tables
# can be joined and cross-filtered on it.
gnomad_lof = gnomad_lof.rename(
    {"transcript": "EnsemblTranscriptID"},
    axis="columns",
)

In [137]:
# Inner-join (the default) the filtered gnomAD rows with the BED rows on the
# transcript id; gnomad_lof is the left-hand table.
utr5_lof = pd.merge(gnomad_lof, utr5_bed, on="EnsemblTranscriptID")

In [139]:
# Re-read the BED file from disk so the columns are exactly the six named here.
utr5_bed = pd.read_csv(
    "5utr.bed",
    sep="\t",
    names=["chr", "start", "stop", "EnsemblTranscriptID", "?", "strand"],
)
# Replace each name with the text from "ENST" up to (but excluding) its last ".".
# Raw-string pattern: "\." is an escaped dot for the regex engine, not an
# invalid Python string escape.
# NOTE: a name that does not match makes re.search return None and the [1]
# subscript raise TypeError.
utr5_bed["EnsemblTranscriptID"] = [
    re.search(r"(ENST.*)\.", x)[1] for x in utr5_bed["EnsemblTranscriptID"]
]
# Keep only rows on chr1..chr22, chrX or chrY.
utr5_bed = utr5_bed[
    utr5_bed.chr.isin([f"chr{n}" for n in range(1, 23)] + ["chrX", "chrY"])
]
# Keep only transcripts that are present in the filtered gnomAD table.
utr5_bed = utr5_bed[utr5_bed.EnsemblTranscriptID.isin(gnomad_lof["EnsemblTranscriptID"])]
# Write tab-separated with no header row and no index column.
utr5_bed.to_csv("gencodev19_5utr_lof.bed", sep="\t", header=False, index=False)