In [None]:
#Redo Fig 3B

import pybedtools
import pandas as pd
import seaborn as sns
from matplotlib import colors as mcolors
import matplotlib.pyplot as plt
import os
import sys
from Bio import SeqIO
import numpy as np
import math

In [7]:
def get_filter_id_list(pop):
    """Return the IDs of the de novo transcripts kept for one population.

    Only the de novo transcripts from the paper are listed in the FASTA file,
    i.e. not what was an expressed TE according to TransposonUltimate.

    Parameters
    ----------
    pop : str
        Population label used in the FASTA file name
        (``FilteredTranscripts/{pop}_de_novo_transcripts.fasta``).

    Returns
    -------
    dict_keys
        Key view over the record IDs found in the FASTA file; supports ``in`` tests.
    """
    fasta_path = f"FilteredTranscripts/{pop}_de_novo_transcripts.fasta"
    records = SeqIO.parse(fasta_path, "fasta")
    # to_dict keys every record by its ID; only the keys are needed by callers.
    records_by_id = SeqIO.to_dict(records)
    return records_by_id.keys()

In [None]:
def run_pybedtools_intersect(seqFile, teFile, pop, Type, TEtool):
    """Intersect sequence intervals with a TE annotation and label every hit.

    Runs ``bedtools intersect -wao -s`` (A = ``seqFile``, B = ``teFile``) through
    pybedtools. ``-wao`` reports the overlap in bp as the last field and also
    emits A features without any overlap (overlap ``"0"``); ``-s`` restricts hits
    to the same strand.

    Parameters
    ----------
    seqFile : str
        Path to the BED file with the sequences of interest. Field 3 of every
        record is used as the sequence ID.
    teFile : str
        Path to the BED file with the TE annotation.
    pop : str
        Population label; used to load the accepted transcript IDs and stored
        in every output row.
    Type : str
        Region label stored in every output row. For ``"Intergenic"`` the
        accepted-ID filter is not applied.
    TEtool : str
        Annotation tool that produced ``teFile``; selects which field holds the
        TE class. One of ``"EarlGrey"``, ``"FasTE"``, ``"MCHelper"``,
        ``"RepeatModeler"``.

    Returns
    -------
    dict[int, list]
        Maps a running index (0, 1, ...) to
        ``[sequence ID, pop, Type, TE class, overlap in bp (as str)]``.

    Raises
    ------
    ValueError
        If a record overlaps a TE and ``TEtool`` is not one of the supported tools.
    """
    # BedTool opens any path; example_bedtool is the helper for pybedtools'
    # bundled test data and only worked here because the paths were absolute.
    a = pybedtools.BedTool(seqFile)
    b = pybedtools.BedTool(teFile)
    c = a.intersect(b, wao=True, s=True)

    # The ID filter is skipped for intergenic controls, so the FASTA only needs
    # to be parsed for the other region types.
    apply_filter = Type != "Intergenic"
    Accepted = get_filter_id_list(pop) if apply_filter else None

    Out = {}
    for elem in c:
        seq_id = elem[3]
        if apply_filter and seq_id not in Accepted:
            continue

        overlap = elem[-1]
        # The field indices below assume the column layout of the BED files
        # produced upstream for each tool -- TODO confirm if those files change.
        if overlap == "0":
            TEclass = "NoTE"
        elif TEtool == "EarlGrey":
            TEclass = elem[9].split("/")[-1]
        elif TEtool == "FasTE":
            TEclass = elem[12].split("__")[1].split("_")[-1]
        elif TEtool == "MCHelper":
            TEclass = elem[12] + "_" + elem[13]
        elif TEtool == "RepeatModeler":
            TEclass = elem[13]
        else:
            # Previously this fell through and either crashed with an
            # UnboundLocalError or silently reused the previous record's class.
            raise ValueError(
                f"Unknown TEtool {TEtool!r}; expected one of "
                "'EarlGrey', 'FasTE', 'MCHelper', 'RepeatModeler'"
            )

        Out[len(Out)] = [seq_id, pop, Type, TEclass, overlap]

    return Out

In [None]:
def run_for_all(TEtool, base_dir="/global/group/research/m_lebh01/TEpaperCorrection"):
    """Write the TE-overlap table used to recreate Fig 3B for one annotation tool.

    For every population, the upstream, transcript, random-intergenic and
    downstream BED files are intersected with the TE annotation of ``TEtool``
    and all resulting rows are written to ``Fig1B_Dataframe_{TEtool}.csv``
    (columns ``ID,Type,Family,Overlap_nt``) in the working directory.

    Parameters
    ----------
    TEtool : str
        Name of the TE annotation tool; selects the annotation sub-directory
        and is forwarded to ``run_pybedtools_intersect``.
    base_dir : str, optional
        Root directory holding ``Bedfiles/`` and ``TEannotationBed/``.
        Defaults to the original cluster location.
    """
    # Translate between naming (differs between some data files: old name vs new name)
    TranslatePopDict = {"AK5":"FI", "DK5":"DK", "GI5":"ES", "SW5":"SE", "UM":"UA", "YE":"TR", "Zamb":"ZI"}
    bed_dir = f"{base_dir}/Bedfiles"

    # The context manager guarantees the CSV is flushed and closed before
    # anything else (e.g. plot) reads it.
    with open(f"Fig1B_Dataframe_{TEtool}.csv", "w") as Out:
        Out.write("ID,Type,Family,Overlap_nt\n")
        for pop, new_name in TranslatePopDict.items():
            te_bed = f"{base_dir}/TEannotationBed/{TEtool}/{new_name}TEonly.bed"
            # Listed in the order the rows are written to the CSV.
            # Note: only the transcript file is named after the new population name.
            regions = [
                ("Upstream", f"{bed_dir}/Upstream/{pop}UpstreamTransformed.bed"),
                ("Transcript", f"{bed_dir}/Transcript/{new_name}Transformed.bed"),
                ("Intergenic", f"{bed_dir}/RandomIntergenic/{pop}_intergenic_random.bed"),
                ("Downstream", f"{bed_dir}/Downstream/{pop}EndTransformed.bed"),
            ]
            for region_type, seq_bed in regions:
                hits = run_pybedtools_intersect(seq_bed, te_bed, pop, region_type, TEtool)
                for seq_id, hit_pop, hit_type, family, overlap in hits.values():
                    # The ID is made unique across populations by appending the population label.
                    Out.write(f"{seq_id}_{hit_pop},{hit_type},{family},{overlap}\n")

In [None]:
def plot(TEtool):
    """Draw one pie chart per region type showing the TE-family overlap and save it.

    Reads ``Fig1B_Dataframe_{TEtool}.csv`` (columns ``ID,Type,Family,Overlap_nt``),
    sums ``Overlap_nt`` per (Type, Family) and draws one pie per Type on a
    two-column grid. Families with a manually assigned colour use it; any other
    family gets a colour from the ``tab20`` colormap. Gypsy wedges are pulled out
    of the pie to highlight them. The figure is saved as ``Fig3B_{TEtool}.jpg``
    in the working directory.

    Parameters
    ----------
    TEtool : str
        Name of the TE annotation tool; selects the input CSV, the output file
        name and the figure title.
    """
    # Manually colour the TE families so it is a bit easier to compare the tools.
    # Grouping by colour keeps every family listed exactly once.
    color_groups = {
        "darkblue": [
            "Copia", "Gypsy", "Pao", "BEL", "LTR", "ERV",
            "LTR_BELPAO", "LTR_COPIA", "LTR_Copia", "LTR_Gypsy", "LTR_Pao",
            "LTR_GYPSY", "LTR_LARD", "LTR_TRIM", "LTR_unknown", "LTR_ERV1",
            "LTR_Ngaro",
        ],
        "blue": ["ClassI", "CLASSI_unknown", "PLE", "nLTR", "DIRS"],
        "pink": ["SINE_unknown", "SINE", "SINE?_unknown", "SINE_tRNA"],
        "grey": [
            "Unknown", "unknown", "Other_unknown", "Unknown_unknown", "ID",
            "NoTE", "tRNA", "rRNA", "rRNA_unknown",
        ],
        "yellow": ["Helitron", "RC_Helitron", "Helitron_unknown", "HELITRON_unknown"],
        "lightblue": [
            "CR1", "I", "LINE_I", "LINE_I-Jockey", "LINE_Jockey", "LINE_CR1",
            "LINE_JOCKEY", "LINE_LOA", "LINE_R1", "LINE_RTE-X", "LINE_Rex-Babar",
            "LINE_R2", "LINE_R1-LOA", "LINE_L2", "LINE_unknown", "I-Jockey",
            "Jockey", "R1", "R1-LOA", "LINE", "L1", "RTE", "R2",
        ],
        "orange": [
            "MITE", "MAVERICK_unknown", "MITE_unknown", "nMITE", "ClassII",
            "TcMar-Pogo", "TcMar-Tc1", "Tc1-Mariner", "Zator", "hAT",
            # The "_unkown" spellings are kept in case the data files carry the
            # same typo; the correctly spelled variants are added so these
            # families are coloured either way -- TODO confirm against the data.
            "TC1MARINER_unkown", "TIR_PIFHARBINGER_unkown", "TIR_TRANSIB_unkown",
            "TC1MARINER_unknown", "TIR_PIFHARBINGER_unknown", "TIR_TRANSIB_unknown",
            "MULE-NOF", "TIR_P", "TIR_unknown", "Novosib", "Sola",
            "TcMar-Mariner", "hAT-hATm", "PIF-Harbinger", "hAT-Ac", "hAT-hobo",
            "P", "PiggyBac", "CMC-Transib", "CMC",
            "DNA_CMC-EnSpm", "DNA_CMC-Transib", "DNA_MULE-NOF", "DNA_P",
            "DNA_TcMar-Tc1", "DNA_TcMar-Pogo", "DNA_hAT-hATm", "DNA_hAT-hobo",
            "DNA_unknown", "DNA_CMC-Chapaev-3", "DNA_Crypton-H", "DNA_Kolobok-T2",
            "DNA_Maverick", "DNA_PIF-Harbinger", "DNA_PIF-Spy", "DNA_Zisupton",
            "DNA_hAT-Ac", "DNA_hAT-Pegasus",
        ],
    }
    fixed_colors = {
        fam: colour for colour, fams in color_groups.items() for fam in fams
    }

    def ensure_rgba(value, fallback=(0.5, 0.5, 0.5, 1.0)):
        """Convert any colour value to an RGBA tuple, or ``fallback`` if it is invalid."""
        try:
            return mcolors.to_rgba(value)
        except (ValueError, TypeError):
            return fallback

    fallback_cmap = plt.cm.tab20

    # Load the dataframe
    df = pd.read_csv(f"Fig1B_Dataframe_{TEtool}.csv")
    # "NoTE" is a Family label (rows without any TE overlap); the Type column only
    # holds the region names, so the previous filter on Type never removed anything.
    df = df[df["Family"] != "NoTE"]

    grouped = df.groupby(['Type', 'Family'])['Overlap_nt'].sum().reset_index()
    families = grouped['Family'].unique()

    # Construct the colour map: fixed colour if known, otherwise cycle through tab20.
    color_map = {}
    for i, fam in enumerate(families):
        if fam in fixed_colors:
            color_map[fam] = ensure_rgba(fixed_colors[fam])
        else:
            color_map[fam] = ensure_rgba(fallback_cmap(i % 20))

    def color_key(fam):
        """Sort key that places families with the same colour next to each other."""
        return color_map[fam]  # always an RGBA tuple

    # Make the pie charts: one per region type on a two-column grid.
    types = grouped['Type'].unique()
    n_types = len(types)

    cols = 2
    # At least one row so that subplots() still works when nothing is left to plot.
    rows = max(1, math.ceil(n_types / cols))

    fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows))
    axes = axes.flatten()

    gypsy_names = ("Gypsy", "LTR_Gypsy", "LTR_GYPSY")
    for ax, t in zip(axes, types):
        subset = grouped[grouped['Type'] == t].copy()

        # Order the wedges by colour so same-coloured families are adjacent.
        subset["color_sort"] = subset["Family"].apply(color_key)
        subset = subset.sort_values("color_sort")

        # Highlight Gypsy in all plots by pulling its wedge out of the pie.
        explode = [0.1 if fam in gypsy_names else 0 for fam in subset['Family']]
        pie_colors = [color_map[fam] for fam in subset['Family']]

        ax.pie(
            subset['Overlap_nt'],
            labels=None,
            colors=pie_colors,
            explode=explode,
            startangle=90,
            shadow=True,
            wedgeprops={"edgecolor": "black", "linewidth": 0.8}
        )
        ax.set_title(f"Type: {t}")

    # Hide unused axes
    for unused_ax in axes[n_types:]:
        unused_ax.axis("off")

    # One shared legend for the whole figure, ordered by colour like the wedges.
    handles = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=color_map[fam],
                   label=fam, markersize=10)
        for fam in sorted(families, key=color_key)
    ]

    fig.legend(handles=handles, title="Family", loc='center right')

    fig.suptitle(f"TE annotation tool = {TEtool}", fontsize=16)

    # Leave the right 10% of the figure free for the legend.
    fig.tight_layout(rect=[0, 0, 0.90, 1])
    fig.savefig(f"Fig3B_{TEtool}.jpg", bbox_inches="tight")
    plt.close(fig)


In [None]:
def main(TEtool):
    """Build the overlap table for ``TEtool`` and then plot it.

    Calls ``run_for_all(TEtool)`` followed by ``plot(TEtool)``; ``TEtool`` is
    passed through unchanged to both.
    """
    # Create the table first, then draw the figure from it
    run_for_all(TEtool)
    plot(TEtool)