In [None]:
#Redo Fig 3A

#Import all packages needed
import pybedtools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO

In [None]:
def get_filter_id_list(pop):
    """Return the record IDs found in one population's de novo transcript FASTA.

    The file read is ``FilteredTranscripts/{pop}_de_novo_transcripts.fasta``,
    i.e. the de novo transcripts kept in the paper (expressed TEs excluded).

    Parameters
    ----------
    pop : str
        Population label used in the FASTA file name.

    Returns
    -------
    dict_keys
        Key view over the record IDs, suitable for ``in`` membership tests.
    """
    fasta_path = f"FilteredTranscripts/{pop}_de_novo_transcripts.fasta"
    # to_dict indexes the records by ID; only the IDs are needed by callers.
    records_by_id = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    return records_by_id.keys()

In [None]:
def annotationNoRepeat(Bedfile):
    """Write a copy of a TE annotation BED file without non-transposon repeats.

    Every line whose text contains "Simple_repeat", "Satellite" or
    "Low_complexity" is dropped; all other lines are copied unchanged.
    The result is written next to the input as ``<stem>TEonly.bed``, where
    ``<stem>`` is the input file name up to its first dot.

    Parameters
    ----------
    Bedfile : str
        Path of the annotation BED file to filter.

    Returns
    -------
    None
        The filtered file is the only output.
    """
    import os  # local import so this cell does not depend on the import cell

    excluded_tags = ("Simple_repeat", "Satellite", "Low_complexity")

    # Derive the stem from the file name only. Splitting the whole path at its
    # first dot truncates it inside any directory whose name contains a dot
    # (e.g. "./data/FI.bed" or ".../RepeatMasker.v4/FI.bed").
    folder, filename = os.path.split(Bedfile)
    target = os.path.join(folder, f"{filename.split('.')[0]}TEonly.bed")

    # Context managers close both handles even if reading or writing fails.
    # The input is opened first, so a missing input never creates an empty output.
    with open(Bedfile, "r") as source, open(target, "w") as sink:
        sink.writelines(
            line for line in source
            if not any(tag in line for tag in excluded_tags)
        )

In [None]:
def run_pybedtools_coverage(seqFile, teFile, pop, Type):
    """Compute, per interval of ``seqFile``, its strand-specific overlap with ``teFile``.

    Parameters
    ----------
    seqFile : str
        Path to the BED file with the intervals of interest (the "A" file).
        Column 4 (index 3) of each record is used as the interval ID.
    teFile : str
        Path to the repeat/TE annotation BED file (the "B" file).
    pop : str
        Population label appended to every output key.
    Type : str
        Region label (e.g. "Upstream") appended to every output key.

    Returns
    -------
    dict
        Maps ``"<ID>_<pop>_<Type>"`` to the last field of the matching
        ``coverage`` output record, exactly as pybedtools returns it (callers
        convert it with ``float``). For default ``bedtools coverage`` output
        that field is the covered fraction of the interval. If several
        intervals share an ID, the last one wins.
    """
    # BedTool opens arbitrary user files. example_bedtool is meant for the
    # example data shipped with pybedtools: it joins its argument onto the
    # package data directory and so only works by accident for absolute paths.
    intervals = pybedtools.BedTool(seqFile)
    repeats = pybedtools.BedTool(teFile)

    # s=True restricts the overlap to features on the same strand.
    covered = intervals.coverage(repeats, s=True)

    return {f"{record[3]}_{pop}_{Type}": record[-1] for record in covered}

In [None]:
def _write_coverage_rows(handle, coverage, accepted):
    """Write one CSV row per coverage entry that passes the transcript-ID filter."""
    for key, value in coverage.items():
        fraction = float(value)
        # Intergenic control regions are always kept; every other region must
        # belong to an accepted de novo transcript.
        # NOTE(review): split("_")[0] assumes the BED name column itself
        # contains no underscore - confirm against the input files.
        if key.split("_")[0] not in accepted and "Intergenic" not in key:
            continue
        presence = "AbsenceTE" if fraction == 0 else "PresenceTE"
        handle.write(f"{key},{key.split('_')[-1]},{fraction},{presence}\n")


def run_for_all(TEtool, base_dir="/global/group/research/m_lebh01/TEpaperCorrection"):
    """Build ``Fig1A_Dataframe_{TEtool}.csv`` with the TE overlap of every region.

    For each population the TE annotation is first reduced to transposons only,
    then the overlap is computed for the upstream, transcript, random
    intergenic and downstream BED files. Rows are written in that order.

    Parameters
    ----------
    TEtool : str
        Name of the TE annotation tool; also the name of the sub-folder of
        ``TEannotationBed`` that holds its annotation BED files.
    base_dir : str, optional
        Root folder containing ``TEannotationBed`` and ``Bedfiles``. Defaults
        to the location used for the original analysis.

    Returns
    -------
    None
        The CSV (columns ``ID,Type,Coverage,TEpresence``) is the only output.
    """
    # Population label used in most file names -> two-letter code used by the
    # annotation and transcript BED files.
    TranslatePopDict = {"AK5":"FI", "DK5":"DK", "GI5":"ES", "SW5":"SE", "UM":"UA", "YE":"TR", "Zamb":"ZI"}
    annotation_dir = f"{base_dir}/TEannotationBed/{TEtool}"
    bed_dir = f"{base_dir}/Bedfiles"

    # The context manager guarantees the CSV is flushed and closed before
    # anything (e.g. plot()) reads it back, even if a population fails.
    with open(f"Fig1A_Dataframe_{TEtool}.csv", "w") as Out:
        Out.write("ID,Type,Coverage,TEpresence\n")
        for pop, code in TranslatePopDict.items():
            accepted = set(get_filter_id_list(pop))  # Only keep these ones

            # Writes {code}TEonly.bed next to the full annotation.
            annotationNoRepeat(f"{annotation_dir}/{code}.bed")
            te_only_bed = f"{annotation_dir}/{code}TEonly.bed"

            # (BED file, region label) in the order the rows are written.
            regions = (
                (f"{bed_dir}/Upstream/{pop}UpstreamTransformed.bed", "Upstream"),
                (f"{bed_dir}/Transcript/{code}Transformed.bed", "Transcript"),
                (f"{bed_dir}/RandomIntergenic/{pop}_intergenic_random.bed", "Intergenic"),
                (f"{bed_dir}/Downstream/{pop}EndTransformed.bed", "Downstream"),
            )
            # Compute every region first so a failing coverage call does not
            # leave a half-written population in the CSV.
            coverages = [
                run_pybedtools_coverage(bed_path, te_only_bed, pop, label)
                for bed_path, label in regions
            ]
            for coverage in coverages:
                _write_coverage_rows(Out, coverage, accepted)


In [None]:
def plot(TEtool):
    """Draw and save the TE-overlap figure for one TE annotation tool.

    Reads ``Fig1A_Dataframe_{TEtool}.csv`` and overlays three layers on a
    single axes, all with the region ``Type`` on the x axis:

    * filled, stacked bars: proportion of rows with / without TE overlap,
    * strip plot: the individual ``Coverage`` values,
    * unfilled violins: the distribution of ``Coverage``.

    The figure is saved as ``Fig3A_{TEtool}.jpg``.

    Parameters
    ----------
    TEtool : str
        Name of the TE annotation tool; used in both file names and the title.

    Returns
    -------
    None
    """
    df = pd.read_csv(f"Fig1A_Dataframe_{TEtool}.csv")

    # The style must be set before the axes exist. Set afterwards, it leaves
    # the first figure unstyled and only affects later calls.
    sns.set_style("ticks")

    # One explicit figure/axes pair: no stray empty figure from plt.clf(), and
    # every layer is guaranteed to land on the same axes.
    fig, ax = plt.subplots(dpi=150, figsize=(7, 5))

    sns.histplot(
        data=df, x="Type", hue="TEpresence", multiple="fill", edgecolor="black",
        discrete=True, alpha=0.3, hue_order=["AbsenceTE", "PresenceTE"], ax=ax,
    )
    sns.move_legend(ax, "upper right", bbox_to_anchor=(1.2, 1))
    sns.stripplot(
        data=df, x="Type", y="Coverage", jitter=0.5, legend=False, size=0.5,
        color="black", alpha=0.5, ax=ax,
    )
    sns.violinplot(
        data=df, x="Type", y="Coverage", edgecolor="black", density_norm="area",
        cut=0, fill=False, inner=None, ax=ax,
    )

    ax.set_ylabel("Proportion with TE overlap (bars)/Rel. TE overlap (violin)")
    ax.set(title = f"TE annotation tool = {TEtool}")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    # Adjust the layout only once all labels exist, so nothing is cut off.
    fig.tight_layout(pad=3.0)
    fig.savefig(f"Fig3A_{TEtool}.jpg", bbox_inches="tight")

In [None]:
def main(TEtool):
    """Run the whole analysis for one TE annotation tool.

    Parameters
    ----------
    TEtool : str
        Name of the TE tool. The folder holding its annotation is named that way.
    """
    # Order matters: plot() reads the CSV that run_for_all() writes.
    run_for_all(TEtool)
    plot(TEtool)