In [5]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from pybedtools import BedTool


In [None]:
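# Download the source annotation files referenced in the path settings below before running this notebook.
# TODO: the download link was never recorded here; add the URLs for each source file.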

In [6]:
# All inputs, intermediates and outputs of this notebook live in one directory.
ANNOTATION_DIR = "../../data/raw_data/annotations"

# set source data path
GENCODEPOLYA_PATH = f"{ANNOTATION_DIR}/gencode.vM25.polyAs.gtf"
POLYASITES_PATH = f"{ANNOTATION_DIR}/atlas.clusters.2.0.GRCm38.96.bed"
POLYADB_PATH = f"{ANNOTATION_DIR}/mouse.PAS.txt"
GENCODE_PATH = f"{ANNOTATION_DIR}/gencode.vM25.annotation.bed"

# set interval data path
GENCODEPOLYA_PROCESSED_PATH = f"{ANNOTATION_DIR}/gencode.vM25.polya.processed.bed"
POLYASITES_PROCESSED_PATH = f"{ANNOTATION_DIR}/polyasites.mouse.processed.bed"
POLYADB_TOLIFT_PATH = f"{ANNOTATION_DIR}/polyadb.mouse.tolift.bed"
POLYADB_PROCESSED_PATH = f"{ANNOTATION_DIR}/polyadb.mouse.processed.bed"

# set output data path
# NOTE(review): this is the same file as GENCODE_PATH, so writing to it would
# overwrite the input annotation; confirm the intended output name.
GENCODE_PROCESSED_PATH = f"{ANNOTATION_DIR}/gencode.vM25.annotation.bed"


In [7]:
# process genome annotation
# The table is read without a header row, so columns arrive with integer labels.
# Only the labels listed here are renamed; columns 3 and 8 keep their integer names.
gencode_column_names = {
    0: "chr",
    1: "start",
    2: "stop",
    4: "score",
    5: "strand",
    6: "source",
    7: "type",
    9: "annotation",
}
gencode_df = pd.read_csv(GENCODE_PATH, sep="\t", header=None).rename(
    columns=gencode_column_names
)

In [None]:
# Exon records only. `.copy()` detaches the selection from `gencode_df`, so the column
# assignments below write to an independent frame instead of a slice of the original
# (which triggers SettingWithCopyWarning and may not persist).
gencode_exon_df = gencode_df[gencode_df["type"] == "exon"].copy()

# Pull the identifiers out of the free-text attribute column.
gencode_exon_df[['gene_id', 'transcript_id', 'gene_name', 'exon_number']] = gencode_exon_df['annotation'].str.extract('gene_id "([^"]*)";.*transcript_id "([^"]*)";.*gene_name "([^"]*)";.*exon_number ([0-9]*);')
gencode_exon_df['exon_number'] = gencode_exon_df['exon_number'].astype(int)

# classify exon
# Default is internal; the lowest / highest exon_number of each transcript is then
# relabelled. Assignment order matters: 3' overrides 5', and single-exon overrides both.
gencode_exon_df['exon_type'] = 'Internal exon'
exon_number_by_transcript = gencode_exon_df.groupby('transcript_id')['exon_number']
first_exons = exon_number_by_transcript.idxmin()
last_exons = exon_number_by_transcript.idxmax()
gencode_exon_df.loc[first_exons, 'exon_type'] = '5\' most exon'
gencode_exon_df.loc[last_exons, 'exon_type'] = '3\' most exon'

# Transcripts with exactly one exon row get their own label.
exon_counts = gencode_exon_df.groupby('transcript_id').size()
single_exon_transcripts = exon_counts[exon_counts == 1].index
gencode_exon_df.loc[gencode_exon_df['transcript_id'].isin(single_exon_transcripts), 'exon_type'] = 'Single exon'

In [None]:
# Gene records only. `.copy()` makes this an independent frame so the column
# assignment below does not write to a slice of `gencode_df` (SettingWithCopyWarning).
gencode_genebody_df = gencode_df[gencode_df["type"] == "gene"].copy()
gencode_genebody_df[['gene_id', 'gene_name']] = gencode_genebody_df['annotation'].str.extract('gene_id "([^"]*)";.*gene_name "([^"]*)";')
# One row per gene_id (the first occurrence is kept).
gencode_genebody_df = gencode_genebody_df.drop_duplicates(subset=['gene_id'])

In [50]:
# process gencode polya annotation
# The first five lines of the file are skipped (presumably the GTF header comment
# lines; confirm against the file).
gencode_polya = pd.read_csv(
    GENCODEPOLYA_PATH, sep="\t", header=None, skiprows=5
)
gencode_polya = gencode_polya.rename(
    columns={
        0: "chr",
        1: "source",
        2: "tag",
        3: "start",
        4: "stop",
        5: "score",
        6: "strand",
        7: "phase",
        8: "annotation",
    }
)
gencode_polya["source"] = "Gencode"
gencode_polya["score"] = "1"

# Keep only polyA_site features, as an independent copy so the coordinate edits below
# do not write to a slice of `gencode_polya` (SettingWithCopyWarning).
gencode_polya_to_save = gencode_polya.loc[
    gencode_polya["tag"] == "polyA_site",
    ["chr", "start", "stop", "source", "score", "strand"],
].copy()

# Reduce every site to the 1-nt interval [stop - 1, stop), where `stop` is start + 1 on
# the '+' strand and the original stop otherwise. Vectorised instead of a row-wise apply.
gencode_polya_to_save["stop"] = np.where(
    gencode_polya_to_save["strand"] == "+",
    gencode_polya_to_save["start"] + 1,
    gencode_polya_to_save["stop"],
)
gencode_polya_to_save["start"] = gencode_polya_to_save["stop"] - 1
gencode_polya_to_save.to_csv(
    GENCODEPOLYA_PROCESSED_PATH, index=False, header=None, sep="\t"
)

In [None]:
# process polyasite annotation
polyasite = pd.read_csv(
    POLYASITES_PATH, sep="\t", header=None
)
polyasite = polyasite.rename(
    columns={
        0: "chr",
        1: "start",
        2: "stop",
        3: "id",
        4: "score",
        5: "strand",
        9: "annotation",
    }
)
# Prefix chromosome names with "chr" (vectorised instead of a row-wise apply).
polyasite["chr"] = "chr" + polyasite["chr"].astype(str)

# The second ':'-separated field of the cluster id is used as the site position.
polyasite["position"] = polyasite["id"].str.split(":", expand=True)[1]
site_position = polyasite["position"].astype(int)
on_plus_strand = polyasite["strand"] == "+"

# 1-nt interval per site: [position, position + 1) on '+', [position - 1, position) otherwise.
polyasite["start"] = np.where(on_plus_strand, site_position, site_position - 1)
polyasite["stop"] = np.where(on_plus_strand, site_position + 1, site_position)
polyasite["source"] = "PolyASite2.0"
polyasite["score"] = "1"
# polyasite_to_save = polyasite[
#     polyasite.annotation.isin(["TE", "EX", "DS"])
# ]
# Independent copy of the six BED columns; the name column carries the annotation class.
# The intervals above are already 1 nt wide, so no further coordinate adjustment is
# needed (the former start/stop recomputation on this frame reproduced the same values).
polyasite_to_save = polyasite[["chr", "start", "stop", "annotation", "score", "strand"]].copy()
polyasite_to_save.to_csv(
    POLYASITES_PROCESSED_PATH, index=False, header=None, sep="\t"
)

In [None]:
# process polyadb annotation
polyadb = pd.read_csv(POLYADB_PATH, sep="\t")

# 1-nt interval per site: [Position, Position + 1) on '+', [Position - 1, Position) otherwise.
pas_position = polyadb["Position"].astype(int)
on_plus_strand = polyadb["Strand"] == "+"
polyadb["start"] = np.where(on_plus_strand, pas_position, pas_position - 1)
polyadb["stop"] = np.where(on_plus_strand, pas_position + 1, pas_position)

# Name column: "<Intron/exon location>:<Gene Symbol>" with spaces replaced by underscores.
polyadb["Gene Symbol"] = polyadb["Gene Symbol"].fillna("unknown")
polyadb["annotation"] = polyadb["Intron/exon location"].str.cat(polyadb[["Gene Symbol"]], sep=":")
polyadb["annotation"] = polyadb["annotation"].str.replace(" ", "_")
polyadb["score"] = "1"

# Independent copy of the six BED columns. The intervals are already 1 nt wide, so the
# former start/stop recomputation on this frame (which reproduced the same values) is gone.
polyadb_to_save = polyadb[
    ["Chromosome", "start", "stop", "annotation", "score", "Strand"]
].copy()
polyadb_to_save.to_csv(
    POLYADB_TOLIFT_PATH, index=False, header=None, sep="\t"
)

# liftOver from mm9 to mm10 (chain file: mm9ToMm10.over.chain); the last argument is
# passed as liftOver's file for unmapped records.
liftover_status = os.system(
    f"liftOver {POLYADB_TOLIFT_PATH} \
    ../../data/raw_data/annotations/mm9ToMm10.over.chain \
    {POLYADB_PROCESSED_PATH} \
    ../../data/raw_data/annotations/umap.bed"
)
# A later cell reads POLYADB_PROCESSED_PATH; fail here rather than continue with a
# missing or stale file when liftOver did not succeed.
if liftover_status != 0:
    raise RuntimeError(f"liftOver failed with exit status {liftover_status}")

In [53]:
# Wrap the three poly(A) site sources as BedTool objects. PolyA_DB is read from the
# lifted-over file on disk; the other two come from the in-memory frames.
polyasites_bed = BedTool.from_dataframe(polyasite_to_save)
polyadb_bed = BedTool(POLYADB_PROCESSED_PATH)
gencode_polya_bed = BedTool.from_dataframe(gencode_polya_to_save)

# Gene-model tables as BedTool objects. Both names are reassigned in a later cell from
# six-column frames before any intersection is run.
gencode_exon_bed = BedTool.from_dataframe(gencode_exon_df)
gencode_genebody_bed = BedTool.from_dataframe(gencode_genebody_df)

In [None]:
# Function used to merge the sites of one dataset
def merge_bed(bed):
    """Collapse nearby same-strand sites of `bed` into single 1-nt sites.

    Each strand is processed separately: features are sorted and merged with
    `d=10, s=True`, carrying columns 4-6 along (`o='last'` for '+', `o='first'`
    for '-'). Every merged interval is then reduced to one nucleotide: its last
    position `[end - 1, end)` on '+', its first position `[start, start + 1)`
    otherwise.

    Parameters
    ----------
    bed : pybedtools.BedTool
        Sites with at least six columns; column 6 is read as the strand.

    Returns
    -------
    pybedtools.BedTool
        The merged 1-nt sites of both strands, sorted.
    """
    bed_pos = bed.filter(lambda b: b.strand == '+').sort().merge(d=10, s=True, c='4,5,6', o='last')
    bed_neg = bed.filter(lambda b: b.strand == '-').sort().merge(d=10, s=True, c='4,5,6', o='first')
    bed_merged = bed_pos.cat(bed_neg, postmerge=False).sort().to_dataframe()

    # Vectorised instead of a row-wise apply; this form also works on an empty frame.
    is_plus = bed_merged['strand'] == '+'
    bed_merged['start'] = np.where(is_plus, bed_merged['end'] - 1, bed_merged['start'])
    bed_merged['end'] = bed_merged['start'] + 1
    return BedTool.from_dataframe(bed_merged)

# Helper for retrieving overlapping PASs (disabled; the window calls are written inline below)
# def get_overlapping_pases(bed1, bed2):
#     return bed1.window(bed1, w=20, sm=True, u=True)

# Merge and de-duplicate each dataset
merged_polyasites = merge_bed(polyasites_bed)
polyadb_bed_merged = merge_bed(polyadb_bed)
gencode_polya_bed_merged = merge_bed(gencode_polya_bed)

print(f"merged_polyasites: {len(merged_polyasites)}")

# Find the PolyA_DB v3 PASs that have a same-strand PAS of the current set nearby, remove
# them, and append the remaining PolyA_DB sites to the current set.
# NOTE(review): the original comment described a ±10 nt range, but the call uses w=20 —
# confirm which distance is intended.
# overlapping_polyadb = get_overlapping_pases(polyadb_bed_merged, merged_polyasites)
overlapping_polyadb = polyadb_bed_merged.window(merged_polyasites, w=20, sm=True, u=True)
print(f"overlapping_polyadb: {len(overlapping_polyadb)}")
polyadb_subtracted = polyadb_bed_merged.subtract(overlapping_polyadb, s=True, A=True)
print(f"polyadb_subtracted: {len(polyadb_subtracted)}")
merged_polyasites_addpd = merged_polyasites.cat(polyadb_subtracted, postmerge=False).sort()

# Handle the Gencode M25 PASs the same way, against the set that now includes PolyA_DB
# overlapping_gencode = get_overlapping_pases(gencode_polya_bed_merged, merged_polyasites_addpd)
overlapping_gencode = gencode_polya_bed_merged.window(merged_polyasites_addpd, w=20, sm=True, u=True)
print(f"overlapping_gencode: {len(overlapping_gencode)}")
gencode_polya_subtracted = gencode_polya_bed_merged.subtract(overlapping_gencode, s=True, A=True)
print(f"gencode_polya_subtracted: {len(gencode_polya_subtracted)}")
merged_polyasites_addall = merged_polyasites_addpd.cat(gencode_polya_subtracted, postmerge=False).sort()


In [67]:
from matplotlib_venn import venn3


def _sites_near(sites, reference):
    """Return the entries of `sites` that have a same-strand `reference` entry in the 20-nt window.

    Each matching entry of `sites` is reported once (`u=True`).
    """
    return sites.window(reference, w=20, sm=True, u=True)


# Integrated PASs supported by each source. Every intersection is computed once and
# reused below instead of being recomputed for each count.
in_polyasites = _sites_near(merged_polyasites_addall, merged_polyasites)
in_polyadb = _sites_near(merged_polyasites_addall, polyadb_bed_merged)
in_gencode = _sites_near(merged_polyasites_addall, gencode_polya_bed_merged)
in_polyasites_and_polyadb = _sites_near(in_polyasites, polyadb_bed_merged)

# Size of each set
polyasites_size = len(in_polyasites)
polyadb_size = len(in_polyadb)
gencode_size = len(in_gencode)

# Size of each pairwise intersection
polyasites_polyadb_overlap = len(in_polyasites_and_polyadb)
polyasites_gencode_overlap = len(_sites_near(in_polyasites, gencode_polya_bed_merged))
polyadb_gencode_overlap = len(_sites_near(in_gencode, polyadb_bed_merged))

# Size of the intersection of all three datasets
polyasites_polyadb_gencode_overlap = len(_sites_near(in_polyasites_and_polyadb, gencode_polya_bed_merged))

# Number of elements found in exactly two datasets
polyasites_polyadb_only = polyasites_polyadb_overlap - polyasites_polyadb_gencode_overlap
polyasites_gencode_only = polyasites_gencode_overlap - polyasites_polyadb_gencode_overlap
polyadb_gencode_only = polyadb_gencode_overlap - polyasites_polyadb_gencode_overlap

# Number of elements found in exactly one dataset (inclusion-exclusion)
polyasites_only = polyasites_size - polyasites_polyadb_overlap - polyasites_gencode_overlap + polyasites_polyadb_gencode_overlap
polyadb_only = polyadb_size - polyasites_polyadb_overlap - polyadb_gencode_overlap + polyasites_polyadb_gencode_overlap
gencode_only = gencode_size - polyasites_gencode_overlap - polyadb_gencode_overlap + polyasites_polyadb_gencode_overlap

In [70]:
import pickle

# Region sizes for the three-set Venn diagram, in the order
# (A only, B only, A&B only, C only, A&C only, B&C only, A&B&C)
# with A = PolyASite, B = PolyA_DB, C = Gencode. Cached on disk for the plotting cell.
subsets = (
    polyasites_only,
    polyadb_only,
    polyasites_polyadb_only,
    gencode_only,
    polyasites_gencode_only,
    polyadb_gencode_only,
    polyasites_polyadb_gencode_overlap,
)
with open("../../data/raw_data/annotations/mm10_venn.pkl", "wb") as venn_cache:
    pickle.dump(subsets, venn_cache)
    

In [None]:
# Draw the Venn diagram
# The pickle is the one written by the previous cell; do not point this at a file from
# an untrusted source, since unpickling can execute arbitrary code.
with open("../../data/raw_data/annotations/mm10_venn.pkl", "rb") as f:
    subsets = pickle.load(f)
venn3(subsets=subsets,
      set_labels=('PolyAsites2.0', 'PolyA_DBV3.2', 'Gencode'))
# Save before showing: once plt.show() has rendered the figure it is no longer the
# current figure, so a savefig call placed after it writes an empty image.
plt.savefig("../../data/raw_data/annotations/venn3.svg", dpi=300)
plt.show()

In [72]:
# Filter 3' most exon and single exon
# Maximum length (nt) of the downstream region built after each terminal exon.
downstream_len=2000
terminal_exon_types = ['3\' most exon', 'Single exon']
is_terminal_exon = gencode_exon_df['exon_type'].isin(terminal_exon_types)
gencode_exon_df_filtered = gencode_exon_df[is_terminal_exon]

In [None]:
import bisect
def find_next_start(current, sorted_list, strand):
    """Binary-search `sorted_list` for the neighbouring boundary of `current`.

    Parameters
    ----------
    current : number
        Coordinate to search from.
    sorted_list : sequence
        Candidate coordinates in ascending order.
    strand : str
        '+' returns the smallest element strictly greater than `current`;
        '-' returns the largest element less than or equal to `current`.

    Returns
    -------
    The matching element of `sorted_list`.

    Raises
    ------
    ValueError
        If `strand` is neither '+' nor '-', or if no element satisfies the
        search. Callers use the latter as the signal to fall back to a default,
        so the search context is carried in the message instead of being
        printed together with the whole list on every miss.
    """
    if strand == '+':
        idx = bisect.bisect_right(sorted_list, current)
    elif strand == '-':
        idx = bisect.bisect_right(sorted_list, current) - 1
    else:
        raise ValueError('Strand must be + or -')

    # idx == len(sorted_list) ('+') or idx == -1 ('-') means there is no candidate.
    if 0 <= idx < len(sorted_list):
        return sorted_list[idx]
    raise ValueError(f'No next start found (current: {current}, idx: {idx}, strand: {strand})')

# For every terminal exon (3'-most or single exon) build a "downstream" interval that
# begins at the exon's 3' end and extends up to `downstream_len` nt, cut short at the
# boundary of the next same-strand gene on the chromosome.
chrs = gencode_genebody_df['chr'].unique()
downstream_frames = []  # one frame per chromosome; concatenated once after the loop
for chrom in chrs:  # named `chrom` so the builtin `chr` is not shadowed
    chr_genbody_df = gencode_genebody_df[gencode_genebody_df['chr'] == chrom]
    chr_exon_df = gencode_exon_df_filtered[gencode_exon_df_filtered['chr'] == chrom]

    # Genes in transcription order: ascending start on '+', descending stop on '-'.
    chr_genbody_df_pos = chr_genbody_df[chr_genbody_df['strand'] == '+'].sort_values(['start'])
    chr_genbody_df_neg = chr_genbody_df[chr_genbody_df['strand'] == '-'].sort_values(['stop'], ascending=False)

    # `next_start` is the boundary of the following gene in that order. The last gene has
    # no follower and keeps the default of a full `downstream_len` past its own end.
    chr_genbody_df_pos['next_start'] = chr_genbody_df_pos['stop'] + downstream_len
    chr_genbody_df_neg['next_start'] = chr_genbody_df_neg['start'] - downstream_len
    chr_genbody_df_pos.loc[chr_genbody_df_pos.index[:-1], 'next_start'] = chr_genbody_df_pos['start'].shift(-1)[:-1]
    chr_genbody_df_neg.loc[chr_genbody_df_neg.index[:-1], 'next_start'] = chr_genbody_df_neg['stop'].shift(-1)[:-1]

    # Genes whose follower begins inside the gene itself need a proper search for the
    # first boundary beyond the gene's own end.
    exceptional_gene_pos = chr_genbody_df_pos[chr_genbody_df_pos['next_start'] < chr_genbody_df_pos['stop']]
    exceptional_gene_neg = chr_genbody_df_neg[chr_genbody_df_neg['next_start'] > chr_genbody_df_neg['start']]

    sorted_starts = sorted(chr_genbody_df_pos['start'].unique())
    sorted_stops = sorted(chr_genbody_df_neg['stop'].unique())

    # find_next_start raises ValueError when no boundary exists; fall back to the default.
    for index, gene in exceptional_gene_pos.iterrows():
        try:
            chr_genbody_df_pos.loc[index, 'next_start'] = find_next_start(gene['stop'], sorted_starts, '+')
        except ValueError:
            chr_genbody_df_pos.loc[index, 'next_start'] = gene['stop'] + downstream_len
    for index, gene in exceptional_gene_neg.iterrows():
        try:
            chr_genbody_df_neg.loc[index, 'next_start'] = find_next_start(gene['start'], sorted_stops, '-')
        except ValueError:
            chr_genbody_df_neg.loc[index, 'next_start'] = gene['start'] - downstream_len

    # Attach each gene's `next_start` to its terminal exons.
    chr_exon_df_pos = chr_exon_df[chr_exon_df['strand'] == '+'].sort_values(['start'])
    chr_exon_df_neg = chr_exon_df[chr_exon_df['strand'] == '-'].sort_values(['stop'], ascending=False)

    chr_exon_df_pos = pd.merge(chr_exon_df_pos, chr_genbody_df_pos[["gene_id","next_start"]],  left_on='gene_id', right_on='gene_id', how='left')
    chr_exon_df_neg = pd.merge(chr_exon_df_neg, chr_genbody_df_neg[["gene_id","next_start"]],  left_on='gene_id', right_on='gene_id', how='left')

    # '+': region runs from the exon stop to min(stop + downstream_len, next_start).
    chr_exon_df_pos['downstream_start'] = chr_exon_df_pos["stop"]
    chr_exon_df_pos['downstream_stop'] = np.where(chr_exon_df_pos['downstream_start'] + downstream_len > chr_exon_df_pos['next_start'],
                                        chr_exon_df_pos['next_start'], chr_exon_df_pos['downstream_start'] + downstream_len)
    # '-': region runs from max(start - downstream_len, next_start) to the exon start.
    chr_exon_df_neg['downstream_stop'] = chr_exon_df_neg["start"]
    chr_exon_df_neg['downstream_start'] = np.where(chr_exon_df_neg['downstream_stop'] - downstream_len < chr_exon_df_neg['next_start'],
                                        chr_exon_df_neg['next_start'], chr_exon_df_neg['downstream_stop'] - downstream_len)

    chr_downstream_df = pd.concat([chr_exon_df_neg, chr_exon_df_pos])[['chr', 'downstream_start', 'downstream_stop', 'gene_id', 'score', 'strand', 'gene_name']]
    chr_downstream_df["exon_type"] = "downstream"

    downstream_frames.append(chr_downstream_df)

# Single concatenation instead of growing a frame inside the loop from an empty DataFrame.
gencode_downstream_df = pd.concat(downstream_frames) if downstream_frames else pd.DataFrame()
gencode_downstream_df = gencode_downstream_df.rename(columns={"downstream_start": "start", "downstream_stop": "stop"})
gencode_downstream_df = gencode_downstream_df.drop_duplicates(['chr', 'start', 'stop', 'gene_id', 'score', 'strand', 'gene_name'])

In [74]:
# Add the downstream regions to both gene-model tables, so that a PAS falling in a
# downstream region is matched in the gene-level and the exon-level intersection.
bed6_columns = ["chr", "start", "stop", "name", "score", "strand"]

gencode_exon_df_final = pd.concat([gencode_exon_df, gencode_downstream_df]).drop_duplicates(
    ["chr", "start", "stop", "gene_id", "exon_type", "strand"], keep="first"
)
gencode_genebody_df_final = pd.concat([gencode_genebody_df, gencode_downstream_df])

# BED name field: "gene_id:gene_name:exon_type" for exons, "gene_id:gene_name" for genes.
exon_name_parts = gencode_exon_df_final[["gene_name", "exon_type"]]
gencode_exon_df_final["name"] = gencode_exon_df_final["gene_id"].str.cat(exon_name_parts, sep=":")
genebody_name_parts = gencode_genebody_df_final[["gene_name"]]
gencode_genebody_df_final["name"] = gencode_genebody_df_final["gene_id"].str.cat(genebody_name_parts, sep=":")

gencode_exon_bed = BedTool.from_dataframe(gencode_exon_df_final[bed6_columns])
gencode_genebody_bed = BedTool.from_dataframe(gencode_genebody_df_final[bed6_columns])

In [75]:
# Integrated PAS set as a frame; each site is named "chr:start:stop:strand".
integrated_pas = merged_polyasites_addall.to_dataframe()
integrated_pas.columns = ["chr", "start", "stop", "name", "score", "strand"]
coordinate_parts = integrated_pas[["start", "stop", "strand"]].astype(str)
integrated_pas["name"] = integrated_pas["chr"].str.cat(coordinate_parts, sep=":")

In [None]:
# Preview the integrated PAS table (rendered as this cell's output).
integrated_pas

In [77]:
# Pair every PAS with the same-strand gene bodies and exons it overlaps; both the PAS
# record (wa) and the overlapping annotation record (wb) are reported.
integrated_pas_bed = BedTool.from_dataframe(integrated_pas)
same_strand_pairs = dict(wa=True, wb=True, s=True)
intersected_pas_genebody = integrated_pas_bed.intersect(gencode_genebody_bed, **same_strand_pairs)
intersected_pas_exon = integrated_pas_bed.intersect(gencode_exon_bed, **same_strand_pairs)

In [78]:
# Gene-level hits: column 9 holds the annotation name "gene_id:gene_name".
intersected_pas_genebody_df = intersected_pas_genebody.to_dataframe()
genebody_name_parts = intersected_pas_genebody_df.iloc[:, 9].str.split(':', expand=True)
intersected_pas_genebody_df[['gene_id', 'gene_name']] = genebody_name_parts

# Exon-level hits: column 9 holds "gene_id:gene_name:exon_type".
intersected_pas_exon_df = intersected_pas_exon.to_dataframe()
exon_name_parts = intersected_pas_exon_df.iloc[:, 9].str.split(':', expand=True)
intersected_pas_exon_df[['gene_id', 'gene_name', 'exon_type']] = exon_name_parts

In [79]:
# A PAS that hits a gene body but no exon-level record is classified as intronic.
has_exon_hit = intersected_pas_genebody_df["name"].isin(intersected_pas_exon_df["name"])
intronic_pas = intersected_pas_genebody_df[~has_exon_hit]["name"].copy().drop_duplicates().tolist()

is_intronic = intersected_pas_genebody_df["name"].isin(intronic_pas)
intersected_pas_intron_df = intersected_pas_genebody_df[is_intronic].copy()
intersected_pas_intron_df["exon_type"] = "intron"

# Exon-level and intronic hits in one table.
intersected_pas_df = pd.concat([intersected_pas_exon_df, intersected_pas_intron_df])

In [80]:
# Rank of each location class; a higher value wins when a PAS matches several
# annotation records. Listed from highest to lowest.
priority_dict = {
    "3' most exon": 5,
    "Single exon": 4,
    "Internal exon": 3,
    "5' most exon": 2,
    "intron": 1,
    "downstream": 0,
}
pas_priority = intersected_pas_df['exon_type'].map(priority_dict)
intersected_pas_df['priority'] = pas_priority


In [81]:
# Highest priority reached by each PAS, joined back onto every hit as `priority_max`.
intersected_pas_df_max_priority = intersected_pas_df.groupby('name')['priority'].max()
intersected_pas_df_with_max_priority = intersected_pas_df.merge(intersected_pas_df_max_priority, on='name', suffixes=('', '_max'))
intersected_pas_df_multiple_max = intersected_pas_df_with_max_priority[intersected_pas_df_with_max_priority['priority'] == intersected_pas_df_with_max_priority['priority_max']]

# PASs whose top-priority hits belong to more than one gene cannot be assigned to a gene.
grouped_df = intersected_pas_df_multiple_max.drop_duplicates(["name", "gene_id"]).groupby("name").size()
unknown_pas = grouped_df[grouped_df > 1].index.tolist()

# One row per PAS: shuffle, order by descending priority within each PAS, keep the first.
# The shuffle is seeded so that re-running the notebook selects the same rows.
intersected_pas_df_drop_duplicates = (
    intersected_pas_df.sample(frac=1, random_state=0)
    .sort_values(['name', 'priority'], ascending=[True, False])
    .groupby('name')
    .first()
    .reset_index()
)
intersected_pas_df_drop_duplicates.loc[intersected_pas_df_drop_duplicates["name"].isin(unknown_pas), ["gene_id", "gene_name"]] = "unknown"

In [82]:
def _pas_names_with_type(exon_type):
    """Return the unique PAS names whose resolved location class equals `exon_type`."""
    has_type = intersected_pas_df_drop_duplicates["exon_type"] == exon_type
    return intersected_pas_df_drop_duplicates[has_type]["name"].copy().drop_duplicates().tolist()


# PAS names per resolved location class.
intronic_pas = _pas_names_with_type("intron")
most3Exon = _pas_names_with_type("3' most exon")
most5Exon = _pas_names_with_type("5' most exon")
singleExon = _pas_names_with_type("Single exon")
internalExon = _pas_names_with_type("Internal exon")
downstream = _pas_names_with_type("downstream")

In [83]:
# Every PAS starts as intergenic and is relabelled when it appears in one of the
# per-class name lists. The labels are applied in the order listed.
integrated_pas["pas_type"] = "intergenic"
pas_type_assignments = [
    ("Intron", intronic_pas),
    ("5' most exon", most5Exon),
    ("3' most exon", most3Exon),
    ("downstream", downstream),
    ("Single exon", singleExon),
    ("Internal exon", internalExon),
]
for pas_type_label, pas_names in pas_type_assignments:
    integrated_pas.loc[integrated_pas["name"].isin(pas_names), "pas_type"] = pas_type_label

In [None]:
# Attach the resolved gene to every PAS; a PAS without any gene hit gets "unknown".
integrated_pas_final = pd.merge(left=integrated_pas, right=intersected_pas_df_drop_duplicates, on="name", how="left", suffixes=('', '_y'), ).fillna("unknown").loc[:, ["chr", "start", "stop", "name", "score", "strand", "pas_type", "gene_id", "gene_name"]]
integrated_pas_final_bed_df = integrated_pas_final.copy()
integrated_pas_final_bed = BedTool.from_dataframe(integrated_pas_final_bed_df)

# add polyadb exon information for pas
# The window output is the 9 PAS columns followed by the 6 PolyA_DB columns, so column 12
# is the PolyA_DB name field ("<location>:<gene symbol>").
intersect_polyadb = integrated_pas_final_bed.window(polyadb_bed_merged, w=20, sm=True).to_dataframe(header=None)
intersect_polyadb[["polyadb_type","polyadb_gene"]] = intersect_polyadb.iloc[:,12].str.split(':', expand=True)
# `.copy()` so the column assignment below does not write to a filtered slice.
intersect_polyadb = intersect_polyadb[~intersect_polyadb["polyadb_gene"].isin(["unknown", "na", "nan"])].copy()
intersect_polyadb["polyadb_type"] = intersect_polyadb["polyadb_type"].str.replace("_", " ")
intersect_polyadb = intersect_polyadb.rename(columns={3: "name", 6: "pas_type"})
# A PAS can have several PolyA_DB sites inside the window. Keep one row per PAS (the
# first reported hit) so the left merge below cannot duplicate PAS rows in the output.
intersect_polyadb = intersect_polyadb[["name", "polyadb_type"]].drop_duplicates(subset="name")

# add polyasites TE information for pas
intersect_polyasites = integrated_pas_final_bed.window(merged_polyasites, w=20, sm=True).to_dataframe(header=None)
intersect_polyasites = intersect_polyasites.rename(columns={3: "name", 12:"polyasites_type"})
# Only "TE" hits are used; they are relabelled as 3' most exon. One row per PAS, as above.
intersect_polyasites = intersect_polyasites[intersect_polyasites["polyasites_type"] == "TE"][["name", "polyasites_type"]].drop_duplicates(subset="name").copy()
intersect_polyasites["polyasites_type"] = "3' most exon"

integrated_pas_final_add_annotation = pd.merge(integrated_pas_final, intersect_polyadb, on="name", how="left")
integrated_pas_final_add_annotation = pd.merge(integrated_pas_final_add_annotation, intersect_polyasites, on="name", how="left")

# Final class, in order of precedence: PolyA_DB location, then the PolyASite-derived
# label, then the GENCODE-derived pas_type.
integrated_pas_final_add_annotation["integrated_pas_type"] = integrated_pas_final_add_annotation["polyadb_type"].combine_first(integrated_pas_final_add_annotation["polyasites_type"]).combine_first(integrated_pas_final_add_annotation["pas_type"])


In [90]:
# Output layout: six BED columns followed by the final class and the assigned gene.
final_columns = ["chr", "start", "stop", "name", "score", "strand", "integrated_pas_type", "gene_id", "gene_name"]
integrated_pas_final = integrated_pas_final_add_annotation.loc[:, final_columns]

In [None]:
# Write the integrated PAS table as a headerless, tab-separated file in the current
# working directory.
integrated_pas_final.to_csv(
    "mouse_integrated_pas.bed",
    sep="\t",
    header=False,
    index=False,
)