In [1]:
import pandas as pd
import subprocess

In [2]:
def get_repeat_number(family_name, bed_file):
    """Count the lines of ``bed_file`` that contain ``family_name`` followed by ``;``.

    The match is a plain substring test on each line (the text
    ``"<family_name>;"`` may appear anywhere in the line), which is what the
    previous ``grep '<family_name>;' <bed_file> | wc -l`` pipeline counted for
    ordinary family names.

    Unlike that shell pipeline, the family name is never interpreted as a
    regular expression or as shell syntax, so names containing characters such
    as ``.``, ``*``, quotes or spaces are matched literally and cannot break or
    inject into a command line.

    Parameters
    ----------
    family_name : str
        Text to look for; a ``;`` is appended before searching.
    bed_file : str
        Path of the file to scan.

    Returns
    -------
    int
        Number of lines containing ``family_name + ";"``.

    Raises
    ------
    IOError / OSError
        If ``bed_file`` cannot be opened. (The shell version instead failed
        with a ``ValueError`` while parsing grep's error message.)
    """
    # Work on bytes so that any non-UTF-8 content in the file cannot raise a
    # decoding error; only the search key needs encoding.
    needle = "{};".format(family_name).encode("utf-8")
    with open(bed_file, "rb") as handle:
        return sum(1 for line in handle if needle in line)

In [17]:
def quantify_family_through_steps(family_name, output_dir="../outputs"):
    """Count how many entries of one family remain after each filtering step.

    Every count comes from ``get_repeat_number`` applied to a fixed set of
    BED files inside ``output_dir``.

    Parameters
    ----------
    family_name : str
        Family to count; passed unchanged to ``get_repeat_number``.
    output_dir : str, optional
        Directory holding the BED files. Defaults to ``"../outputs"``, which
        yields exactly the paths that were previously hard-coded.

    Returns
    -------
    list
        Nine items, in this order:

        0. ``family_name``
        1. count in ``nosimple_hg38.bed``
        2. entries lost between ``nosimple_hg38.bed`` and ``f50_hg38.bed``
           (the nearby-repeat filter)
        3. entries lost between ``f50_hg38.bed`` and ``6_f50_hg38_e100.bed``
           (expansion / too-near-to-edge filter)
        4. entries lost between ``6_f50_hg38_e100.bed`` and
           ``mapped_6_f50_hg38_e100_pantro.bed``
        5. count in ``pantro_hg38_all_repeats_mapped.bed``
        6. count in ``pantro_hg38_all_repeats_unmapped.bed``
        7. item 5 divided by (item 5 + item 6) as a float in [0, 1] -- a
           fraction, not a percentage; ``0.0`` when both counts are zero
        8. count in ``mapped_6_f50_hg38_e100_pantro.bed``
    """
    def count_in(bed_name):
        # Joined with "/" so the default reproduces the original literal paths.
        return get_repeat_number(family_name, "{}/{}".format(output_dir, bed_name))

    original_number = count_in("nosimple_hg38.bed")
    repeats_both_sides = count_in("f50_hg38.bed")  # after filtering for nearby repeats
    edges_removed = count_in("6_f50_hg38_e100.bed")  # after expanding and removing too near to edges
    mapped = count_in("mapped_6_f50_hg38_e100_pantro.bed")
    total_number_mapped = count_in("pantro_hg38_all_repeats_mapped.bed")
    total_number_unmapped = count_in("pantro_hg38_all_repeats_unmapped.bed")

    # Guard against division by zero when the family is absent from both files;
    # the fallback is a float so the slot always has the same type.
    total_attempted = total_number_mapped + total_number_unmapped
    if total_attempted > 0:
        percent_mapped = float(total_number_mapped) / total_attempted
    else:
        percent_mapped = 0.0

    return [
        family_name,
        original_number,
        original_number - repeats_both_sides,
        repeats_both_sides - edges_removed,
        edges_removed - mapped,
        total_number_mapped,
        total_number_unmapped,
        percent_mapped,
        mapped,
    ]

In [18]:
def build_filter_dataframe(family_list):
    """Tabulate the filtering statistics of several families.

    Calls ``quantify_family_through_steps`` once per entry of ``family_list``
    and stacks the returned nine-item lists as rows.

    Parameters
    ----------
    family_list : iterable of str
        Family names, in the order the rows should appear.

    Returns
    -------
    pandas.DataFrame
        One row per family with the columns ``family``, ``original_number``,
        ``repeats_too_close_both_sides``, ``edge_too_close``, ``unmapped``,
        ``total_mapped``, ``total_unmapped``, ``percentage_mapped`` and
        ``through_all_filters``.
    """
    column_names = [
        "family",
        "original_number",
        "repeats_too_close_both_sides",
        "edge_too_close",
        "unmapped",
        "total_mapped",
        "total_unmapped",
        "percentage_mapped",
        "through_all_filters",
    ]
    rows = [quantify_family_through_steps(name) for name in family_list]
    return pd.DataFrame(rows, columns=column_names)

In [27]:
def get_family_coverage_dataframe(family_name):
    """Load the alignment-coverage CSV of one family into a DataFrame.

    The file read is
    ``../outputs/6_f50_hg38_e100_pantro/alignments/<family_name>/repeat_alignment_coverage.csv``.
    Column names are supplied explicitly, so pandas treats every row of the
    file as data (no header row is expected).

    Parameters
    ----------
    family_name : str
        Name of the family's sub-directory under ``alignments/``.

    Returns
    -------
    pandas.DataFrame
        Thirteen columns: ``id`` followed by ``bp``, ``mis``, ``indel`` and
        ``un`` for each of the prefixes ``left``, ``repeat`` and ``right``
        (``leftbp`` ... ``rightun``), in that order.
    """
    # NOTE(review): "un" is relabelled "unaligned" by the summary cells in this
    # notebook; "mis" presumably means mismatches -- confirm with the producer
    # of the CSV.
    metrics = ("bp", "mis", "indel", "un")
    header = ["id"]
    for region in ("left", "repeat", "right"):
        header.extend(region + metric for metric in metrics)

    alignments_dir = "../outputs/6_f50_hg38_e100_pantro/alignments/"
    csv_path = alignments_dir + family_name + "/repeat_alignment_coverage.csv"
    return pd.read_csv(csv_path, names=header)

In [23]:
# LTR26 subfamilies, split into the two groups used throughout this notebook.
LTR26_shared_families = [
    "LTR26",
    "LTR26B",
    "LTR26E",
]
LTR26_human_only_families = [
    "LTR26C",
    "LTR26D",
]
# Filtering statistics for all five subfamilies: shared ones first, then human-only.
ltr26df = build_filter_dataframe(
    LTR26_shared_families + LTR26_human_only_families
)


In [24]:
# Print only the headline columns of the LTR26 filtering table.
print(
    ltr26df.loc[
        :,
        ["family", "original_number", "percentage_mapped", "through_all_filters"],
    ]
)

   family  original_number  percentage_mapped  through_all_filters
0   LTR26              625           0.878400                  318
1  LTR26B              161           0.888199                   72
2  LTR26E              119           0.899160                   59
3  LTR26C              300           0.943333                  147
4  LTR26D              221           0.773756                   80


In [50]:
# Column-wise means of the coverage table of each LTR26 subfamily, one row per family.
# The ten averaged columns keep their file order; the "...un" columns are
# labelled "...unaligned" in the summary.
coverage_columns = [
    "family",
    "leftmis", "leftindel", "leftunaligned",
    "repeatbp", "repeatmis", "repeatindel", "repeatunaligned",
    "rightmis", "rightindel", "rightunaligned",
]
mean_list = []
for family in LTR26_shared_families + LTR26_human_only_families:
    df = get_family_coverage_dataframe(family)
    # Leave out the id, leftbp and rightbp columns before averaging.
    no_family = df.drop(["id", "leftbp", "rightbp"], axis=1)
    column_means = no_family.mean(axis=0).tolist()
    mean_list.append([family] + column_means)
LTR_26_coverage = pd.DataFrame(mean_list, columns=coverage_columns)
print(LTR_26_coverage)

   family   leftmis  leftindel  leftunaligned    repeatbp  repeatmis  \
0   LTR26  1.415094   0.254717       0.987421  438.251572   1.331792   
1  LTR26B  1.194444   0.750000       0.027778  346.819444   1.274583   
2  LTR26E  1.644068   0.355932       0.491525  417.355932   1.457627   
3  LTR26C  1.108844   0.401361       0.013605  407.755102   1.239796   
4  LTR26D  1.250000   0.075000       1.350000  395.200000   1.240750   

   repeatindel  repeatunaligned  rightmis  rightindel  rightunaligned  
0     0.127107         0.943396  1.562893    0.468553        1.191824  
1     0.093611         0.000000  1.097222    0.736111        0.000000  
2     0.191525         1.694915  1.338983    0.220339        1.711864  
3     0.039388         0.000000  1.265306    0.149660        0.034014  
4     0.134625         1.250000  1.137500    0.125000        1.262500  


In [52]:
# Print wide DataFrames on a single row of columns instead of wrapping them
# into several stacked blocks.
pd.options.display.expand_frame_repr = False

In [53]:
# Column-wise means of the coverage table for a hand-picked set of families,
# one row per family. The "...un" columns are labelled "...unaligned".
family_list = ["LTR26C", "LTR26D", "LTR5_Hs", "SVA_E", "SVA_F","AluYh7"]
coverage_columns = [
    "family",
    "leftmis", "leftindel", "leftunaligned",
    "repeatbp", "repeatmis", "repeatindel", "repeatunaligned",
    "rightmis", "rightindel", "rightunaligned",
]
mean_list = []
for family in family_list:
    df = get_family_coverage_dataframe(family)
    # Leave out the id, leftbp and rightbp columns before averaging.
    no_family = df.drop(["id", "leftbp", "rightbp"], axis=1)
    column_means = no_family.mean(axis=0).tolist()
    mean_list.append([family] + column_means)
family_coverage = pd.DataFrame(mean_list, columns=coverage_columns)
print(family_coverage)

    family   leftmis  leftindel  leftunaligned    repeatbp  repeatmis  repeatindel  repeatunaligned  rightmis  rightindel  rightunaligned
0   LTR26C  1.108844   0.401361       0.013605  407.755102   1.239796     0.039388         0.000000  1.265306    0.149660        0.034014
1   LTR26D  1.250000   0.075000       1.350000  395.200000   1.240750     0.134625         1.250000  1.137500    0.125000        1.262500
2  LTR5_Hs  1.223881   0.328358       0.134328  872.582090   1.683582     0.217463         0.000000  1.417910    0.388060        0.761194
3    SVA_E  1.228108   0.399813       0.284824   80.125584   1.224952     0.591200         0.157183  1.286694    0.402929        0.246806
4    SVA_F  1.317460   0.962963       1.825397  122.735450   1.665926     0.319735         1.095397  1.492063    0.507937        1.105820
5   AluYh7  1.237762   0.286713       0.818182  272.713287   2.024965     0.621049         0.699301  1.279720    0.552448        0.909091


In [None]:

# Candidate family names kept for later runs (this cell has no execution count).
family_list = [
    "AluYb11",
    "AluYb8a1",
    "LTR26C",
    "LTR26D",
    "LTR5_Hs",
    "SVA_E",
    "SVA_F",
    "AluYi6_4d",
    "AluYj4",
    "AluYh7",
]
more_families = [
    "ajax",
    "amalthea",
    "ananke",
    "aoede",
    "callisto",
    "COMP-subunit_5SRNA_rnd-6_family-13719",
    "COMP-subunit_5SRNA_rnd-6_family-13720",
    "COMP-subunit_ACRO_rnd-5_family-1624",
    "COMP-subunit_ACRO_rnd-5_family-1625",
    "COMP-subunit_ACRO_rnd-5_family-37",
    "COMP-subunit_ACRO_rnd-5_family-38",
    "COMP-subunit_FAM90A_rnd-6_family-7382",
    "COMP-subunit_TAF11_rnd-6_family-27360",
    "COMP-subunit_TELO_rnd-6_family-10479",
    "COMP-subunit_TELO_rnd-6_family-166",
    "COMP-subunit_VNTR_rnd-6_family-8746",
    "COMP-subunit_VNTR_rnd-6_family-8747",
    "cyllene",
    "DNM1r",
    "elara",
    "erinome",
    "FAM90Ar",
    "ghimalia",
    "harpalyke",
    "HSAT5v1",
    "HSAT5v2",
    "kalyke",
    "MER5A1r",
    "pasiphae",
    "SATR1v",
    "sinpoe",
    "SST1v",
    "teucerv1_5edge",
    "teucerv2_3edge",
    "teucerv3_internal",
    "TIFr",
    "Walusat",
    "SN5",
]

In [57]:
# Column-wise means of the coverage table for every SVA subfamily (A-F),
# one row per family. The "...un" columns are labelled "...unaligned".
sva_family_names = ["SVA_A", "SVA_B", "SVA_C", "SVA_D", "SVA_E", "SVA_F"]
coverage_columns = [
    "family",
    "leftmis", "leftindel", "leftunaligned",
    "repeatbp", "repeatmis", "repeatindel", "repeatunaligned",
    "rightmis", "rightindel", "rightunaligned",
]
mean_list = []
for family in sva_family_names:
    df = get_family_coverage_dataframe(family)
    # Leave out the id, leftbp and rightbp columns before averaging.
    no_family = df.drop(["id", "leftbp", "rightbp"], axis=1)
    column_means = no_family.mean(axis=0).tolist()
    mean_list.append([family] + column_means)
family_coverage = pd.DataFrame(mean_list, columns=coverage_columns)
print(family_coverage)



  family   leftmis  leftindel  leftunaligned    repeatbp  repeatmis  repeatindel  repeatunaligned  rightmis  rightindel  rightunaligned
0  SVA_A  1.363432   0.740247       0.297728  147.587829   1.432347     0.337597         0.106393  1.347432    0.590219        0.277094
1  SVA_B  1.435395   0.600687       0.800000  145.491065   1.704873     0.495079         0.281237  1.400687    0.711340        0.615120
2  SVA_C  1.370709   0.768879       1.752860  336.244851   1.882403     0.532059         0.822334  1.501144    1.077803        0.970252
3  SVA_D  1.407524   1.147335       0.529781  420.341693   2.034796     0.613448         0.569185  1.495298    0.918495        2.175549
4  SVA_E  1.228108   0.399813       0.284824   80.125584   1.224952     0.591200         0.157183  1.286694    0.402929        0.246806
5  SVA_F  1.317460   0.962963       1.825397  122.735450   1.665926     0.319735         1.095397  1.492063    0.507937        1.105820


In [58]:
# Filtering statistics for the SVA subfamilies; print only the headline columns.
SVAdf = build_filter_dataframe(sva_family_names)
print(
    SVAdf.loc[
        :,
        ["family", "original_number", "percentage_mapped", "through_all_filters"],
    ]
)

  family  original_number  percentage_mapped  through_all_filters
0  SVA_A           510460           0.859460                96249
1  SVA_B            13911           0.836676                 2910
2  SVA_C             2152           0.706320                  437
3  SVA_D             1767           0.497453                  319
4  SVA_E            12424           0.841597                 3209
5  SVA_F             1796           0.397550                  189


In [None]:
# TODO: calculate the percentage that are less than half the length of ... (this note was left unfinished; the reference length is not stated)