In [76]:
import pandas as pd
import subprocess
import glob, os
import requests
from scipy import stats



In [77]:
# Name of the pipeline run directory under ../outputs whose results this
# notebook analyses; later cells build their input paths from it.
output_folder_name = "f50_hg38_e100_mLoxAfr"

In [78]:
# Load the coverage CSV of a single family into a pandas dataframe
def get_family_coverage_dataframe(family_name, output_folder_name):
    """Read one family's ``repeat_alignment_coverage.csv`` as a DataFrame.

    The file is looked up, relative to the current working directory, at
    ``../outputs/<output_folder_name>/alignments/<family_name>/repeat_alignment_coverage.csv``.

    Column names are supplied here via ``names=``: ``id`` followed by the
    ``bp``, ``mis``, ``indel`` and ``un`` columns for the ``left``, ``repeat``
    and ``right`` sections, in that order (e.g. ``leftbp``, ``repeatmis``).

    Parameters
    ----------
    family_name : str
        Name of the family's directory under ``alignments``.
    output_folder_name : str
        Name of the pipeline run directory under ``../outputs``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the CSV, with the 13 columns described above.
    """
    csv_path = "/".join(
        ("..", "outputs", output_folder_name, "alignments", family_name, "repeat_alignment_coverage.csv")
    )
    column_names = ["id"]
    for section in ("left", "repeat", "right"):
        for metric in ("bp", "mis", "indel", "un"):
            column_names.append(section + metric)
    return pd.read_csv(csv_path, names=column_names)

In [79]:
# Names of the families that had at least one copy map to elephant, taken from
# the .bed files found under this run's target_beds directory.
bed_pattern = "../outputs/" + output_folder_name + "/target_beds/*.bed"
family_beds = glob.glob(bed_pattern)
# The family name is the file name (text after the last "/") up to its first ".".
families = [bed_path.split("/")[-1].split(".")[0] for bed_path in family_beds]
families.sort()

In [80]:
# Get the families we expect to see in elephant from Dfam.
url = "https://dfam.org/api/families"
params = {
    # The summary format is metadata-only and does not include
    # full details such as the consensus sequence and citations
    "format": "summary",

    # No "limit" parameter is sent, so this query is not capped at a fixed
    # number of results here (presumably Dfam then returns every match --
    # TODO confirm against the Dfam API documentation).

    # Search in elephant (clade 9785)
    "clade": "9785",

    # Include families from ancestor and descendant taxa in the results
    "clade_relatives": "both",
}
# A timeout keeps the cell from hanging forever if the server stops responding.
elephant_response = requests.get(url, params=params, timeout=60)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError below.
elephant_response.raise_for_status()
# Parse the JSON body once and reuse it for both lists.
results = elephant_response.json()["results"]
# Parallel lists: elephant_family_lengths[i] is the length of elephant_families[i].
elephant_families = [family["name"] for family in results]
elephant_family_lengths = [family["length"] for family in results]



In [81]:
# Get the families we expect to see in human from Dfam.
url = "https://dfam.org/api/families"
params = {
    # The summary format is metadata-only and does not include
    # full details such as the consensus sequence and citations
    "format": "summary",

    # No "limit" parameter is sent, so this query is not capped at a fixed
    # number of results here (presumably Dfam then returns every match --
    # TODO confirm against the Dfam API documentation).

    # Search in human (clade 9606)
    "clade": "9606",

    # Include families from ancestor and descendant taxa in the results
    "clade_relatives": "both",
}
# A timeout keeps the cell from hanging forever if the server stops responding.
human_response = requests.get(url, params=params, timeout=60)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError below.
human_response.raise_for_status()
# Parse the JSON body once and reuse it for both lists.
human_results = human_response.json()["results"]
# Parallel lists: human_family_lengths[i] is the length of human_families[i].
human_families = [family["name"] for family in human_results]
human_family_lengths = [family["length"] for family in human_results]


In [82]:
# Match our family names to Dfam's family names, adding/stripping suffixes
# where we need to in order to find a match.
#
# Match priority for each family (first hit wins):
#   1. the exact name, in the elephant list and then in the human list;
#   2. for names ending in "-int": the name without "-int", elephant then human;
#      for all other names: the name plus "_orf2", "_3end", "_5end" (in that
#      order) in the elephant list, then the same three in the human list.
# A hit in the elephant list marks the family as shared, a hit in the human
# list only as unshared. Families matching neither are discarded from the
# analysis and collected in unidentified_families -- including unmatched
# "-int" families, so every discarded family shows up in the printed report.
#
# Outputs:
#   family_name_dict          our family name -> matching Dfam family name
#   shared_families_trimmed   our names of families matched in the elephant list
#   unshared_families_trimmed our names of families matched only in the human list
#   unidentified_families     our names of families with no Dfam match
dfam_suffixes = ("_orf2", "_3end", "_5end")
# Sets give constant-time membership tests; the original lists are left untouched.
elephant_name_set = set(elephant_families)
human_name_set = set(human_families)

family_name_dict = {}
shared_families_trimmed = []
unshared_families_trimmed = []
unidentified_families = []
for family in families:
    if family.endswith("-int"):
        fallback_names = [family[:-4]]
    else:
        fallback_names = [family + suffix for suffix in dfam_suffixes]

    # Try the exact name first, then the fallback spellings. Within each group
    # every candidate is tried against elephant before any is tried against human.
    for candidate_names in ([family], fallback_names):
        elephant_match = next((name for name in candidate_names if name in elephant_name_set), None)
        if elephant_match is not None:
            family_name_dict[family] = elephant_match
            shared_families_trimmed.append(family)
            break
        human_match = next((name for name in candidate_names if name in human_name_set), None)
        if human_match is not None:
            family_name_dict[family] = human_match
            unshared_families_trimmed.append(family)
            break
    else:
        # No candidate matched either Dfam list.
        unidentified_families.append(family)
print(unidentified_families)


['%GAATG%n', 'ALR%Alpha', 'Alu', 'BSR%Beta', 'Charlie7b_Mars', 'L1M', 'L1P', 'L1P5', 'Tigger1a_Art', 'Tigger1a_Mars']


In [83]:
# Manually add ALR/Alpha and BSR/Beta, which are named differently (ALRa and BSRb) by Dfam 3.7.
# Keys are our family names, values are the Dfam names they are mapped to.
manual_dfam_names = {"ALR%Alpha": "ALRa", "BSR%Beta": "BSRb"}
unshared_unidentified_families = list(manual_dfam_names)
family_name_dict.update(manual_dfam_names)
# Only append names that are not listed yet, so re-running this cell does not
# add them a second time (duplicates would be counted twice in later tallies).
unshared_families_trimmed = unshared_families_trimmed + [
    family for family in unshared_unidentified_families
    if family not in unshared_families_trimmed
]


In [84]:
# Build lengthdict: our family name -> Dfam length of the family it was matched
# to, so the length can be used in our thresholding. Shared families take their
# length from the elephant query, unshared families from the human query.
lengthdict = {}
for family_group, dfam_names, dfam_lengths in (
    (shared_families_trimmed, elephant_families, elephant_family_lengths),
    (unshared_families_trimmed, human_families, human_family_lengths),
):
    for family in family_group:
        # Names and lengths are parallel lists, so the position of the name
        # is also the position of its length.
        idx = dfam_names.index(family_name_dict[family])
        lengthdict[family] = dfam_lengths[idx]
print(lengthdict)

{'5S': 121, '7SK': 331, '7SLRNA': 320, 'AmnHarb1': 1444, 'AmnL2-1': 2613, 'AmnSINE1': 575, 'AmnSINE2': 358, 'Arthur1': 3947, 'Arthur1A': 177, 'Arthur1B': 1225, 'Arthur1C': 363, 'Arthur2': 3700, 'BLACKJACK': 2969, 'CR1-11_Crp': 1461, 'CR1-12_AMi': 614, 'CR1-13_AMi': 724, 'CR1-16_AMi': 851, 'CR1-1_Amn': 813, 'CR1-3_Croc': 3605, 'CR1-L3A_Croc': 4288, 'CR1-L3B_Croc': 3277, 'CR1_Amni-1': 1319, 'CR1_Mam': 2204, 'Chap1_Mam': 342, 'Chap1a_Mam': 1038, 'Charlie1': 2781, 'Charlie10': 2822, 'Charlie10a': 280, 'Charlie10b': 242, 'Charlie11': 2196, 'Charlie13a': 1514, 'Charlie13b': 512, 'Charlie14a': 1166, 'Charlie15a': 224, 'Charlie15b': 1061, 'Charlie16': 3051, 'Charlie16a': 342, 'Charlie17': 2916, 'Charlie17a': 217, 'Charlie17b': 1204, 'Charlie18a': 342, 'Charlie19a': 386, 'Charlie1a': 1455, 'Charlie1b': 523, 'Charlie20a': 807, 'Charlie21a': 1213, 'Charlie22a': 491, 'Charlie23a': 339, 'Charlie24': 2449, 'Charlie25': 2524, 'Charlie26a': 325, 'Charlie29a': 748, 'Charlie29b': 1194, 'Charlie2a': 2861

In [85]:
# Cache every family's coverage CSV as a dataframe, keyed by family name, so
# later cells can reuse it instead of re-reading the file each time.
coverage_dataframes = {}
for family in families:
    family_frame = get_family_coverage_dataframe(
        family_name=family,
        output_folder_name=output_folder_name,
    )
    coverage_dataframes[family] = family_frame

In [86]:
# Thresholds an alignment has to satisfy to count towards its family.
# max_repeat_nonmatch: maximum percentage of basepairs of the target allowed to be
# unmatched (deleted, mismatched or excluded from the alignment) within the repeat.
max_repeat_nonmatch = 20.0
# max_side_nonmatch: maximum percentage of unmatched basepairs allowed in a flank.
# Only one of the two sides has to stay at or below this value.
max_side_nonmatch = 50.0
# fraction_of_full_length: the repeat section must cover at least this fraction
# of the family's full length as stored in lengthdict.
fraction_of_full_length = 0.5

# num_meeting_requirement: family name -> number of rows in its coverage
# dataframe that pass the repeat, length and (left or right) side thresholds.
num_meeting_requirement = {}
for family in [*shared_families_trimmed, *unshared_families_trimmed]:
    family_coverage = coverage_dataframes[family]

    # Total unmatched amount per section: indels + unaligned + mismatches.
    repeat_nonmatch = family_coverage["repeatindel"] + family_coverage["repeatun"] + family_coverage["repeatmis"]
    left_nonmatch = family_coverage["leftindel"] + family_coverage["leftun"] + family_coverage["leftmis"]
    right_nonmatch = family_coverage["rightindel"] + family_coverage["rightun"] + family_coverage["rightmis"]
    min_repeat_bp = fraction_of_full_length * lengthdict[family]

    # Boolean masks, one entry per row of the family's dataframe.
    repeat_threshold = repeat_nonmatch <= max_repeat_nonmatch
    length_threshold = family_coverage["repeatbp"] >= min_repeat_bp
    left_threshold = left_nonmatch <= max_side_nonmatch
    right_threshold = right_nonmatch <= max_side_nonmatch
    side_threshold = left_threshold | right_threshold

    qualifying_rows = family_coverage.loc[repeat_threshold & length_threshold & side_threshold]
    num_meeting_requirement[family] = len(qualifying_rows)


In [87]:
# Pair every family with its number of qualifying alignments, kept separately
# for the shared and the unshared families: lists of (family, count) tuples.
shared_meeting_requirement = [
    (shared_family, num_meeting_requirement[shared_family])
    for shared_family in shared_families_trimmed
]
unshared_meeting_requirement = [
    (unshared_family, num_meeting_requirement[unshared_family])
    for unshared_family in unshared_families_trimmed
]


In [88]:
# threshold: number of qualifying alignments at or above which we call a family shared
threshold = 1


def _split_at_threshold(family_counts, cutoff):
    """Split (family, count) pairs into those with count >= cutoff and those with count < cutoff."""
    at_or_above = [entry for entry in family_counts if entry[1] >= cutoff]
    below = [entry for entry in family_counts if entry[1] < cutoff]
    return at_or_above, below


# positive = the alignments say the family is shared
# negative = the alignments say the family is not shared
# Families Dfam lists for elephant are the expected positives; the rest are expected negatives.
true_positives, false_negatives = _split_at_threshold(shared_meeting_requirement, threshold)
false_positives, true_negatives = _split_at_threshold(unshared_meeting_requirement, threshold)

for label, group in (
    ("FN:", false_negatives),
    ("TN:", true_negatives),
    ("FP:", false_positives),
    ("TP:", true_positives),
):
    print(label, len(group))
print("Errors: False positives (we expect these families NOT to be shared but we are seeing enough alignments meeting the thresholds)")
print(false_positives)
print("Errors: False negatives (we expect these families  to be shared but we are NOT seeing enough alignments meeting the thresholds)")
print(false_negatives)


FN: 279
TN: 602
FP: 4
TP: 418
Errors: False positives (we expect these families NOT to be shared but we are seeing enough alignments meeting the thresholds)
[('COMP-subunit_TELO_rnd-6_family-166', 1), ('MLT1A0', 1), ('MSR1', 30), ('Tigger4', 1)]
Errors: False negatives (we expect these families  to be shared but we are NOT seeing enough alignments meeting the thresholds)
[('AmnHarb1', 0), ('AmnL2-1', 0), ('Arthur1', 0), ('Arthur1B', 0), ('Arthur2', 0), ('BLACKJACK', 0), ('CR1-11_Crp', 0), ('CR1-3_Croc', 0), ('CR1-L3A_Croc', 0), ('CR1-L3B_Croc', 0), ('CR1_Amni-1', 0), ('CR1_Mam', 0), ('Chap1a_Mam', 0), ('Charlie1', 0), ('Charlie16', 0), ('Charlie17', 0), ('Charlie21a', 0), ('Charlie24', 0), ('Charlie25', 0), ('Charlie29b', 0), ('Charlie2a', 0), ('Charlie2b', 0), ('Charlie30a', 0), ('Charlie30b', 0), ('Charlie4', 0), ('Charlie5', 0), ('Charlie6', 0), ('Charlie7', 0), ('Cheshire', 0), ('Chompy-6_Croc', 0), ('DIRS-1a_Amnio', 0), ('DIRS-1b_Amnio', 0), ('DIRS-1c_Amnio', 0), ('DNA1_Mam', 0), 

In [89]:
# Fetch the CSV rows of one family's alignments that qualify according to our
# thresholds (useful for looking at false positives).
# Change family_name to look at a different family.
family_name = "MSR1"
family_coverage = coverage_dataframes[family_name]
repeat_threshold = (family_coverage["repeatindel"] + family_coverage["repeatun"] + family_coverage["repeatmis"] <= max_repeat_nonmatch)
# Use the inspected family's own length. Indexing lengthdict with the loop
# variable `family` left over from an earlier cell would apply another
# family's length threshold to these rows.
length_threshold = (family_coverage["repeatbp"] >= fraction_of_full_length * lengthdict[family_name])
left_threshold = (family_coverage["leftindel"] + family_coverage["leftun"] + family_coverage["leftmis"] <= max_side_nonmatch)
right_threshold = (family_coverage["rightindel"] + family_coverage["rightun"] + family_coverage["rightmis"] <= max_side_nonmatch)
qualifying_rows = family_coverage.loc[repeat_threshold & length_threshold & (left_threshold | right_threshold)]
# Store the count under the inspected family's key so no other family's entry is overwritten.
num_meeting_requirement[family_name] = len(qualifying_rows.index)

qualifying_rows

Unnamed: 0,id,leftbp,leftmis,leftindel,leftun,repeatbp,repeatmis,repeatindel,repeatun,rightbp,rightmis,rightindel,rightun
9,MSR1;Satellite;4264374;5146316,100,10.0,0.0,0.0,425,13.41,3.06,0.0,100,2.0,4.0,77.0
12,MSR1;Satellite;4264508;5146470,100,7.0,0.0,0.0,102,10.78,0.98,0.0,100,7.0,0.0,8.0
23,MSR1;Satellite;4267319;5149904,100,18.0,0.0,0.0,202,10.89,0.99,0.0,100,15.0,1.0,0.0
26,MSR1;Satellite;4267453;5150064,100,18.0,0.0,14.0,94,8.51,1.06,0.0,100,11.0,0.0,0.0
28,MSR1;Satellite;4267511;5150133,100,2.0,1.0,84.0,392,13.52,1.79,0.0,100,17.0,0.0,0.0
31,MSR1;Satellite;4268136;5150878,100,11.0,8.0,1.0,85,11.76,1.18,0.0,100,9.0,3.0,0.0
36,MSR1;Satellite;4270378;5153665,100,0.0,0.0,100.0,859,7.22,2.1,0.35,100,16.0,10.0,20.0
44,MSR1;Satellite;4278800;5163729,100,13.0,0.0,0.0,390,12.31,1.28,0.0,100,18.0,5.0,5.0
46,MSR1;Satellite;4280836;5166159,100,12.0,1.0,0.0,93,13.98,0.0,0.0,100,14.0,0.0,7.0
47,MSR1;Satellite;4280991;5166352,100,7.0,0.0,0.0,135,10.37,0.0,0.0,100,10.0,4.0,0.0


In [90]:
# Count the lines in each family's coverage CSV, i.e. the number of alignments
# we did / repeats that made it through our filters for that family.
# NOTE: the loop variable is called family_name, so after this cell that name
# holds the last entry of `families`.
num_csv_entries_dict = {}
for family_name in families:
    file_name = "/".join(
        ["..", "outputs", output_folder_name, "alignments", family_name, "repeat_alignment_coverage.csv"]
    )
    with open(file_name, 'r') as fp:
        # Iterate the file object line by line instead of materialising a list.
        lines = sum(1 for _line in fp)
    num_csv_entries_dict[family_name] = lines

In [91]:
# For every false-negative family, print its name next to its abundance after
# filtering (the number of lines in its coverage CSV).
for false_negative_entry in false_negatives:
    fn = false_negative_entry[0]
    print(fn, num_csv_entries_dict[fn])

AmnHarb1 85
AmnL2-1 482
Arthur1 607
Arthur1B 566
Arthur2 459
BLACKJACK 434
CR1-11_Crp 14
CR1-3_Croc 365
CR1-L3A_Croc 34
CR1-L3B_Croc 13
CR1_Amni-1 440
CR1_Mam 652
Chap1a_Mam 426
Charlie1 329
Charlie16 398
Charlie17 72
Charlie21a 214
Charlie24 463
Charlie25 277
Charlie29b 403
Charlie2a 661
Charlie2b 716
Charlie30a 167
Charlie30b 235
Charlie4 163
Charlie5 568
Charlie6 40
Charlie7 1120
Cheshire 151
Chompy-6_Croc 1
DIRS-1a_Amnio 760
DIRS-1b_Amnio 842
DIRS-1c_Amnio 862
DNA1_Mam 57
ERV3-16A3_I 857
ERV3-16A3_LTR 333
ERVL-B4-int 190
ERVL-E-int 2043
ERVL40-int 210
EUTREP15 52
EUTREP6 7
EutTc1-N1 8
EutTc1-N2 43
EuthAT-2 230
Eutr2 86
Eutr2B 137
Eutr5 134
FordPrefect 146
FordPrefect_a 76
HAL1 5492
HAL1M8 340
HAL1ME 2337
HAL1b 1282
HERV16-int 572
HY1 100
HY3 105
HY4 44
HY5 6
Helitron1Na_Mam 113
Helitron1Nb_Mam 118
Kanga1 193
Kanga1a 210
Kanga1d 138
Kanga2_a 449
L1M4 2526
L1M4a1 450
L1M4a2 257
L1M4b 505
L1M5 11196
L1M6 1814
L1M7 853
L1M8 327
L1MB4 1466
L1MB5 1559
L1MC 1160
L1MC2 927
L1MC3 2190
L1MC4

In [92]:
# t-test: are we more likely to detect a family to be shared if it is shorter -- yes
# Compares the Dfam lengths of the true-positive and false-negative families.
# ttest_ind is called with its default arguments.
tp_len = [lengthdict[entry[0]] for entry in true_positives]
fn_len = [lengthdict[entry[0]] for entry in false_negatives]
for label, lengths in (
    ("mean length of TP:", tp_len),
    ("mean length of FN:", fn_len),
):
    print(label, sum(lengths) / len(lengths))
print("t-test:", stats.ttest_ind(tp_len, fn_len))


mean length of TP: 445.8133971291866
mean length of FN: 1628.7706093189963
t-test: TtestResult(statistic=-16.216214205812708, pvalue=2.1404710665425993e-50, df=695.0)


In [93]:
# t-test: are we more likely to detect a family to be shared if it is more numerous -- yes
# Compares the coverage-CSV line counts of the true-positive and false-negative families.
# ttest_ind is called with its default arguments.
tp_num = [num_csv_entries_dict[entry[0]] for entry in true_positives]
fn_num = [num_csv_entries_dict[entry[0]] for entry in false_negatives]
for label, counts in (
    ("mean number of repeats of TP:", tp_num),
    ("mean number of repeats of FN:", fn_num),
):
    print(label, sum(counts) / len(counts))
print("t-test:", stats.ttest_ind(tp_num, fn_num))


mean number of repeats of TP: 1529.3875598086124
mean number of repeats of FN: 657.5806451612904
t-test: TtestResult(statistic=2.080047882413263, pvalue=0.03788701738052578, df=695.0)
