# Imports & variable definition

In [None]:
import csv
import os
import datetime
import pandas as pd
import numpy as np

In [None]:
# Phenotype table (tab-separated) that create_couples reads
inFile = ".../relatedness_data_450k_participants.tsv"
# Related pairs written by create_couples and read by kinship_interpretation
outFile = ".../related_individuals.txt"
# Pairs plus inferred relationship, written by kinship_interpretation
outFileFinal = ".../related_individuals_with_interpretation.txt"
# Base path (no extension) handed to samples_to_remove, which appends the suffix
samplesToRemove = ".../related_samples_to_remove"

**To-do: explain how the input file was obtained.**

# Create couples

**Input** data contains the following fields:

> **22011** - field is used to identify pairs of participants who are estimated to be genetically related. More [here](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=22011).

> **22012** -  field gives the kinship coefficient associated with each pair of participants. More [here](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=22012).

> **22018** - field identifies the persons excluded from the calculations. More [here](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=22018).

**Output**: 

Tab-separated file of format: `eid-1` `eid-2` `kinship-coefficient`

In [None]:
def create_couples(inFile, outFile):
    '''
    Build the list of related pairs from a phenotype file.

    inFile  -- tab-separated phenotype file with a header row. The first
               column is the participant identifier; columns whose header
               contains 22011 hold pair identifiers, columns whose header
               contains 22012 hold the matching kinship coefficients, and an
               optional column whose header contains 22018 holds the
               exclusion code.
    outFile -- tab-separated output, one row per pair: <ID1, ID2, kinship>

    Pair and kinship columns are matched by position (first 22011 column with
    first 22012 column, and so on); a pair column without a kinship partner is
    ignored. Participants whose exclusion code is 1 or 2 are left out, and a
    pair identifier is only written when exactly two remaining participants
    share it. The kinship value written is the one stored on the first
    participant's row. An input file without a header produces an empty
    output file.
    '''
    # define fields (matched as substrings of the header names)
    exclude = "22018"
    pairs = "22011"
    kinship = "22012"

    with open(inFile, "rt") as pheno_File, open(outFile, "w") as final:
        # parse csv files
        reader = csv.reader(pheno_File, delimiter="\t")
        writer = csv.writer(final, delimiter="\t")

        # read first row, save the indexes of the fields
        fields = next(reader, None)
        if fields is None:
            return

        field_exclude = None
        field_pairs = []
        field_kinship = []
        # enumerate gives the real column position even if header names repeat
        for idx, name in enumerate(fields):
            if exclude in name:
                field_exclude = idx
            # a field may span several columns (X-0.0, X-0.1, X-0.2...)
            elif pairs in name:
                field_pairs.append(idx)
            elif kinship in name:
                field_kinship.append(idx)
        paired_columns = list(zip(field_pairs, field_kinship))

        # pair identifier -> [(participant id, kinship), ...] in file order
        members = {}

        for r in reader:
            if not r:
                continue
            # cells are strings, so the exclusion code has to be parsed;
            # comparing the raw cell with the integers 1 and 2 never matches
            if field_exclude is not None and _is_excluded(r[field_exclude]):
                continue
            for pair_col, kinship_col in paired_columns:
                pair_id = r[pair_col]
                if pair_id != "":
                    members.setdefault(pair_id, []).append((r[0], r[kinship_col]))

        for couple in members.values():
            if len(couple) == 2:
                (first_id, first_kinship), (second_id, _) = couple
                writer.writerow([first_id, second_id, first_kinship])


def _is_excluded(value):
    '''Return True when an exclusion cell holds code 1 or 2 ("1", "2.0", ...).'''
    try:
        return float(value) in (1.0, 2.0)
    except ValueError:
        # empty or non-numeric cell: no exclusion recorded
        return False

In [None]:
# Run the pair-extraction step and report how long it took
start = datetime.datetime.now()
create_couples(inFile, outFile)
elapsed = datetime.datetime.now() - start
print(f"Step finished, elapsed time: {elapsed}")

In [None]:
!head -5 $outFile

In [None]:
!wc -l $outFile

# Interpreting kinship coefficient

**Input**:

The tab-separated file generated in the previous step (`eid-1` `eid-2` `kinship-coefficient`).




**Output**: 

Tab-separated file of format: `eid-1` `eid-2` `kinship-coefficient` `kinship-interpretation`

In [None]:
def kinship_interpretation(inFile, outFile):
    '''
    Append the inferred relationship to every pair.

    inFile  -- the "create_couples" output: tab-separated rows whose third
               column is the kinship coefficient
    outFile -- same rows with one extra column holding the relationship label
    '''
    # (lower bound, label), checked from the closest relationship downwards;
    # the first bound the coefficient reaches decides the label
    bands = (
        (0.354, "duplicate/MZ twin"),
        (0.177, "1st-degree"),
        (0.0884, "2nd-degree"),
        (0.0442, "3rd-degree"),
    )

    with open(inFile, "r") as source, open(outFile, "w") as target:
        out = csv.writer(target, delimiter="\t")

        for record in csv.reader(source, delimiter="\t"):
            coefficient = float(record[2])
            label = "no_inffered_relationship"
            for lower_bound, name in bands:
                if coefficient >= lower_bound:
                    label = name
                    break
            out.writerow(record + [label])

In [None]:
# Run the kinship-interpretation step and report how long it took
start = datetime.datetime.now()
kinship_interpretation(outFile, outFileFinal)
elapsed = datetime.datetime.now() - start
print(f"Step finished, elapsed time: {elapsed}")

In [None]:
!head -5 $outFileFinal

In [None]:
!wc -l $outFileFinal

# Remove related samples

**Input**:

The tab-separated file generated in the previous step (`eid-1` `eid-2` `kinship-coefficient` `kinship-interpretation`).




**Output**: 

Tab-separated file with the samples to exclude, format: `eid` 

In [None]:
def samples_to_remove(inFile, outFileNoExt):
    '''
    Create the list of related samples that need to be excluded.

    inFile       -- the "kinship_interpretation" output: tab-separated rows
                    <ID1, ID2, kinship, interpretation>
    outFileNoExt -- output path without extension; the result is written to
                    outFileNoExt + "_final.txt", one sample per row, sorted

    Only pairs that are not labelled "3rd-degree" or
    "no_inffered_relationship" are considered. While any such pair is left,
    the sample that appears in the most remaining pairs is marked for removal
    and every pair containing it is dropped. On a tie, the sample seen first
    among the remaining pairs is taken.
    '''
    with open(inFile, "r") as inputf:
        reader = csv.reader(inputf, delimiter="\t")
        # list of all couples 2nd degree or closer (blank rows are skipped)
        pairs = [
            [r[0], r[1]]
            for r in reader
            if r and r[3] != "3rd-degree" and r[3] != "no_inffered_relationship"
        ]

    removed = []
    while pairs:
        # count in how many remaining pairs each sample appears
        counts = {}
        for first, second in pairs:
            counts[first] = counts.get(first, 0) + 1
            counts[second] = counts.get(second, 0) + 1

        most_related = max(counts, key=counts.get)
        removed.append(most_related)
        # rebuild the list instead of calling remove() while iterating over
        # it: that skips elements, leaving pairs of an already removed sample
        # behind and distorting the counts of the next round
        pairs = [p for p in pairs if most_related not in p]

    # every sample is picked at most once, so sorting is all that is left to
    # do; writing the final file here avoids building shell commands from paths
    with open(outFileNoExt + "_final.txt", "w") as final:
        writer = csv.writer(final, delimiter="\t")
        for sample in sorted(removed):
            writer.writerow([sample])

In [None]:
# Run the sample-selection step and report how long it took
start = datetime.datetime.now()
samples_to_remove(outFileFinal, samplesToRemove)
elapsed = datetime.datetime.now() - start
print(f"Step finished, elapsed time: {elapsed}")

In [None]:
!head -5 $samplesToRemove'_final.txt'

In [None]:
!wc -l $samplesToRemove'_final.txt'

**Note**: running this algorithm a second time may produce a slightly different result, because some samples have the same relatedness count; in that case we delete a random(?) one of them.