In [7]:
#!/usr/bin/env python
'''
Created on 

@author: manuel.dominguezbecerra@nhs.net

'''


# Import libraries 
import argparse                          # pip install argparse
import pandas as pd                      # pip install dash
import os
import subprocess
import numpy as np

import dash                              # pip install dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Output, Input

from dash_extensions import Lottie       # pip install dash-extensions
import dash_bootstrap_components as dbc  # pip install dash-bootstrap-components
import plotly.express as px              # pip install plotly
from datetime import date
import calendar
from wordcloud import WordCloud          # pip install wordcloud
import sqlite3                           # pip install sqlite3
import plotly.graph_objects as go
import dash_table
from dash.dependencies import Input, Output, State


# Take the samples path/name_file

# The command-line interface stays disabled while the paths are hard-coded
# below. Both options are file paths, so both are declared as strings.
#parser = argparse.ArgumentParser()
#parser.add_argument('--vcf1', type=str, required=True)
#parser.add_argument('--vcf2', type=str, required=True)
#args = parser.parse_args()

#Name_file1 = args.vcf1 # Remove comments when finished
#Name_file2 = args.vcf2

## Delete this when finished
#Name_file1 = "/Users/monkiky/Desktop/Do-these-samples-belong-to-the-same-patient/Samples/W2008872_S8-J4PJL_copy.vcf"
Name_file1 = "//Users/monkiky/Desktop/VCF-matcher/Samples/Myeloid_1.2/M1.vcf"
#Name_file1 = "/Users/monkiky/Desktop/Do-these-samples-belong-to-the-same-patient/Samples/W2103014_S10-JGFJG.vcf"
Name_file2 = "/Users/monkiky/Desktop/VCF-matcher/Samples/Myeloid_1.2/M1.vcf"


### 1. Remove the header of input files

def _read_vcf_table(vcf_path):
    """Load the tab-separated body of a VCF file into a DataFrame.

    Every line starting with ``##`` is discarded; the remaining lines (the
    ``#CHROM`` column header and the records) are parsed as a tab-separated
    table.

    The filtering is done in Python instead of shelling out to ``sed``, so
    it works for paths containing spaces or shell metacharacters, does not
    need ``sed`` on the host, leaves no temporary files in the working
    directory, and raises ``FileNotFoundError`` when the file is missing.

    Args:
        vcf_path: Path of the VCF file to read.

    Returns:
        pandas.DataFrame with one row per record.
    """
    import io  # local import: only this helper needs it

    with open(vcf_path) as handle:
        body = "".join(line for line in handle if not line.startswith("##"))
    return pd.read_csv(io.StringIO(body), delimiter="\t")


# Files to pandas object
dataA = _read_vcf_table(Name_file1)
dataB = _read_vcf_table(Name_file2)

# If the pd dataframes have more than 10 columns
# that means there are more than 1 sample in the vcf file

def _select_sample_name(table, vcf_path):
    """Return the name of the sample column to analyse in *table*.

    Columns from index 9 onwards are treated as sample columns. With a
    single sample its name is returned directly; with several, the user is
    asked to type one and is asked again until the answer is an existing
    column, so a typo cannot surface later as a ``KeyError``.

    Args:
        table: DataFrame loaded from the VCF file.
        vcf_path: Path of that file; only its base name is shown to the user.

    Returns:
        The chosen sample column name.

    Raises:
        ValueError: If the table has no sample column at all.
    """
    file_name = os.path.basename(vcf_path)
    sample_names = table.columns.tolist()[9:]

    if not sample_names:
        raise ValueError('File {} has no sample column.'.format(file_name))

    if len(sample_names) == 1:
        print('File', file_name, 'contains one sample only.')
        return sample_names[0]

    prompt = 'It seems that {} has more than one sample. Please introduce the name of the sample you wish to analyse:  '.format(file_name)
    while True:
        # Surrounding whitespace is stripped so a stray space does not
        # invalidate an otherwise correct name.
        chosen = input(prompt).strip()
        if chosen in sample_names:
            return chosen
        print('Sample', repr(chosen), 'not found. Available samples:',
              ', '.join(str(name) for name in sample_names))


Name_sample1 = _select_sample_name(dataA, Name_file1)
Name_sample2 = _select_sample_name(dataB, Name_file2)

### 2. Filter the variants: FILTER = PASS and Variant Frequency (VF) > 0.4

# The sample column packs several values separated by ':' and the FORMAT
# column names them in the same order (e.g. GT:GQ:AD:VF). Each value is moved
# to its own column so GT and VF can be used directly.

def _split_sample_fields(table, sample_name):
    """Expand the colon-separated sample column into one column per field.

    Keys and values are paired record by record, using each record's own
    FORMAT string. A record that lists fewer values than keys, or that uses a
    different FORMAT from the first record, gets NaN for the fields it lacks
    instead of shifting or dropping columns for the whole table.

    Args:
        table: DataFrame loaded from a VCF file (needs a FORMAT column).
        sample_name: Name of the sample column to expand.

    Returns:
        DataFrame indexed like *table* with one column per FORMAT key.
    """
    keys_per_row = table["FORMAT"].str.split(":")
    values_per_row = table[sample_name].str.split(":")
    records = [
        # A missing FORMAT or sample cell yields an empty record (all NaN).
        dict(zip(keys, values))
        if isinstance(keys, list) and isinstance(values, list)
        else {}
        for keys, values in zip(keys_per_row, values_per_row)
    ]
    return pd.DataFrame(records, index=table.index)


df2 = _split_sample_fields(dataA, Name_sample1)
dataA = pd.concat([dataA, df2], axis=1)

# Same for dataB
df3 = _split_sample_fields(dataB, Name_sample2)
dataB = pd.concat([dataB, df3], axis=1)

# Now, let's filter the variants.
# Variants that do not pass the FILTER status are discarded. FILTER is a
# mandatory VCF column, so this step needs no condition.
# The variant frequency (VF) field is optional: when it is present, only
# variants with VF above the threshold are kept.

def _filter_variants(table, min_vf=0.4):
    """Keep the PASS variants and, if VF is available, those above *min_vf*.

    The result is built with ``assign`` and returned as an independent copy,
    so later column assignments do not write into a slice of the original
    frame (which triggers pandas' SettingWithCopyWarning).

    Args:
        table: DataFrame with a FILTER column and, optionally, a VF column.
        min_vf: Exclusive lower bound for VF.

    Returns:
        A new DataFrame with the rows that survive both filters.
    """
    kept = table[table["FILTER"] == "PASS"]
    if "VF" in kept.columns:
        kept = kept.assign(VF=kept["VF"].astype(float))
        kept = kept[kept["VF"] > min_vf]
    return kept.copy()


dataA = _filter_variants(dataA)
dataB = _filter_variants(dataB)
    

### 3. Let's create the Ben's Code, this is CHROM+POS+REF+ALT

def _with_variant_key(table):
    """Return *table* with a CHROMPOSREDALT column identifying each variant.

    The key is "<CHROM>.<POS><REF>.<ALT>"; there is no separator between POS
    and REF. ``assign`` returns a new frame, so nothing is written into a
    filtered slice of another DataFrame.
    """
    key = (
        table["#CHROM"].astype(str) + "." + table["POS"].astype(str)
        + table["REF"] + "." + table["ALT"]
    )
    return table.assign(CHROMPOSREDALT=key)


dataA = _with_variant_key(dataA)
dataB = _with_variant_key(dataB)



### 4. Take common Ben's Codes in the same column and put the GT of each (both  copies) in independent columns

# One genotype per variant key and per file. A key repeated inside one file
# keeps its first record, so the join below stays one-to-one.
df3 = dataA[['CHROMPOSREDALT', 'GT']].drop_duplicates('CHROMPOSREDALT')
df4 = dataB[['CHROMPOSREDALT', 'GT']].drop_duplicates('CHROMPOSREDALT')
# Stacked view of both files, kept for the draft snippets that refer to it.
semi_final_df = pd.concat([df3, df4])

# Inner join on the key: only variants present in BOTH files are compared,
# and each genotype stays attached to the file it came from. Rows are sorted
# by key so the table order does not depend on the order of the input files.
final_df = (
    df3.rename(columns={'GT': 'Sample1'})
    .merge(df4.rename(columns={'GT': 'Sample2'}), on='CHROMPOSREDALT', how='inner')
    .sort_values('CHROMPOSREDALT')
    .reset_index(drop=True)
)

### 5. See how many GT match

# Variables needed for both type of reports

r0 = os.path.basename(Name_file1) # Get the  file name of a path/file_name 
r1 = Name_sample1                 # Get the name of the sample 1
r2 = os.path.basename(Name_file2)
r3 = Name_sample2

# A position matches when both files report exactly the same GT string.
final_df["Matches"] = final_df["Sample1"].eq(final_df["Sample2"])

r4 = int(final_df["Matches"].sum())  # positions with the same genotype
r6 = len(final_df)                   # positions compared
r5 = r6 - r4                         # positions with a different genotype

final_df.columns = ['CHROM.POS.REF.ALT',os.path.basename(Name_file1),os.path.basename(Name_file2),"Matches" ]

### 6.1 Generate results in the terminal

if r6 == 0:
    # An empty join means there is no common position between the samples.
    report0 = '''
 _____________________________  REPORT  ________________________________________ 

vcf 1: {0}  AND its sample name: {1}  
vcf 2:  {2}  AND its sample name: {3}

   No common positions found between samples

 ____________________________ END REPORT  _______________________________________
'''
    print(report0.format(r0,r1,r2,r3,))

else:
    # Fraction of the compared positions whose genotypes match.
    r7 = r4 / r6
    report = '''
 _____________________________  REPORT  ________________________________________ 

vcf 1: {0}  AND its sample name: {1}  
vcf 2:  {2}  AND its sample name: {3}

                                                  Homozigous: 
Number of positions with the same genotype: {4} 
                                                  Heterozigous:
                                                
                                                  
Number of positions with diferent genotype: {5} 
                                                  


Total positions compared: {6}
Percentage in common: {4}/{6}= {7}
 ____________________________ END REPORT  _______________________________________
'''
    print(report.format(r0,r1,r2,r3,r4,r5,r6,r7))


# draft

#(semi_final_df.pivot_table(index = ['CHROMPOSREDALT'], aggfunc ='size')==2).value_counts()[1] # position same genotipe
#position_same_genotype = (semi_final_df.pivot_table(index = ['CHROMPOSREDALT'], aggfunc ='size')==2).value_counts()[1]
#total_hom = semi_final_df[semi_final_df.GT == '1/1'].shape[0]
#total_het = semi_final_df[semi_final_df.GT != '1/1'].shape[0]

It seems that M1.vcf has more than one sample. Please introduce the name of the sample you wish to analyse:  369884619
It seems that M1.vcf has more than one sample. Please introduce the name of the sample you wish to analyse:  369884619

 _____________________________  REPORT  ________________________________________ 

vcf 1: M1.vcf  AND its sample name: 369884619  
vcf 2:  M1.vcf  AND its sample name: 369884619

                                                  Homozigous: 
Number of positions with the same genotype: 1191 
                                                  Heterozigous:
                                                
                                                  
Number of positions with diferent genotype: 0 
                                                  


Total positions compared: 1191
Percentage in common: 1191/1191= 1.0
 ____________________________ END REPORT  _______________________________________



In [17]:
# Split the genotype held in the third column of final_df into its two
# alleles. Both '/' (unphased) and '|' (phased) are accepted as separators;
# a missing genotype, or one with a single allele, yields NaN instead of
# raising.
alleles = final_df.iloc[:, 2].str.split(r'[/|]')
final_df['Genotype1'] = alleles.str[0]
final_df['Genotype2'] = alleles.str[1]

# Three-state flag per position:
#   True  -> genotypes match between files and both alleles are equal
#   False -> genotypes match between files but the alleles differ
#   <NA>  -> genotypes do not match between files
# NOTE(review): presumably meant to feed the "Homozigous"/"Heterozigous"
# lines of the text report - confirm before relying on it.
same_alleles = final_df['Genotype1'].eq(final_df['Genotype2'])
final_df['Genotype_comparative'] = same_alleles.astype('boolean').where(final_df['Matches'])
final_df

Unnamed: 0,CHROM.POS.REF.ALT,M1.vcf,M1.vcf.1,Matches,Genotype1,Genotype2,Genotype_comparative
0,1.12009911G.A,0/0,0/0,True,0,0,True
1,1.12009955C.T,0/1,0/1,True,0,1,False
2,1.12009956G.A,0/0,0/0,True,0,0,True
3,1.12010465G.A,0/0,0/0,True,0,0,True
4,1.12010466C.A,0/0,0/0,True,0,0,True
...,...,...,...,...,...,...,...
1186,X.49084492C.T,1/1,1/1,True,1,1,True
1187,X.49088374G.A,0/0,0/0,True,0,0,True
1188,X.8503641G.A,0/1,0/1,True,0,1,False
1189,X.8504833C.T,0/1,0/1,True,0,1,False


In [4]:
# NOTE(review): unfinished cell - the assignment below has no right-hand
# side, so it is a SyntaxError as written. Complete or delete it before
# running the notebook top to bottom.
final_df['Genotype1'] =

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,W2008872,GT,GQ,AD,VF,NL,SB,GQX,CHROMPOSREDALT
10,chr2,25463483,rs2289195,G,A,100.0,PASS,"DP=2466;TI=NM_153759,NM_022552,NM_175629;GI=DN...",GT:GQ:AD:VF:NL:SB:GQX,"1/1:100:12,2445:0.9915:20:-100.0000:100",1/1,100,122445,0.9915,20,-100.0,100,chr2.25463483G.A
13,chr2,25466888,rs2289093,G,T,100.0,PASS,"DP=1839;TI=NM_153759,NM_022552,NM_175629;GI=DN...",GT:GQ:AD:VF:NL:SB:GQX,"1/1:100:1,1835:0.9978:20:-100.0000:100",1/1,100,11835,0.9978,20,-100.0,100,chr2.25466888G.T
18,chr2,25469502,rs2276598,C,T,100.0,PASS,"DP=999;TI=NM_153759,NM_022552,NM_175629;GI=DNM...",GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:475,519:0.5195:20:-100.0000:100",0/1,100,475519,0.5195,20,-100.0,100,chr2.25469502C.T
23,chr2,25523196,rs7596387,T,C,100.0,PASS,"DP=1151;TI=NM_022552,NM_175629,NM_175630;GI=DN...",GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:574,576:0.5004:20:-100.0000:100",0/1,100,574576,0.5004,20,-100.0,100,chr2.25523196T.C
33,chr2,198267770,rs2914980,G,GAA,100.0,PASS,DP=5670;TI=NM_012433;GI=SF3B1;FC=Noncoding,GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:598,5072:0.8945:20:-100.0000:100",0/1,100,5985072,0.8945,20,-100.0,100,chr2.198267770G.GAA
36,chr3,38182136,rs4988457,C,G,100.0,PASS,"DP=1634;TI=NM_001172567,NM_001172568,NM_002468...",GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:827,803:0.4914:20:-100.0000:100",0/1,100,827803,0.4914,20,-100.0,100,chr3.38182136C.G
38,chr3,105453034,rs6768096,A,G,100.0,PASS,DP=1443;TI=NM_170662;GI=CBLB;FC=Silent,GT:GQ:AD:VF:NL:SB:GQX,"1/1:100:1,1442:0.9993:20:-100.0000:100",1/1,100,11442,0.9993,20,-100.0,100,chr3.105453034A.G
51,chr4,55141055,rs1873778,A,G,100.0,PASS,DP=2903;TI=NM_006206;GI=PDGFRA;FC=Synonymous_P...,GT:GQ:AD:VF:NL:SB:GQX,"1/1:100:10,2893:0.9966:20:-100.0000:100",1/1,100,102893,0.9966,20,-100.0,100,chr4.55141055A.G
57,chr4,55599436,rs1008658,T,C,100.0,PASS,"DP=1825;TI=NM_001093772,NM_000222;GI=KIT,KIT;F...",GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:896,929:0.5090:20:-100.0000:100",0/1,100,896929,0.509,20,-100.0,100,chr4.55599436T.C
59,chr4,106155199,.,C,T,100.0,PASS,"DP=3276;TI=NM_017628,NM_001127208;GI=TET2,TET2...",GT:GQ:AD:VF:NL:SB:GQX,"0/1:100:1592,1684:0.5140:20:-100.0000:100",0/1,100,15921684,0.514,20,-100.0,100,chr4.106155199C.T


In [5]:
# Display the comparison table as the output of this cell.
final_df

Unnamed: 0,CHROM.POS.REF.ALT,W2008872_S8-J4PJL.vcf,W2008872_S8-J4PJL.vcf.1,Matches
0,chr10.112361870A.G,1/1,1/1,True
1,chr11.32417945T.C,0/1,0/1,True
2,chr11.534242A.G,0/1,0/1,True
3,chr11.534403GCCCAGGCCCAGC.G,0/1,0/1,True
4,chr12.11803220A.C,1/1,1/1,True
5,chr12.11803228A.G,1/1,1/1,True
6,chr13.28607916T.A,1/1,1/1,True
7,chr13.28607989T.G,1/1,1/1,True
8,chr17.74732959G.T,0/1,0/1,True
9,chr17.74733099G.A,1/1,1/1,True


### 

In [None]:

# DO NOT DELETE ######################################


### 6. Generate a report in a dashboard
# Bootstrap themes by Ann: https://hellodash.pythonanywhere.com/theme_explorer
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.SIMPLEX])


def _logo_column(image_path, width):
    """Return a column holding one logo image inside a light outlined card."""
    return dbc.Col([
        dbc.Card([
            dbc.CardImg(src=image_path)
        ], color="light", outline=True)
    ], width=width)


def _banner_row(title, subtitle):
    """Return a row with a centred title/subtitle pair inside a light card."""
    return dbc.Row([
        dbc.Card([
            dbc.CardBody([
                html.Div([
                    html.H1(title),
                    html.H3(subtitle),
                ], style={'textAlign': 'center'})
            ], style={'textLeft': 'center'})
        ], style={'height': '10vh'}, color="light", outline=True),
    ])


# Summary figures shown next to the table. The matches are counted by summing
# the boolean column, which is well defined whether all, some or none of the
# rows match (a positional or label lookup on value_counts() is not).
_common_variants = len(final_df)
_matching_variants = int(final_df["Matches"].sum())

_results_table = dash_table.DataTable(
    id='datatable-interactivity',
    columns=[
        {"name": i, "id": i, "deletable": True, "selectable": True, "hideable": True}
        if i == "iso_alpha3" or i == "year" or i == "id"
        else {"name": i, "id": i, "deletable": True, "selectable": True}
        for i in final_df.columns
    ],
    data=final_df.to_dict('records'),  # the contents of the table
    filter_action="native",     # allow filtering of data by user ('native') or not ('none')
    sort_action="native",       # enables data to be sorted per-column by user or not ('none')
    sort_mode="single",         # sort across 'multi' or 'single' columns
    row_deletable=True,         # choose if user can delete a row (True) or not (False)
    selected_rows=[],           # indices of rows that user selects
    page_action="native",       # all data is passed to the table up-front or not ('none')
    page_current=0,             # page number that user is on
    # NOTE(review): minWidth is larger than maxWidth/width - confirm the
    # intended cell width.
    style_cell={                # ensure adequate header width when text is shorter than cell's text
        'minWidth': 200, 'maxWidth': 95, 'width': 95
    },
    style_data_conditional=[
        {
            'if': {'row_index': 'odd'},
            'backgroundColor': 'rgb(248, 248, 248)'
        }
    ],
    style_header={
        'backgroundColor': 'rgb(230, 230, 230)',
        'fontWeight': 'bold'
    },
)

_summary_card = dbc.Card([
    dbc.CardBody([
        html.Div([
            html.H4("Number of common variants:", style={'color': 'black', 'fontSize': 15}),
            html.H4(_common_variants),
            html.H4("Number of matches:", style={'color': 'black', 'fontSize': 15}),
            html.H4(_matching_variants),
        ], style={'textAlign': 'left'})
    ])
], style={'height': '8vh'}, color="light", outline=True)

app.layout = dbc.Container([
    # Logos, pushed to opposite sides of the page.
    dbc.Row([
        _logo_column('./assets/wrgllogohighres.jpeg', 3),
        _logo_column('./assets/salisbury-nhs-foundation-trustLeft.png', 2),
    ], justify="between"),
    _banner_row(
        "Report: Do the samples belong to the same patient?",
        "This application takes common variants and compares the GT fields. A relative large number of common variants which GT fields match is indicative that both samples have a close relationship.",
    ),
    # Empty banner used as vertical spacing.
    _banner_row("", ""),
    # Results table on the left, summary figures on the right.
    dbc.Row([
        dbc.Col([_results_table]),
        dbc.Col([_summary_card]),
    ]),
])


if __name__ == '__main__':
    app.run_server(debug=False)


