In [1]:
#This code determines the frequency of pathogenic or LOF PRKN mutations in from SNPs in WES and CNVs from microarray

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
SNP.drop(columns = "number_of_var", inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/todup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

#Read in key to annotate
WESkey = pd.read_csv('/path/to/WESKey.csv', na_values='NaN', header=0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

df

dfcol = df.columns
dfcol = dfcol[1:]

COLUMN_NAMES = ['ID','AA_N','AA_f','AB_N','AB_f','BB_N','BB_f','ABorBB_N','ABorBB_f','Allelic_Frequency']
results = pd.DataFrame(columns=COLUMN_NAMES)

for x in dfcol:

    test = df[x].value_counts()
    
    testlist = []
    
    testlist.append(x)

    try:
        a1 = test[0]
        testlist.append(a1)
        
    except:
        a1 = 0
        testlist.append(a1)
    
    try:
        a = test[0]/test.sum()
        testlist.append(a)
        
    except:
        a = 0
        testlist.append(a)

    try:
        b1 =  test[1]
        testlist.append(b1)
            
    except:
        b1 =  0
        testlist.append(b1)
        
    try:
        b =  test[1]/test.sum()
        testlist.append(b)
            
    except:
        b =  0
        testlist.append(b)

    try:
        c1 = test[2] 
        testlist.append(c1)
            
    except:
        c1 = 0
        testlist.append(c1)
                
    try:
        c = test[2]/test.sum() 
        testlist.append(c)
            
    except:
        c = 0
        testlist.append(c)

    try:
         testlist.append((b1+c1))
            
    except:
         testlist.append(0)

    try:
         testlist.append((b+c))
            
    except:
         testlist.append(0)
    
    try:
         testlist.append(((b/2)+c))
            
    except:
         testlist.append(0) 
    
    results_length = len(results)
    
    try:
        results.loc[results_length] = testlist
    except:
        pass
    
#Annotate the results table
results = pd.merge(results, WESkey, on ='ID', how ='left') 

#Write results to file
results.to_csv('/path/to/WESMutation_Frequency_Table.csv')
results


Unnamed: 0,ID,AA_N,AA_f,AB_N,AB_f,BB_N,BB_f,ABorBB_N,ABorBB_f,Allelic_Frequency,...,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13,Otherinfo14,Otherinfo15
0,chr6:161350101:A:G_G,200596,0.999975,5,2.5e-05,0,0.0,5,2.5e-05,1.2e-05,...,chr6_161350101_A_G,A,G,49.0,.,AF=1.2e-05;AQ=49,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
1,chr6:161350163:C:T_T,200628,0.99999,2,1e-05,0,0.0,2,1e-05,5e-06,...,chr6_161350163_C_T,C,T,45.0,.,AF=5e-06;AQ=45,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
2,chr6:161350208:C:T_T,200497,0.999377,125,0.000623,0,0.0,125,0.000623,0.000312,...,chr6_161350208_C_T,C,T,47.0,.,AF=0.000314;AQ=47,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
3,chr6:161360089:AT:A_A,199266,0.99997,6,3e-05,0,0.0,6,3e-05,1.5e-05,...,chr6_161360089_AT_A,AT,A,38.0,.,AF=5e-06;AQ=38,GT:DP:AD:GQ:PL:RNC,"./.:14:14,0:0:0,42,419:II","0/0:16:16,0:48:0,48,479:..","0/0:14:14,0:42:0,42,419:.."
4,chr6:161360132:GT:G_G,200627,0.99999,2,1e-05,0,0.0,2,1e-05,5e-06,...,chr6_161360132_GT_G,GT,G,49.0,.,AF=5e-06;AQ=49,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,57,569:..","0/0:16:16,0:48:0,48,479:..","0/0:16:16,0:48:0,81,809:.."
5,chr6:161548898:G:A_A,200629,0.999995,1,5e-06,0,0.0,1,5e-06,2e-06,...,chr6_161548898_G_A,G,A,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
6,chr6:161548965:GA:G_G,200627,0.99999,2,1e-05,0,0.0,2,1e-05,5e-06,...,chr6_161548965_GA_G,GA,G,39.0,.,AF=5e-06;AQ=23,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
7,chr6:161548979:C:CACCA_CACCA,200628,0.999995,1,5e-06,0,0.0,1,5e-06,2e-06,...,chr6_161548979_C_CACCA,C,CACCA,44.0,.,AF=2e-06;AQ=44,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
8,chr6:161569397:CA:C_C,200589,0.999995,1,5e-06,0,0.0,1,5e-06,2e-06,...,chr6_161569397_CA_C,CA,C,40.0,.,AF=2e-06;AQ=37,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,72,719:..","0/0:20:20,0:50:0,60,599:..","0/0:16:16,0:48:0,48,479:.."
9,chr6:161785793:C:G_G,200629,0.999995,1,5e-06,0,0.0,1,5e-06,2e-06,...,chr6_161785793_C_G,C,G,37.0,.,AF=2e-06;AQ=37,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,66,659:..","0/0:20:20,0:50:0,69,689:..","0/0:16:16,0:48:0,48,479:.."


In [234]:
#This code reports the frequency of mutations in WES and microarray dropping samples in which R275W or T240M was not detected by microarray.
#Also, reports the number of calls made by microarray that were not detected by WES, which are assumed to be erroneous calls using WES as gold standard.

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
SNP.drop(columns = "number_of_var", inplace=True)
SNP.rename(columns={"chr6:161785820:G:A_A": "p.R275W", "chr6:161973317:G:A_A": "p.T240M", "chr6:161785885:C:T_T":"p.C253Y", "chr6:161973403:T:A_A":"p.K211N"}, inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in SNP2 csv file
SNP2 = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'MicroR275W', 'C253Y', 'MicroT240M', 'K211N'])
SNP2.drop(columns=['FID','PAT','MAT','PHENOTYPE','C253Y','K211N','SEX'], axis=1, inplace=True)
SNP2['microSNPTotal'] = SNP2.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/New_WES_Analysis/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, SNP2, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

#Read in key to annotate
WESkey = pd.read_csv('/path/to/WESKey.csv', na_values='NaN', header=0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

df
df.dropna(inplace=True)

dfcol = df.columns
dfcol = dfcol[1:]

COLUMN_NAMES = ['ID','AA_N','AA_f','AB_N','AB_f','BB_N','BB_f','ABorBB_N','ABorBB_f','Allelic_Frequency']
results = pd.DataFrame(columns=COLUMN_NAMES)

for x in dfcol:

    test = df[x].value_counts()
    
    testlist = []
    
    testlist.append(x)

    try:
        a1 = test[0]
        testlist.append(a1)
        
    except:
        a1 = 0
        testlist.append(a1)
    
    try:
        a = test[0]/test.sum()
        testlist.append(a)
        
    except:
        a = 0
        testlist.append(a)

    try:
        b1 =  test[1]
        testlist.append(b1)
            
    except:
        b1 =  0
        testlist.append(b1)
        
    try:
        b =  test[1]/test.sum()
        testlist.append(b)
            
    except:
        b =  0
        testlist.append(b)

    try:
        c1 = test[2] 
        testlist.append(c1)
            
    except:
        c1 = 0
        testlist.append(c1)
                
    try:
        c = test[2]/test.sum() 
        testlist.append(c)
            
    except:
        c = 0
        testlist.append(c)

    try:
         testlist.append((b1+c1))
            
    except:
         testlist.append(0)

    try:
         testlist.append((b+c))
            
    except:
         testlist.append(0)
    
    try:
         testlist.append(((b/2)+c))
            
    except:
         testlist.append(0) 
    
    results_length = len(results)
    
    try:
        results.loc[results_length] = testlist
    except:
        pass
    
#Annotate the results table
results = pd.merge(results, WESkey, on ='ID', how ='left') 

#Determine number of 
df
df['FalseDet?'] = (((df['p.R275W'] != df['MicroR275W']) & (df['p.R275W'] == 0)) | ((df['p.T240M'] != df['MicroT240M']) & (df['p.T240M'] == 0)))
FalseDet = df.loc[df['FalseDet?']==True]
print('Number of SNPs misgenotyped as positive based on discordance with WES:', FalseDet['FalseDet?'].count())

results

Number of SNPs misgenotyped as positive based on discordance with WES: 22


Unnamed: 0,ID,AA_N,AA_f,AB_N,AB_f,BB_N,BB_f,ABorBB_N,ABorBB_f,Allelic_Frequency,...,Otherinfo6,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13,Otherinfo14,Otherinfo15
0,chr6:161350101:A:G_G,190038,0.999979,4,2.1e-05,0,0.0,4,2.1e-05,1.1e-05,...,chr6_161350101_A_G,A,G,49.0,.,AF=1.2e-05;AQ=49,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
1,chr6:161350163:C:T_T,190040,0.999989,2,1.1e-05,0,0.0,2,1.1e-05,5e-06,...,chr6_161350163_C_T,C,T,45.0,.,AF=5e-06;AQ=45,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
2,chr6:161350208:C:T_T,189929,0.999405,113,0.000595,0,0.0,113,0.000595,0.000297,...,chr6_161350208_C_T,C,T,47.0,.,AF=0.000314;AQ=47,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
3,chr6:161360089:AT:A_A,190036,0.999968,6,3.2e-05,0,0.0,6,3.2e-05,1.6e-05,...,chr6_161360089_AT_A,AT,A,38.0,.,AF=5e-06;AQ=38,GT:DP:AD:GQ:PL:RNC,"./.:14:14,0:0:0,42,419:II","0/0:16:16,0:48:0,48,479:..","0/0:14:14,0:42:0,42,419:.."
4,chr6:161360132:GT:G_G,190040,0.999989,2,1.1e-05,0,0.0,2,1.1e-05,5e-06,...,chr6_161360132_GT_G,GT,G,49.0,.,AF=5e-06;AQ=49,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,57,569:..","0/0:16:16,0:48:0,48,479:..","0/0:16:16,0:48:0,81,809:.."
5,chr6:161548898:G:A_A,190041,0.999995,1,5e-06,0,0.0,1,5e-06,3e-06,...,chr6_161548898_G_A,G,A,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
6,chr6:161548965:GA:G_G,190040,0.999989,2,1.1e-05,0,0.0,2,1.1e-05,5e-06,...,chr6_161548965_GA_G,GA,G,39.0,.,AF=5e-06;AQ=23,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
7,chr6:161548979:C:CACCA_CACCA,190041,0.999995,1,5e-06,0,0.0,1,5e-06,3e-06,...,chr6_161548979_C_CACCA,C,CACCA,44.0,.,AF=2e-06;AQ=44,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,87,869:..","0/0:16:16,0:48:0,54,539:..","0/0:16:16,0:48:0,60,599:.."
8,chr6:161569397:CA:C_C,190041,0.999995,1,5e-06,0,0.0,1,5e-06,3e-06,...,chr6_161569397_CA_C,CA,C,40.0,.,AF=2e-06;AQ=37,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,72,719:..","0/0:20:20,0:50:0,60,599:..","0/0:16:16,0:48:0,48,479:.."
9,chr6:161785793:C:G_G,190041,0.999995,1,5e-06,0,0.0,1,5e-06,3e-06,...,chr6_161785793_C_G,C,G,37.0,.,AF=2e-06;AQ=37,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,66,659:..","0/0:20:20,0:50:0,69,689:..","0/0:16:16,0:48:0,48,479:.."


In [80]:
#This code determines the frequency of all nonsyn PRKN mutations in WES 
import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_AllNonsynMissense_INDI.csv', na_values='NaN', header=0)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)


#Read in key to annotate
WESkey = pd.read_csv('/path/to/New_WES_Analysis/NewWESKey.csv', na_values='NaN', header=0)

df = SNP

dfcol = df.columns
dfcol = dfcol[1:]

COLUMN_NAMES = ['ID','AA_N','AA_f','AB_N','AB_f','BB_N','BB_f','ABorBB_N','ABorBB_f','Allelic_Frequency']
results = pd.DataFrame(columns=COLUMN_NAMES)

for x in dfcol:

    test = df[x].value_counts()
    
    testlist = []
    
    testlist.append(x)

    try:
        a1 = test[0]
        testlist.append(a1)
        
    except:
        a1 = 0
        testlist.append(a1)
    
    try:
        a = test[0]/test.sum()
        testlist.append(a)
        
    except:
        a = 0
        testlist.append(a)

    try:
        b1 =  test[1]
        testlist.append(b1)
            
    except:
        b1 =  0
        testlist.append(b1)
        
    try:
        b =  test[1]/test.sum()
        testlist.append(b)
            
    except:
        b =  0
        testlist.append(b)

    try:
        c1 = test[2] 
        testlist.append(c1)
            
    except:
        c1 = 0
        testlist.append(c1)
                
    try:
        c = test[2]/test.sum() 
        testlist.append(c)
            
    except:
        c = 0
        testlist.append(c)

    try:
         testlist.append((b1+c1))
            
    except:
         testlist.append(0)

    try:
         testlist.append((b+c))
            
    except:
         testlist.append(0)
    
    try:
         testlist.append(((b/2)+c))
            
    except:
         testlist.append(0) 
    
    results_length = len(results)
    
    try:
        results.loc[results_length] = testlist
    except:
        pass
    
#Annotate the results table
results['ID'] = results['ID'].str.replace(':','_')
results['ID'] = results['ID'].str[:-2]
results = pd.merge(results, WESkey, on ='ID', how ='left') 

#Write results to file
results.to_csv('/path/to/WESnonSynVariant_Frequency_Table.csv')
results


Unnamed: 0,ID,AA_N,AA_f,AB_N,AB_f,BB_N,BB_f,ABorBB_N,ABorBB_f,Allelic_Frequency,...,Otherinfo5,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13,Otherinfo14,Otherinfo15
0,chr6_161350104_C_T,200604,0.999995,1,0.000005,0,0,1,0.000005,0.000002,...,161350104.0,C,T,37.0,.,AF=2e-06;AQ=37,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
1,chr6_161350115_T_C,200628,0.999990,2,0.000010,0,0,2,0.000010,0.000005,...,161350115.0,T,C,42.0,.,AF=2e-06;AQ=42,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
2,chr6_161350116_G_T,200628,0.999995,1,0.000005,0,0,1,0.000005,0.000002,...,161350116.0,G,T,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
3,chr6_161350119_C_G,200627,0.999990,2,0.000010,0,0,2,0.000010,0.000005,...,161350119.0,C,G,41.0,.,AF=5e-06;AQ=41,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:34:34,0:42:0,42,899:.."
4,chr6_161350121_C_T,200628,0.999995,1,0.000005,0,0,1,0.000005,0.000002,...,161350121.0,C,T,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,chr6_162443438_C_A,200629,0.999995,1,0.000005,0,0,1,0.000005,0.000002,...,162443438.0,C,A,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,75,989:..","0/0:16:16,0:48:0,60,599:..","0/0:17:17,0:50:0,84,839:.."
247,chr6_162443458_T_A,200624,0.999975,5,0.000025,0,0,5,0.000025,0.000012,...,162443458.0,T,A,47.0,.,AF=1.2e-05;AQ=47,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,75,989:..","0/0:16:16,0:48:0,60,599:..","0/0:17:17,0:50:0,84,839:.."
248,chr6_162727663_T_C,200621,0.999970,6,0.000030,0,0,6,0.000030,0.000015,...,162727663.0,T,C,43.0,.,AF=1.5e-05;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,54,539:..","0/0:16:16,0:48:0,66,659:..","0/0:20:20,0:50:0,66,659:.."
249,chr6_162727665_T_C,200624,0.999990,2,0.000010,0,0,2,0.000010,0.000005,...,162727665.0,T,C,45.0,.,AF=5e-06;AQ=45,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,54,539:..","0/0:16:16,0:48:0,66,659:..","0/0:20:20,0:50:0,66,659:.."


In [106]:
#This code determines the frequency of nonsyn PRKN mutations (excluding those with annotation other than VUS or not annotated in ClinVar) in WES 
import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_AllNonsynMissense_INDI.csv', na_values='NaN', header=0)

tSNP = SNP.T

tSNP.columns = tSNP.iloc[0]
tSNP = tSNP.drop(['IID'])

df = tSNP

dfcol = df.columns

df = df.reset_index()

df = df.rename(columns={'index': 'ID'})

df['ID'] = df['ID'].str.replace(':','_')

df['ID'] = df['ID'].str[:-2]

#Read in key to annotate
WESkey = pd.read_csv('/path/to/NewWESKey.csv', na_values='NaN', header=0)
df = pd.merge(df, WESkey, on ='ID', how ='left') 

df['CLNSIG'].value_counts()
df = df.loc[(df['CLNSIG'] == '.') | (df['CLNSIG'] == 'Uncertain_significance')]

df = df.T
new_header = df.iloc[0] #grab the first row for the header
df = df[1:] #take the data less the header row
df.columns = new_header #set the header row as the df header

df['WESTotal'] = df.sum(axis=1)


dfcol = df.columns
dfcol = dfcol[1:]

COLUMN_NAMES = ['ID','AA_N','AA_f','AB_N','AB_f','BB_N','BB_f','ABorBB_N','ABorBB_f','Allelic_Frequency']
results = pd.DataFrame(columns=COLUMN_NAMES)

for x in dfcol:

    test = df[x].value_counts()
    
    testlist = []
    
    testlist.append(x)

    try:
        a1 = test[0]
        testlist.append(a1)
        
    except:
        a1 = 0
        testlist.append(a1)
    
    try:
        a = test[0]/test.sum()
        testlist.append(a)
        
    except:
        a = 0
        testlist.append(a)

    try:
        b1 =  test[1]
        testlist.append(b1)
            
    except:
        b1 =  0
        testlist.append(b1)
        
    try:
        b =  test[1]/test.sum()
        testlist.append(b)
            
    except:
        b =  0
        testlist.append(b)

    try:
        c1 = test[2] 
        testlist.append(c1)
            
    except:
        c1 = 0
        testlist.append(c1)
                
    try:
        c = test[2]/test.sum() 
        testlist.append(c)
            
    except:
        c = 0
        testlist.append(c)

    try:
         testlist.append((b1+c1))
            
    except:
         testlist.append(0)

    try:
         testlist.append((b+c))
            
    except:
         testlist.append(0)
    
    try:
         testlist.append(((b/2)+c))
            
    except:
         testlist.append(0) 
    
    results_length = len(results)
    
    try:
        results.loc[results_length] = testlist
    except:
        pass

results = pd.merge(results, WESkey, on ='ID', how ='left') 

#Write results to file
results.to_csv('/path/to/WESnonSynVariantVUSonly_Frequency_Table.csv')


SNP2 = df 

results

Unnamed: 0,ID,AA_N,AA_f,AB_N,AB_f,BB_N,BB_f,ABorBB_N,ABorBB_f,Allelic_Frequency,...,Otherinfo5,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,Otherinfo12,Otherinfo13,Otherinfo14,Otherinfo15
0,chr6_161350115_T_C,200628,0.999841,2,0.000010,3,0.000015,5,0.000025,0.000020,...,161350115.0,T,C,42.0,.,AF=2e-06;AQ=42,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
1,chr6_161350116_G_T,200628,0.999846,1,0.000005,3,0.000015,4,0.000020,0.000017,...,161350116.0,G,T,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:26:26,0:50:0,78,779:.."
2,chr6_161350119_C_G,200627,0.999841,2,0.000010,3,0.000015,5,0.000025,0.000020,...,161350119.0,C,G,41.0,.,AF=5e-06;AQ=41,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:34:34,0:42:0,42,899:.."
3,chr6_161350121_C_T,200628,0.999846,1,0.000005,3,0.000015,4,0.000020,0.000017,...,161350121.0,C,T,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
4,chr6_161350125_T_G,200572,0.999576,55,0.000274,3,0.000015,58,0.000289,0.000152,...,161350125.0,T,G,49.0,.,AF=0.000135;AQ=49,GT:DP:AD:GQ:PL:RNC,"0/0:17:17,0:50:0,51,509:..","0/0:16:16,0:48:0,48,479:..","0/0:18:18,0:50:0,108,1079:.."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,chr6_162443438_C_A,200629,0.999846,1,0.000005,3,0.000015,4,0.000020,0.000017,...,162443438.0,C,A,43.0,.,AF=2e-06;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,75,989:..","0/0:16:16,0:48:0,60,599:..","0/0:17:17,0:50:0,84,839:.."
214,chr6_162443458_T_A,200624,0.999826,5,0.000025,5,0.000025,10,0.000050,0.000037,...,162443458.0,T,A,47.0,.,AF=1.2e-05;AQ=47,GT:DP:AD:GQ:PL:RNC,"0/0:16:16,0:48:0,75,989:..","0/0:16:16,0:48:0,60,599:..","0/0:17:17,0:50:0,84,839:.."
215,chr6_162727663_T_C,200621,0.999821,6,0.000030,6,0.000030,12,0.000060,0.000045,...,162727663.0,T,C,43.0,.,AF=1.5e-05;AQ=43,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,54,539:..","0/0:16:16,0:48:0,66,659:..","0/0:20:20,0:50:0,66,659:.."
216,chr6_162727665_T_C,200624,0.999841,2,0.000010,2,0.000010,4,0.000020,0.000015,...,162727665.0,T,C,45.0,.,AF=5e-06;AQ=45,GT:DP:AD:GQ:PL:RNC,"0/0:18:18,0:50:0,54,539:..","0/0:16:16,0:48:0,66,659:..","0/0:20:20,0:50:0,66,659:.."


In [199]:
#This code calculates the odds ratio and p-value for a single pathogenic or LOF PRKN mutations (detected by WES or microarray) in PD cases 


import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
SNP.drop(columns = "number_of_var", inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

#Read in PDStatus csv file
PDStatus = pd.read_csv('/path/to/PD_April2021_with_covariates.csv', na_values='NaN', header=0)
PDStatus['PD'] = 1

FPDStatus = pd.read_csv('/path/to/ALL_indi_with_PD_parent_UNIQUE.csv', na_values='NaN', header=0)
FPDStatus['FPD'] = 1

#Merge SNP and DEL
df2 = pd.merge(df, PDStatus, on ='IID', how ='left') 
df2 = pd.merge(df2, FPDStatus, on = 'IID', how='left')

df2['PD'] = df2['PD'].fillna(0)
df2['FPD'] = df2['FPD'].fillna(0)

df2['PD'].value_counts()

#Limit to cases with only 0 or 1 detected PRKN path or LOF variant
df2 = df2.loc[df2['MutTotal'] <= 1.0]

#PD in WESTotal
PDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 1)]
PDWESTotalc = PDWESTotal['PD'].count()

NoPDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 0)]
NoPDWESTotalc = NoPDWESTotal['PD'].count()

PDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 1)]
PDNoWESTotalc = PDNoWESTotal['PD'].count()

NoPDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoWESTotalc = NoPDNoWESTotal['PD'].count()

testWESTotal, pvalueWESTotal = stats.fisher_exact([[PDWESTotalc, NoPDWESTotalc], [PDNoWESTotalc, NoPDNoWESTotalc]])


#PD in MutTotal
PDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 1)]
PDMutTotalc = PDMutTotal['PD'].count()

NoPDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 0)]
NoPDMutTotalc = NoPDMutTotal['PD'].count()

PDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 1)]
PDNoMutTotalc = PDNoMutTotal['PD'].count()

NoPDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoMutTotalc = NoPDNoMutTotal['PD'].count()

testMutTotal, pvalueMutTotal = stats.fisher_exact([[PDMutTotalc, NoPDMutTotalc], [PDNoMutTotalc, NoPDNoMutTotalc]])


#Make pandas dataframe with result
results = pd.DataFrame(np.array([['WES_SNP', PDWESTotalc, NoPDWESTotalc, PDNoWESTotalc, NoPDNoWESTotalc, testWESTotal, pvalueWESTotal], ['AllMut', PDMutTotalc, NoPDMutTotalc, PDNoMutTotalc, NoPDNoMutTotalc, testMutTotal, pvalueMutTotal]]), columns=['Mutation','Mut_Cases', 'Mut_CTRLs', 'NoMut_Cases','NoMut_CTRLs','Odds','p-value'])
results.set_index('Mutation', inplace=True)

#Write results to file
results.to_csv('/path/to/WESMutation_PD_Odds_Table.csv')
results

Unnamed: 0_level_0,Mut_Cases,Mut_CTRLs,NoMut_Cases,NoMut_CTRLs,Odds,p-value
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WES_SNP,15,2517,1359,196715,0.8626329919023049,0.7149317662747136
AllMut,23,3648,1351,195584,0.9127481917228304,0.7617017238515316


In [197]:
#This code calculates the odds ratio and p-value for a single pathogenic or LOF PRKN mutations (detected by WES or microarray) in those with parent with PD

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
SNP.drop(columns = "number_of_var", inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

#Read in PDStatus csv file
PDStatus = pd.read_csv('/path/to/PD_April2021_with_covariates.csv', na_values='NaN', header=0)
PDStatus['PD'] = 1

FPDStatus = pd.read_csv('/path/to/ALL_indi_with_PD_parent_UNIQUE.csv', na_values='NaN', header=0)
FPDStatus['FPD'] = 1

#Merge SNP and DEL
df2 = pd.merge(df, PDStatus, on ='IID', how ='left') 
df2 = pd.merge(df2, FPDStatus, on = 'IID', how='left')

df2['PD'] = df2['PD'].fillna(0)
df2['FPD'] = df2['FPD'].fillna(0)

df2['PD'].value_counts()

#Limit to cases with only 0 or 1 detected PRKN path or LOF variant
df2 = df2.loc[df2['MutTotal'] <= 1.0]

#PD in WESTotal
PDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['FPD'] == 1)]
PDWESTotalc = PDWESTotal['FPD'].count()

NoPDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['FPD'] == 0)]
NoPDWESTotalc = NoPDWESTotal['FPD'].count()

PDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['FPD'] == 1)]
PDNoWESTotalc = PDNoWESTotal['FPD'].count()

NoPDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['FPD'] == 0)]
NoPDNoWESTotalc = NoPDNoWESTotal['FPD'].count()

testWESTotal, pvalueWESTotal = stats.fisher_exact([[PDWESTotalc, NoPDWESTotalc], [PDNoWESTotalc, NoPDNoWESTotalc]])


#PD in MutTotal
PDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['FPD'] == 1)]
PDMutTotalc = PDMutTotal['FPD'].count()

NoPDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['FPD'] == 0)]
NoPDMutTotalc = NoPDMutTotal['FPD'].count()

PDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['FPD'] == 1)]
PDNoMutTotalc = PDNoMutTotal['FPD'].count()

NoPDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['FPD'] == 0)]
NoPDNoMutTotalc = NoPDNoMutTotal['FPD'].count()

testMutTotal, pvalueMutTotal = stats.fisher_exact([[PDMutTotalc, NoPDMutTotalc], [PDNoMutTotalc, NoPDNoMutTotalc]])


#Make pandas dataframe with result
results = pd.DataFrame(np.array([['WES_SNP', PDWESTotalc, NoPDWESTotalc, PDNoWESTotalc, NoPDNoWESTotalc, testWESTotal, pvalueWESTotal], ['AllMut', PDMutTotalc, NoPDMutTotalc, PDNoMutTotalc, NoPDNoMutTotalc, testMutTotal, pvalueMutTotal]]), columns=['Mutation','Mut_Cases', 'Mut_CTRLs', 'NoMut_Cases','NoMut_CTRLs','Odds','p-value'])
results.set_index('Mutation', inplace=True)

#Write results to file
results.to_csv('/path/to/WESMutation_FPD_Odds_Table.csv')
results

Unnamed: 0_level_0,Mut_Cases,Mut_CTRLs,NoMut_Cases,NoMut_CTRLs,Odds,p-value
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WES_SNP,104,2428,7390,190684,1.105234599496626,0.3161780771194071
AllMut,152,3519,7342,189593,1.1154041077858152,0.1876711012182919


In [271]:
#This cell writes to .csv file table with only biallelic PRKN patients

results = df2.loc[(df2['MutTotal'] == 2)]

#Write results to file
results.to_csv('/path/to/WESMutation_Biallelic.csv')

In [188]:
#This code checks concordance between WES and microarray calls for R275W, T240M, C253Y, and K211N variants 

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
WESSNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
WESSNP.drop(columns = "number_of_var", inplace=True)
WESSNP.rename(columns={"chr6:161785820:G:A_A": "p.R275W", "chr6:161973317:G:A_A": "p.T240M", "chr6:161785885:C:T_T":"p.C253Y", "chr6:161973403:T:A_A":"p.K211N"}, inplace=True)
WESSNP = WESSNP[['IID','p.R275W','p.T240M','p.C253Y','p.K211N']]

#Read in SNP from microarray
SNP = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'R275W', 'C253Y', 'T240M', 'K211N'])
SNP.drop(columns=['FID','PAT','MAT','PHENOTYPE', 'SEX'], axis=1, inplace=True)

df = pd.merge(WESSNP, SNP, on ='IID', how ='left') 

df['R275W_match?'] = df['p.R275W'] == df['R275W']
df['T240M_match?'] = df['p.T240M'] == df['T240M']
df['C253Y_match?'] = df['p.C253Y'] == df['C253Y']
df['K211N_match?'] = df['p.K211N'] == df['K211N']

df.dropna(inplace=True)

mR275Wmismtach = df.loc[((df['R275W'] == 1) & (df['R275W_match?'] == False))]
mR275Wmatch = df.loc[((df['R275W'] == 1) & (df['R275W_match?'] == True))]

mT240Mmismtach = df.loc[((df['T240M'] == 1) & (df['T240M_match?'] == False))]
mT240Mmatch = df.loc[((df['T240M'] == 1) & (df['T240M_match?'] == True))]

mC253Ymismtach = df.loc[((df['C253Y'] == 1) & (df['C253Y_match?'] == False))]
mC253Ymatch = df.loc[((df['C253Y'] == 1) & (df['C253Y_match?'] == True))]

mK211Nmismtach = df.loc[((df['K211N'] == 1) & (df['K211N_match?'] == False))]
mK211Nmatch = df.loc[((df['K211N'] == 1) & (df['K211N_match?'] == True))]

print('C253Y-', 'mismatch:', mC253Ymismtach['C253Y_match?'].count(), ', match:', mC253Ymatch['C253Y_match?'].count())
print('C253Y discordant rate:', mC253Ymismtach['C253Y_match?'].count()/(mC253Ymismtach['C253Y_match?'].count() + mC253Ymatch['C253Y_match?'].count()))
print('C253Y concordant rate:', mC253Ymatch['C253Y_match?'].count()/(mC253Ymismtach['C253Y_match?'].count() + mC253Ymatch['C253Y_match?'].count()))

print('K211N-', 'mismatch:', mK211Nmismtach['K211N_match?'].count(), ', match:', mK211Nmatch['K211N_match?'].count())
print('K211N discordant rate:', mK211Nmismtach['K211N_match?'].count()/(mK211Nmismtach['K211N_match?'].count() + mK211Nmatch['K211N_match?'].count()))
print('K211N concordant rate:', mK211Nmatch['K211N_match?'].count()/(mK211Nmismtach['K211N_match?'].count() + mK211Nmatch['K211N_match?'].count()))

print('R275W-', 'mismatch:', mR275Wmismtach['R275W_match?'].count(), ', match:', mR275Wmatch['R275W_match?'].count())
print('R275W discordant rate:', mR275Wmismtach['R275W_match?'].count()/(mR275Wmismtach['R275W_match?'].count() + mR275Wmatch['R275W_match?'].count()))
print('R275W concordant rate:', mR275Wmatch['R275W_match?'].count()/(mR275Wmismtach['R275W_match?'].count() + mR275Wmatch['R275W_match?'].count()))

print('T240M-', 'mismatch:', mT240MWmismtach['R275W_match?'].count(), ', match:', mT240Mmatch['R275W_match?'].count())
print('T240M discordant rate:', mT240Mmismtach['T240M_match?'].count()/(mT240Mmismtach['T240M_match?'].count() + mT240Mmatch['T240M_match?'].count()))
print('T240M concordant rate:', mT240Mmatch['T240M_match?'].count()/(mT240Mmismtach['T240M_match?'].count() + mT240Mmatch['T240M_match?'].count()))

C253Y- mismatch: 43 , match: 2
C253Y discordant rate: 0.9555555555555556
C253Y concordant rate: 0.044444444444444446
K211N- mismatch: 3 , match: 7
K211N discordant rate: 0.3
K211N concordant rate: 0.7
R275W- mismatch: 21 , match: 1431
R275W discordant rate: 0.014462809917355372
R275W concordant rate: 0.9855371900826446
T240M- mismatch: 0 , match: 74
T240M discordant rate: 0.0
T240M concordant rate: 1.0


In [328]:
#This code determines the frequency of pathogenic or LOF PRKN mutations in from SNPs in microarray and CNVs from microarray

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'R275W', 'C253Y', 'T240M', 'K211N'])
SNP.drop(columns=['FID','PAT','MAT','PHENOTYPE','C253Y','K211N','SEX'], axis=1, inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

df

dfcol = df.columns
dfcol = dfcol[1:]

COLUMN_NAMES = ['ID','AA_N','AA_f','AB_N','AB_f','BB_N','BB_f','ABorBB_N','ABorBB_f','Allelic_Frequency']
results = pd.DataFrame(columns=COLUMN_NAMES)

for x in dfcol:

    test = df[x].value_counts()
    
    testlist = []
    
    testlist.append(x)

    try:
        a1 = test[0]
        testlist.append(a1)
        
    except:
        a1 = 0
        testlist.append(a1)
    
    try:
        a = test[0]/test.sum()
        testlist.append(a)
        
    except:
        a = 0
        testlist.append(a)

    try:
        b1 =  test[1]
        testlist.append(b1)
            
    except:
        b1 =  0
        testlist.append(b1)
        
    try:
        b =  test[1]/test.sum()
        testlist.append(b)
            
    except:
        b =  0
        testlist.append(b)

    try:
        c1 = test[2] 
        testlist.append(c1)
            
    except:
        c1 = 0
        testlist.append(c1)
                
    try:
        c = test[2]/test.sum() 
        testlist.append(c)
            
    except:
        c = 0
        testlist.append(c)

    try:
         testlist.append((b1+c1))
            
    except:
         testlist.append(0)

    try:
         testlist.append((b+c))
            
    except:
         testlist.append(0)
    
    try:
         testlist.append(((b/2)+c))
            
    except:
         testlist.append(0) 
    
    results_length = len(results)
    
    try:
        results.loc[results_length] = testlist
    except:
        pass

    
#Write results to file
results.to_csv('/path/to/Microarray_Frequency_Table.csv')
results

Unnamed: 0,ID,AA_N,AA_f,AB_N,AB_f,BB_N,BB_f,ABorBB_N,ABorBB_f,Allelic_Frequency
0,R275W,479457,0.9922,3759,0.007779,10,2.1e-05,3769,0.0078,0.00391
1,T240M,487615,0.999565,212,0.000435,0,0.0,212,0.000435,0.000217
2,WESTotal,484397,0.991851,3969,0.008127,11,2.3e-05,3980,0.008149,0.004086
3,DEL1,486929,0.997035,1448,0.002965,0,0.0,1448,0.002965,0.001482
4,DEL1_Exon,486929,0.997035,533,0.001091,225,0.000461,758,0.001552,0.001006
5,DUP1,487144,0.997475,1233,0.002525,0,0.0,1233,0.002525,0.001262
6,DUP1_Exon,487144,0.997475,783,0.001603,142,0.000291,925,0.001894,0.001092
7,2DELDUP,488371,0.999988,0,0.0,6,1.2e-05,6,1.2e-05,1.2e-05
8,CNVTotal,485690,0.994498,2681,0.00549,6,1.2e-05,2687,0.005502,0.002757
9,MutTotal,481729,0.986388,6612,0.013539,36,7.4e-05,6648,0.013612,0.006843


In [194]:
#This code calculates the odds ratio and p-value for a single pathogenic or LOF PRKN mutations (detected by microarray only) in PD cases 


import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'R275W', 'C253Y', 'T240M', 'K211N'])
SNP.drop(columns=['FID','PAT','MAT','PHENOTYPE','C253Y','K211N','SEX'], axis=1, inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

#Read in PDStatus csv file
PDStatus = pd.read_csv('/path/to/PD_April2021_with_covariates.csv', na_values='NaN', header=0)
PDStatus['PD'] = 1

FPDStatus = pd.read_csv('/path/to/ALL_indi_with_PD_parent_UNIQUE.csv', na_values='NaN', header=0)
FPDStatus['FPD'] = 1

#Merge SNP and DEL
df2 = pd.merge(df, PDStatus, on ='IID', how ='left') 
df2 = pd.merge(df2, FPDStatus, on = 'IID', how='left')

df2['PD'] = df2['PD'].fillna(0)
df2['FPD'] = df2['FPD'].fillna(0)

df2['PD'].value_counts()

#Limit to cases with only 0 or 1 detected PRKN path or LOF variant
df2 = df2.loc[df2['MutTotal'] <= 1.0]

#PD in T240M
PDT240M = df2.loc[(df2['T240M'] == 1) & (df2['PD'] == 1)]
PDT240Mc = PDT240M['PD'].count()

NoPDT240M = df2.loc[(df2['T240M'] == 1) & (df2['PD'] == 0)]
NoPDT240Mc = NoPDT240M['PD'].count()

PDNoT240M = df2.loc[(df2['T240M'] == 0) & (df2['PD'] == 1)]
PDNoT240Mc = PDNoT240M['PD'].count()

NoPDNoT240M = df2.loc[(df2['T240M'] == 0) & (df2['PD'] == 0)]
NoPDNoT240Mc = NoPDNoT240M['PD'].count()

testT240M, pvalueT240M = stats.fisher_exact([[PDT240Mc, NoPDT240Mc], [PDNoT240Mc, NoPDNoT240Mc]])

#PD in R275W
PDR275W = df2.loc[(df2['R275W'] == 1) & (df2['PD'] == 1)]
PDR275Wc = PDR275W['PD'].count()

NoPDR275W = df2.loc[(df2['R275W'] == 1) & (df2['PD'] == 0)]
NoPDR275Wc = NoPDR275W['PD'].count()

PDNoR275W = df2.loc[(df2['R275W'] == 0) & (df2['PD'] == 1)]
PDNoR275Wc = PDNoR275W['PD'].count()

NoPDNoR275W = df2.loc[(df2['R275W'] == 0) & (df2['PD'] == 0)]
NoPDNoR275Wc = NoPDNoR275W['PD'].count()

testR275W, pvalueR275W = stats.fisher_exact([[PDR275Wc, NoPDR275Wc], [PDNoR275Wc, NoPDNoR275Wc]])

#PD in DEL1
PDDEL1 = df2.loc[(df2['DEL1'] == 1) & (df2['PD'] == 1)]
PDDEL1c = PDDEL1['PD'].count()

NoPDDEL1 = df2.loc[(df2['DEL1'] == 1) & (df2['PD'] == 0)]
NoPDDEL1c = NoPDDEL1['PD'].count()

PDNoDEL1 = df2.loc[(df2['DEL1'] == 0) & (df2['PD'] == 1)]
PDNoDEL1c = PDNoDEL1['PD'].count()

NoPDNoDEL1 = df2.loc[(df2['DEL1'] == 0) & (df2['PD'] == 0)]
NoPDNoDEL1c = NoPDNoDEL1['PD'].count()

testDEL1, pvalueDEL1 = stats.fisher_exact([[PDDEL1c, NoPDDEL1c], [PDNoDEL1c, NoPDNoDEL1c]])

#PD in DUP1
PDDUP1 = df2.loc[(df2['DUP1'] == 1) & (df2['PD'] == 1)]
PDDUP1c = PDDUP1['PD'].count()

NoPDDUP1 = df2.loc[(df2['DUP1'] == 1) & (df2['PD'] == 0)]
NoPDDUP1c = NoPDDUP1['PD'].count()

PDNoDUP1 = df2.loc[(df2['DUP1'] == 0) & (df2['PD'] == 1)]
PDNoDUP1c = PDNoDUP1['PD'].count()

NoPDNoDUP1 = df2.loc[(df2['DUP1'] == 0) & (df2['PD'] == 0)]
NoPDNoDUP1c = NoPDNoDUP1['PD'].count()

testDUP1, pvalueDUP1 = stats.fisher_exact([[PDDUP1c, NoPDDUP1c], [PDNoDUP1c, NoPDNoDUP1c]])

#PD in AnyCNV
PDCNV = df2.loc[((df2['DUP1'] == 1) | (df2['DEL1'] == 1)) & (df2['PD'] == 1)]
PDCNVc = PDCNV['PD'].count()

NoPDCNV = df2.loc[((df2['DUP1'] == 1) | (df2['DEL1'] == 1)) & (df2['PD'] == 0)]
NoPDCNVc = NoPDCNV['PD'].count()

PDNoCNV = df2.loc[((df2['DUP1'] == 0) & (df2['DEL1'] == 0)) & (df2['PD'] == 1)]
PDNoCNVc = PDNoCNV['PD'].count()

NoPDNoCNV = df2.loc[((df2['DUP1'] == 0) & (df2['DEL1'] == 0)) & (df2['PD'] == 0)]
NoPDNoCNVc = NoPDNoCNV['PD'].count()

testCNV, pvalueCNV = stats.fisher_exact([[PDCNVc, NoPDCNVc], [PDNoCNVc, NoPDNoCNVc]])

#PD in WESTotal
PDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 1)]
PDWESTotalc = PDWESTotal['PD'].count()

NoPDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 0)]
NoPDWESTotalc = NoPDWESTotal['PD'].count()

PDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 1)]
PDNoWESTotalc = PDNoWESTotal['PD'].count()

NoPDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoWESTotalc = NoPDNoWESTotal['PD'].count()

testWESTotal, pvalueWESTotal = stats.fisher_exact([[PDWESTotalc, NoPDWESTotalc], [PDNoWESTotalc, NoPDNoWESTotalc]])


#PD in MutTotal
PDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 1)]
PDMutTotalc = PDMutTotal['PD'].count()

NoPDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 0)]
NoPDMutTotalc = NoPDMutTotal['PD'].count()

PDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 1)]
PDNoMutTotalc = PDNoMutTotal['PD'].count()

NoPDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoMutTotalc = NoPDNoMutTotal['PD'].count()

testMutTotal, pvalueMutTotal = stats.fisher_exact([[PDMutTotalc, NoPDMutTotalc], [PDNoMutTotalc, NoPDNoMutTotalc]])


#Make pandas dataframe with result
results = pd.DataFrame(np.array([['T240M', PDT240Mc, NoPDT240Mc, PDNoT240Mc, NoPDNoT240Mc, testT240M, pvalueT240M],['R275W', PDR275Wc, NoPDR275Wc, PDNoR275Wc, NoPDNoR275Wc, testR275W, pvalueR275W], ['MicroarraySNP', PDWESTotalc, NoPDWESTotalc, PDNoWESTotalc, NoPDNoWESTotalc, testWESTotal, pvalueWESTotal], ['DEL1', PDDEL1c, NoPDDEL1c, PDNoDEL1c, NoPDNoDEL1c, testDEL1, pvalueDEL1], ['DUP1', PDDUP1c, NoPDDUP1c, PDNoDUP1c, NoPDNoDUP1c, testDUP1, pvalueDUP1],['CNV', PDCNVc, NoPDCNVc, PDNoCNVc, NoPDNoCNVc, testCNV, pvalueCNV],['AllMut', PDMutTotalc, NoPDMutTotalc, PDNoMutTotalc, NoPDNoMutTotalc, testMutTotal, pvalueMutTotal]]), columns=['Mutation','Mut_Cases', 'Mut_CTRLs', 'NoMut_Cases','NoMut_CTRLs','Odds','p-value'])
results.set_index('Mutation', inplace=True)

#Write results to file
results.to_csv('/path/to/MicroarrayMutation_PD_Odds_Table.csv')
results

Unnamed: 0_level_0,Mut_Cases,Mut_CTRLs,NoMut_Cases,NoMut_CTRLs,Odds,p-value
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T240M,3,208,3455,484125,2.0210049538016253,0.1896202033731787
R275W,34,3705,3389,476062,1.2890882584721786,0.1413468103455112
MicroarraySNP,37,3913,3428,480963,1.326669456835531,0.1042337970515129
DEL1,11,1422,3454,483454,1.0827443181309182,0.7511073348307502
DUP1,7,1222,3458,483654,0.801192045959037,0.7322393651321926
CNV,18,2644,3447,482232,0.9524140573464528,1.0
AllMut,55,6557,3410,478319,1.1765780967889523,0.237088751787433


In [195]:
#This code calculates the odds ratio and p-value for a single pathogenic or LOF PRKN mutations (detected by microarray only) in participants with or without a parent with PD


import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'R275W', 'C253Y', 'T240M', 'K211N'])
SNP.drop(columns=['FID','PAT','MAT','PHENOTYPE','C253Y','K211N','SEX'], axis=1, inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left') 

#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']

#Read in PDStatus csv file
PDStatus = pd.read_csv('/path/to/PD_April2021_with_covariates.csv', na_values='NaN', header=0)
PDStatus['FPD'] = 1

FPDStatus = pd.read_csv('/path/to/ALL_indi_with_PD_parent_UNIQUE.csv', na_values='NaN', header=0)
FPDStatus['PD'] = 1

#Merge SNP and DEL
df2 = pd.merge(df, PDStatus, on ='IID', how ='left') 
df2 = pd.merge(df2, FPDStatus, on = 'IID', how='left')

df2['PD'] = df2['PD'].fillna(0)
df2['FPD'] = df2['FPD'].fillna(0)

df2['PD'].value_counts()

#Limit to cases with only 0 or 1 detected PRKN path or LOF variant
df2 = df2.loc[df2['MutTotal'] <= 1.0]

#PD in T240M
PDT240M = df2.loc[(df2['T240M'] == 1) & (df2['PD'] == 1)]
PDT240Mc = PDT240M['PD'].count()

NoPDT240M = df2.loc[(df2['T240M'] == 1) & (df2['PD'] == 0)]
NoPDT240Mc = NoPDT240M['PD'].count()

PDNoT240M = df2.loc[(df2['T240M'] == 0) & (df2['PD'] == 1)]
PDNoT240Mc = PDNoT240M['PD'].count()

NoPDNoT240M = df2.loc[(df2['T240M'] == 0) & (df2['PD'] == 0)]
NoPDNoT240Mc = NoPDNoT240M['PD'].count()

testT240M, pvalueT240M = stats.fisher_exact([[PDT240Mc, NoPDT240Mc], [PDNoT240Mc, NoPDNoT240Mc]])

#PD in R275W
PDR275W = df2.loc[(df2['R275W'] == 1) & (df2['PD'] == 1)]
PDR275Wc = PDR275W['PD'].count()

NoPDR275W = df2.loc[(df2['R275W'] == 1) & (df2['PD'] == 0)]
NoPDR275Wc = NoPDR275W['PD'].count()

PDNoR275W = df2.loc[(df2['R275W'] == 0) & (df2['PD'] == 1)]
PDNoR275Wc = PDNoR275W['PD'].count()

NoPDNoR275W = df2.loc[(df2['R275W'] == 0) & (df2['PD'] == 0)]
NoPDNoR275Wc = NoPDNoR275W['PD'].count()

testR275W, pvalueR275W = stats.fisher_exact([[PDR275Wc, NoPDR275Wc], [PDNoR275Wc, NoPDNoR275Wc]])

#PD in DEL1
PDDEL1 = df2.loc[(df2['DEL1'] == 1) & (df2['PD'] == 1)]
PDDEL1c = PDDEL1['PD'].count()

NoPDDEL1 = df2.loc[(df2['DEL1'] == 1) & (df2['PD'] == 0)]
NoPDDEL1c = NoPDDEL1['PD'].count()

PDNoDEL1 = df2.loc[(df2['DEL1'] == 0) & (df2['PD'] == 1)]
PDNoDEL1c = PDNoDEL1['PD'].count()

NoPDNoDEL1 = df2.loc[(df2['DEL1'] == 0) & (df2['PD'] == 0)]
NoPDNoDEL1c = NoPDNoDEL1['PD'].count()

testDEL1, pvalueDEL1 = stats.fisher_exact([[PDDEL1c, NoPDDEL1c], [PDNoDEL1c, NoPDNoDEL1c]])

#PD in DUP1
PDDUP1 = df2.loc[(df2['DUP1'] == 1) & (df2['PD'] == 1)]
PDDUP1c = PDDUP1['PD'].count()

NoPDDUP1 = df2.loc[(df2['DUP1'] == 1) & (df2['PD'] == 0)]
NoPDDUP1c = NoPDDUP1['PD'].count()

PDNoDUP1 = df2.loc[(df2['DUP1'] == 0) & (df2['PD'] == 1)]
PDNoDUP1c = PDNoDUP1['PD'].count()

NoPDNoDUP1 = df2.loc[(df2['DUP1'] == 0) & (df2['PD'] == 0)]
NoPDNoDUP1c = NoPDNoDUP1['PD'].count()

testDUP1, pvalueDUP1 = stats.fisher_exact([[PDDUP1c, NoPDDUP1c], [PDNoDUP1c, NoPDNoDUP1c]])

#PD in AnyCNV
PDCNV = df2.loc[((df2['DUP1'] == 1) | (df2['DEL1'] == 1)) & (df2['PD'] == 1)]
PDCNVc = PDCNV['PD'].count()

NoPDCNV = df2.loc[((df2['DUP1'] == 1) | (df2['DEL1'] == 1)) & (df2['PD'] == 0)]
NoPDCNVc = NoPDCNV['PD'].count()

PDNoCNV = df2.loc[((df2['DUP1'] == 0) & (df2['DEL1'] == 0)) & (df2['PD'] == 1)]
PDNoCNVc = PDNoCNV['PD'].count()

NoPDNoCNV = df2.loc[((df2['DUP1'] == 0) & (df2['DEL1'] == 0)) & (df2['PD'] == 0)]
NoPDNoCNVc = NoPDNoCNV['PD'].count()

testCNV, pvalueCNV = stats.fisher_exact([[PDCNVc, NoPDCNVc], [PDNoCNVc, NoPDNoCNVc]])

#PD in WESTotal
PDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 1)]
PDWESTotalc = PDWESTotal['PD'].count()

NoPDWESTotal = df2.loc[(df2['WESTotal'] == 1) & (df2['PD'] == 0)]
NoPDWESTotalc = NoPDWESTotal['PD'].count()

PDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 1)]
PDNoWESTotalc = PDNoWESTotal['PD'].count()

NoPDNoWESTotal = df2.loc[(df2['WESTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoWESTotalc = NoPDNoWESTotal['PD'].count()

testWESTotal, pvalueWESTotal = stats.fisher_exact([[PDWESTotalc, NoPDWESTotalc], [PDNoWESTotalc, NoPDNoWESTotalc]])


#PD in MutTotal
PDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 1)]
PDMutTotalc = PDMutTotal['PD'].count()

NoPDMutTotal = df2.loc[(df2['MutTotal'] == 1) & (df2['PD'] == 0)]
NoPDMutTotalc = NoPDMutTotal['PD'].count()

PDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 1)]
PDNoMutTotalc = PDNoMutTotal['PD'].count()

NoPDNoMutTotal = df2.loc[(df2['MutTotal'] == 0) & (df2['PD'] == 0)]
NoPDNoMutTotalc = NoPDNoMutTotal['PD'].count()

testMutTotal, pvalueMutTotal = stats.fisher_exact([[PDMutTotalc, NoPDMutTotalc], [PDNoMutTotalc, NoPDNoMutTotalc]])


#Make pandas dataframe with result
results = pd.DataFrame(np.array([['T240M', PDT240Mc, NoPDT240Mc, PDNoT240Mc, NoPDNoT240Mc, testT240M, pvalueT240M],['R275W', PDR275Wc, NoPDR275Wc, PDNoR275Wc, NoPDNoR275Wc, testR275W, pvalueR275W], ['MicroarraySNP', PDWESTotalc, NoPDWESTotalc, PDNoWESTotalc, NoPDNoWESTotalc, testWESTotal, pvalueWESTotal], ['DEL1', PDDEL1c, NoPDDEL1c, PDNoDEL1c, NoPDNoDEL1c, testDEL1, pvalueDEL1], ['DUP1', PDDUP1c, NoPDDUP1c, PDNoDUP1c, NoPDNoDUP1c, testDUP1, pvalueDUP1],['CNV', PDCNVc, NoPDCNVc, PDNoCNVc, NoPDNoCNVc, testCNV, pvalueCNV],['AllMut', PDMutTotalc, NoPDMutTotalc, PDNoMutTotalc, NoPDNoMutTotalc, testMutTotal, pvalueMutTotal]]), columns=['Mutation','Mut_Cases', 'Mut_CTRLs', 'NoMut_Cases','NoMut_CTRLs','Odds','p-value'])
results.set_index('Mutation', inplace=True)

#Write results to file
results.to_csv('/path/to/MicroarrayMutation_FPD_Odds_Table.csv')
results


Unnamed: 0_level_0,Mut_Cases,Mut_CTRLs,NoMut_Cases,NoMut_CTRLs,Odds,p-value
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T240M,9,202,17644,469936,1.186677770021391,0.5775949679519168
R275W,135,3604,17363,462088,0.9968938376175038,1.0
MicroarraySNP,144,3806,17531,466860,1.0075664179657666,0.9318019186661882
DEL1,65,1368,17610,469298,1.266241685512285,0.0654163270876398
DUP1,43,1186,17632,469480,0.9653821964051258,0.8785908906394373
CNV,108,2554,17567,468112,1.1268212685572665,0.2310247294794069
AllMut,252,6360,17423,464306,1.055904849261278,0.3888563232544414


In [45]:
#This code determines what proportion of mutations are identified by microarray using WES SNPs + microarray CNVs as gold standard

import pandas as pd
import time
import os
import numpy as np
import scipy.stats as stats


#Read in SNP csv file
SNP = pd.read_csv('/path/to/PRKN_variants_WES_touse.csv', na_values='NaN', header=0)
SNP.drop(columns = "number_of_var", inplace=True)
SNP['WESTotal'] = SNP.drop('IID', axis=1).sum(axis=1)

#Read in DEL1 csv file
DEL1 = pd.read_csv('/path/to/del_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DEL1', 'DEL1_Exon', 'F'])
DEL1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DUP1 csv file
DUP1 = pd.read_csv('/path/to/dup_list_1only.csv', na_values='NaN', header=0, names = ['IID','B', 'C', 'DUP1', 'DUP1_Exon', 'F'])
DUP1.drop(columns=['B','C','F'], axis=1, inplace=True)

#Read in DELDUP2 csv file
DELDUP2 = pd.read_csv('/path/to/2DELDUP.csv', na_values='NaN', header=0)

#Read in mSNP csv file
mSNP = pd.read_csv('/path/to/PRKN_pathogenic.csv', na_values='NaN', header=0, names = ['FID','IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE', 'R275W', 'C253Y', 'T240M', 'K211N'])
mSNP.drop(columns=['FID','PAT','MAT','PHENOTYPE','C253Y','K211N','SEX'], axis=1, inplace=True)
mSNP['mSNPTotal'] = mSNP.drop('IID', axis=1).sum(axis=1)


#Read in PDStatus csv file
PDStatus = pd.read_csv('/path/to/PD_April2021_with_covariates.csv', na_values='NaN', header=0)
PDStatus['FPD'] = 1

FPDStatus = pd.read_csv('/path/to/ALL_indi_with_PD_parent_UNIQUE.csv', na_values='NaN', header=0)
FPDStatus['PD'] = 1

#Merge SNP and DEL
df2 = pd.merge(df, PDStatus, on ='IID', how ='left') 
df2 = pd.merge(df2, FPDStatus, on = 'IID', how='left')

#Merge SNP and DEL
df = pd.merge(SNP, DEL1, on ='IID', how ='left') 
df = pd.merge(df, DUP1, on ='IID', how ='left')
df = pd.merge(df, DELDUP2, on ='IID', how ='left')
df = pd.merge(df, mSNP, on ='IID', how ='left')
df = pd.merge(df, PDStatus, on ='IID', how ='left') 
df = pd.merge(df, FPDStatus, on = 'IID', how='left')


#Fix na in in DEL1 and DUP1 columns
df['DEL1'] = df['DEL1'].fillna(0)
df['DEL1_Exon'] = df['DEL1_Exon'].fillna('')
df['DUP1'] = df['DUP1'].fillna(0)
df['DUP1_Exon'] = df['DUP1_Exon'].fillna('')
df['2DELDUP'] = df['2DELDUP'].fillna(0)
df['PD'] = df['PD'].fillna(0)
df['FPD'] = df['FPD'].fillna(0)

#Read in key to annotate
WESkey = pd.read_csv('/path/to/WESKey.csv', na_values='NaN', header=0)

df['CNVTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP']
df['MutTotal'] = df['CNVTotal'] + df['WESTotal']
df['microTotal'] = df['DEL1'] + df['DUP1'] + df['2DELDUP'] + df['R275W'] + df['T240M'] 

twomuts = df.loc[df['MutTotal'] >= 2]
anymut = df.loc[df['MutTotal'] >= 1]

print('Number of mutations detected by microarray in biallelic PRKN participants:')
print(twomuts['microTotal'].value_counts())
print('Number of mutations detected by WES + microarray in biallelic PRKN participants:')
print(twomuts['MutTotal'].value_counts())
print('Number of mutations detected by microarray in 1 and 2 PRKN mutation carriers:')
print(anymut['microTotal'].value_counts())
print('Number of mutations detected by WES + microarray in 1 and 2 PRKN mutation carriers:')
print(anymut['MutTotal'].value_counts())

posmicro = anymut[(anymut['microTotal']>=1)]
negmicro = anymut[(anymut['microTotal']==0)]
SNPmicro = anymut[(anymut['R275W']>=1) | (anymut['T240M']>=1)]

print('Proportion of PRKN mutations detected by microarray:')
print(posmicro['MutTotal'].sum() / anymut['MutTotal'].sum())
print('Proportion of PRKN SNPs detected by microarray:')
print(SNPmicro['microTotal'].sum() / anymut['MutTotal'].sum())


Number of mutations detected by microarray in biallelic PRKN participants:
2.0    13
1.0     8
0.0     3
Name: microTotal, dtype: int64
Number of mutations detected by WES + microarray in biallelic PRKN participants:
2.0    24
Name: MutTotal, dtype: int64
Number of mutations detected by microarray in 1 and 2 PRKN mutation carriers:
1.0    2679
0.0     954
2.0      13
Name: microTotal, dtype: int64
Number of mutations detected by WES + microarray in 1 and 2 PRKN mutation carriers:
1.0    3671
2.0      24
Name: MutTotal, dtype: int64
Proportion of PRKN mutations detected by microarray:
0.7294971766603926
Proportion of PRKN SNPs detected by microarray:
0.42350094111320247
