In [1]:
import pandas as pd

# Merge Filtered ClinVar Variants with Harvard PGP study population variants

In [35]:
# Harvard PGP variants in study population that are identified as P/LP in ClinVar Variants Summary #
path = '/Users/jerenolsen/Desktop/All_Tests/GBC_Querying/hvd-pgp-ms-population-variants.tsv'
df_study = pd.read_csv(path, sep = '\t')

In [36]:
df_study

Unnamed: 0,chromosome,start_position,end_position,reference_bases,alternate_bases.alt,alternate_bases.AF,alternate_bases.DR2,names,quality,filter,IMP,genotype,phaseset,DS,sample_name,sample_id
0,1,25629819,25629820,T,G,0.4793,0.92,rs121912763,,PASS,True,1/0,*,0.96,hu5D24B4,577641578821216468
1,1,25629819,25629820,T,G,0.5000,1.00,rs121912763,,PASS,,0/1,*,1.00,hu4BA913,3734609915449686100
2,1,216424302,216424303,A,G,0.4363,0.77,rs45555435,,PASS,True,0/1,*,0.87,hu61EBEE,7589298601817165598
3,1,216424302,216424303,A,G,0.4814,0.93,rs45555435,,PASS,True,0/1,*,0.96,hu0486D6,129822115180450508
4,1,216424302,216424303,A,G,0.5000,1.00,rs45555435,,PASS,,0/1,*,1.00,huA49E22,7296257421245505846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11963,22,50964235,50964236,G,A,0.9684,0.03,rs11479,,PASS,True,1/1,*,1.94,hu05993A,7563897775230814227
11964,22,51063655,51063656,C,T,0.4616,0.86,rs148352371,,PASS,True,1/0,*,0.92,huE9E383,8517581573938360347
11965,22,38379542,38379543,G,A,0.5000,1.00,rs73415876,,PASS,,0/1,*,1.00,hu92C40A,1224008132487914232
11966,22,38379542,38379543,G,A,0.5000,1.00,rs73415876,,PASS,,1/0,*,1.00,hu604D39,4738226870084993710


In [37]:
#df_study[df_study['GeneSymbol'] == 'APC']['names'].value_counts()

In [38]:
def read_clinvar_variants(variants_path):
    sample = pd.read_csv(variants_path, header=0, nrows = 2, sep = '\t')
    cols = sample.columns
    dtypes = {col:str for col in cols}
    df = pd.read_csv(variants_path, names = cols, dtype = dtypes, sep = '\t', comment = '#')
    
    return df

def filter_variants(df):
    # SNPs only
    df = df[df['Type'] == 'single nucleotide variant']
    df = df[df['ReferenceAlleleVCF'].str.len() == 1]
    df = df[df['AlternateAlleleVCF'].str.len() == 1]

    # Filter for P/LP variants
    accepted_significance = ['Pathogenic','Likely pathogenic']
    df = df[df['ClinicalSignificance'].isin(accepted_significance)]
    
    # Only consider positions from GRCh37 Assembly
    df = df[df['Assembly'] == 'GRCh37']
    
    # Remove X,Y, MT chromosomes
    df = df[~df['Chromosome'].isin(['X','Y','MT'])]
    
    df.reset_index(drop=True, inplace=True)
    
    
    return df

In [46]:
clinvar_variants_path = '/Users/jerenolsen/Desktop/All_Tests/HVD-PGP Population Analysis/Deep Phenotyping copy/ClinVar Variants/variant_summary.txt'
df_clinvar = read_clinvar_variants(clinvar_variants_path)

In [47]:
df_clinvar = filter_variants(df_clinvar)

In [48]:
df_clinvar['GeneSymbol'].nunique()

4640

### Merge Study variants with Clinvar information

In [34]:
df = clinvar_rsid_geno

In [42]:
def add_merge_col1(clinvar_rsid_geno):
    df = clinvar_rsid_geno
    df['ref'] = df['Name'].str.split('>').str.get(0)
    df['change'] = df['Name'].str.split('>').str.get(1)
    df['ref'] = df['ref'].str.get(-1)
    df['change'] = df['change'].str.get(0)
    df['merge_col'] = 'rs'+df['RS# (dbSNP)']+'_'+df['ref']+df['change']
    return df
    
def add_merge_col2(df_study):
    df = df_study
    df['merge_col'] = df['names']+"_"+df['reference_bases'] + df['alternate_bases.alt']
    return df

In [43]:
df_study

Unnamed: 0,chromosome,start_position,end_position,reference_bases,alternate_bases.alt,alternate_bases.AF,alternate_bases.DR2,names,quality,filter,IMP,genotype,phaseset,DS,sample_name,sample_id
0,1,25629819,25629820,T,G,0.4793,0.92,rs121912763,,PASS,True,1/0,*,0.96,hu5D24B4,577641578821216468
1,1,25629819,25629820,T,G,0.5000,1.00,rs121912763,,PASS,,0/1,*,1.00,hu4BA913,3734609915449686100
2,1,216424302,216424303,A,G,0.4363,0.77,rs45555435,,PASS,True,0/1,*,0.87,hu61EBEE,7589298601817165598
3,1,216424302,216424303,A,G,0.4814,0.93,rs45555435,,PASS,True,0/1,*,0.96,hu0486D6,129822115180450508
4,1,216424302,216424303,A,G,0.5000,1.00,rs45555435,,PASS,,0/1,*,1.00,huA49E22,7296257421245505846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11963,22,50964235,50964236,G,A,0.9684,0.03,rs11479,,PASS,True,1/1,*,1.94,hu05993A,7563897775230814227
11964,22,51063655,51063656,C,T,0.4616,0.86,rs148352371,,PASS,True,1/0,*,0.92,huE9E383,8517581573938360347
11965,22,38379542,38379543,G,A,0.5000,1.00,rs73415876,,PASS,,0/1,*,1.00,hu92C40A,1224008132487914232
11966,22,38379542,38379543,G,A,0.5000,1.00,rs73415876,,PASS,,1/0,*,1.00,hu604D39,4738226870084993710


In [49]:
df_study['rsid integer'] = df_study['names'].str[2:]
clinvar_rsid_geno = df_clinvar[['Name','RS# (dbSNP)', 'GeneSymbol', 'ClinicalSignificance']]

clinvar_rsid_geno = add_merge_col1(clinvar_rsid_geno)
df_study = add_merge_col2(df_study)

df_study = df_study.merge(clinvar_rsid_geno, left_on = 'merge_col', right_on = 'merge_col')
outpath = '/Users/jerenolsen/Desktop/All_Tests/GBC_Querying/hvd_pgp_variants&genes.tsv'
df_study.to_csv(outpath, sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ref'] = df['Name'].str.split('>').str.get(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change'] = df['Name'].str.split('>').str.get(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ref'] = df['ref'].str.get(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try