# ~ Beagle 5.4 Homozygous Refererence Imputation Accuracy Experiment ~
# Part 2

In [1]:
#This script loads in the golden reference chromosome 1 region and the imputed chromosome 1 region.
#It subsets both dataframes to leave the variant positions that are common between both files
#A perfect imputation of the homozygous reference region would result in all alleles in the imputed file being
    # Homozygous reference, therefore a measurement is taken to show how many positions are non homo-ref

In [3]:
import pandas as pd

In [4]:
golden_ref_path = '/Users/jerenolsen/Desktop/All_Tests/Beagle_Imputation_exp/chr1_subsection_experiment_golden_reference.vcf'
imputed_path = '/Users/jerenolsen/Desktop/All_Tests/Beagle_Imputation_exp/imputed_out.vcf'

In [5]:
def read_file(filepath):
    col_line = None
    header = []
    with open(filepath, 'r') as f:
        for line in f:
            if line[0:2] == '##':
                header.append(line)
            elif line[0] == '#':
                col_line = col_line
                break

    cols = line.strip('\n').rsplit('\t')
    col_types = {col:str for col in cols}
    #pd.read_csv(cgi_path, sep='\t', names = cols, header = None, comment = '#', dtype=col_types, chunksize=100000)
    df = pd.read_csv(filepath, sep = '\t', names = cols, header=None, comment='#' , dtype=col_types)
    
    return df

In [6]:
df_golden_ref = read_file(golden_ref_path)
df_imputed = read_file(imputed_path)

In [7]:
df_golden_ref

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,11431,.,C,C,.,.,.,GT,0/0
1,1,11432,.,T,T,.,.,.,GT,0/0
2,1,11433,.,G,G,.,.,.,GT,0/0
3,1,11434,.,C,C,.,.,.,GT,0/0
4,1,11435,.,C,C,.,.,.,GT,0/0
...,...,...,...,...,...,...,...,...,...,...
8353396,1,9676810,.,A,A,.,.,.,GT,0/0
8353397,1,9676811,.,T,T,.,.,.,GT,0/0
8353398,1,9676812,.,G,G,.,.,.,GT,0/0
8353399,1,9676813,.,C,C,.,.,.,GT,0/0


In [8]:
df_imputed

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,13110,rs540538026,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
1,1,13116,rs62635286,T,G,.,PASS,DR2=0.00;AF=0.0321;IMP,GT:DS,0|0:0.06
2,1,13118,rs200579949,A,G,.,PASS,DR2=0.00;AF=0.0321;IMP,GT:DS,0|0:0.06
3,1,13273,rs531730856,G,C,.,PASS,DR2=0.01;AF=0.0431;IMP,GT:DS,0|0:0.09
4,1,13284,rs548333521,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
...,...,...,...,...,...,...,...,...,...,...
5550,1,997598,rs542296068,T,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
5551,1,997769,rs113991671,CG,C,.,PASS,DR2=1.00;AF=0.5000;IMP,GT:DS,0|1:1
5552,1,997775,rs541364701,G,C,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
5553,1,997901,rs117976668,C,T,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0


In [9]:
def filter_imputed(df):
    
    #Only consider SNPs and unambiguos variants
    df = df[df['REF'].str.len() == 1]
    df = df[df['ALT'].str.len() == 1]
    
    #Only consider imputed positions
    df = df[df['INFO'].str.contains('IMP')]
    
    
    df.reset_index(drop=True, inplace=True)
    return df
    

In [10]:
df_imputed_filtered = filter_imputed(df_imputed)

In [11]:
df_imputed_filtered

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,13110,rs540538026,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
1,1,13116,rs62635286,T,G,.,PASS,DR2=0.00;AF=0.0321;IMP,GT:DS,0|0:0.06
2,1,13118,rs200579949,A,G,.,PASS,DR2=0.00;AF=0.0321;IMP,GT:DS,0|0:0.06
3,1,13273,rs531730856,G,C,.,PASS,DR2=0.01;AF=0.0431;IMP,GT:DS,0|0:0.09
4,1,13284,rs548333521,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
...,...,...,...,...,...,...,...,...,...,...
4802,1,997594,rs376410450,C,T,.,PASS,DR2=0.01;AF=0.0135;IMP,GT:DS,0|0:0.03
4803,1,997598,rs542296068,T,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
4804,1,997775,rs541364701,G,C,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
4805,1,997901,rs117976668,C,T,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0


In [12]:
imputed_positions = df_imputed_filtered['POS'].values.tolist()

In [13]:
def subset_golden_ref(df, positions):
    
    df = df[df['POS'].isin(positions)]
    
    df.reset_index(drop=True, inplace=True)
    return df

In [14]:
golden_ref_subset = subset_golden_ref(df_golden_ref,imputed_positions)

In [15]:
golden_ref_subset

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,13380,.,A,A,.,.,.,GT,0/0
1,1,13483,.,G,G,.,.,.,GT,0/0
2,1,13494,.,C,C,.,.,.,GT,0/0
3,1,13550,.,T,T,.,.,.,GT,0/0
4,1,15585,.,T,T,.,.,.,GT,0/0
...,...,...,...,...,...,...,...,...,...,...
3286,1,997233,.,A,A,.,.,.,GT,0/0
3287,1,997594,.,A,A,.,.,.,GT,0/0
3288,1,997598,.,G,G,.,.,.,GT,0/0
3289,1,997901,.,C,C,.,.,.,GT,0/0


In [16]:
positions = golden_ref_subset['POS'].values.tolist()

In [17]:
def subset_imputed_on_golden_ref(df, positions):
    df = df[df['POS'].isin(positions)]
    df.reset_index(drop=True, inplace=True)
    return df

In [18]:
df_imputed_filtered_subset = subset_imputed_on_golden_ref(df_imputed_filtered, positions)

In [19]:
df_imputed_filtered_subset

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,13380,rs571093408,C,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
1,1,13483,rs554760071,G,C,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
2,1,13494,rs574697788,A,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
3,1,13550,rs554008981,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
4,1,15585,rs533630043,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
...,...,...,...,...,...,...,...,...,...,...
3286,1,997233,rs570518763,T,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
3287,1,997594,rs376410450,C,T,.,PASS,DR2=0.01;AF=0.0135;IMP,GT:DS,0|0:0.03
3288,1,997598,rs542296068,T,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0
3289,1,997901,rs117976668,C,T,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0|0:0


In [20]:
df_imputed_filtered_subset['NA00001'] = df_imputed_filtered_subset['NA00001'].str.replace('|','/')
df_imputed_filtered_subset['NA00001'] = df_imputed_filtered_subset['NA00001'].str[0:3]

  df_imputed_filtered_subset['NA00001'] = df_imputed_filtered_subset['NA00001'].str.replace('|','/')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imputed_filtered_subset['NA00001'] = df_imputed_filtered_subset['NA00001'].str.replace('|','/')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imputed_filtered_subset['NA00001'] = df_imputed_filtered_subset['NA00001'].str[0:3]


In [21]:
df_imputed_filtered_subset

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,1,13380,rs571093408,C,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
1,1,13483,rs554760071,G,C,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
2,1,13494,rs574697788,A,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
3,1,13550,rs554008981,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
4,1,15585,rs533630043,G,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
...,...,...,...,...,...,...,...,...,...,...
3286,1,997233,rs570518763,T,A,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
3287,1,997594,rs376410450,C,T,.,PASS,DR2=0.01;AF=0.0135;IMP,GT:DS,0/0
3288,1,997598,rs542296068,T,G,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0
3289,1,997901,rs117976668,C,T,.,PASS,DR2=0.00;AF=0.0000;IMP,GT:DS,0/0


In [22]:
total_positions = len(df_imputed_filtered_subset)

In [28]:
counts = df_imputed_filtered_subset['NA00001'].value_counts()
counts

0/0    3261
0/1      25
1/0       4
1/1       1
Name: NA00001, dtype: int64

In [24]:
test_homo_ref = df_imputed_filtered_subset['NA00001'].value_counts()[0]

In [25]:
golden_homo_ref = golden_ref_subset['NA00001'].value_counts()[0]

In [26]:
x1 = golden_homo_ref
x2 = test_homo_ref

In [27]:
str(round(abs(x2-x1)/x1*100,2))+'%'

'0.91%'

In [33]:
from tabulate import tabulate

data = [
    ["Total Positions", total_positions],
    ["Homozygous Reference", counts[0]],
    ["Heterozygous Alternate", counts[1]+counts[2]],
    ["Homozygous Alternate", counts[3]],
    ["Changed Percentage", f"{str(round(abs(x2-x1)/x1*100,2))}%"]
]

In [36]:
print(tabulate(data, headers=["CHR1 Subregion Alleles",'Measure'], tablefmt="fancy_grid", numalign="right"))

╒══════════════════════════╤═══════════╕
│ CHR1 Subregion Alleles   │ Measure   │
╞══════════════════════════╪═══════════╡
│ Total Positions          │ 3291      │
├──────────────────────────┼───────────┤
│ Homozygous Reference     │ 3261      │
├──────────────────────────┼───────────┤
│ Heterozygous Alternate   │ 29        │
├──────────────────────────┼───────────┤
│ Homozygous Alternate     │ 1         │
├──────────────────────────┼───────────┤
│ Changed Percentage       │ 0.91%     │
╘══════════════════════════╧═══════════╛
