In [None]:
import pandas as pd
import time
import os
import numpy as np

In [None]:
# Input files (tab-separated GWAS summary statistics).
# NOTE(review): both are absolute paths on one user's machine, so the notebook
# will not run elsewhere without editing them.
# test_path is not referenced by any other cell visible in this notebook.
test_path = '/Users/jerenolsen/Desktop/GWAS_Summary_Stats/categorical-20002-both_sexes-1330_first_100000.tsv'
# Full summary-statistics file that the rest of the notebook processes.
gwas_path = '/Users/jerenolsen/Desktop/GWAS_Summary_Stats/GERD-icd10-K21-both_sexes.tsv'

In [None]:
def get_cols(filepath):
    """Return the column names from the header line of a tab-separated file.

    Only the first line of the file is read, so this is cheap even for very
    large files.

    Parameters
    ----------
    filepath : str
        Path to a tab-separated text file whose first line is the header.

    Returns
    -------
    list of str
        The header fields in file order. An empty list is returned when the
        file is completely empty (no header line at all).
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, 'r') as f:
        header = f.readline()

    # readline() returns '' only at end-of-file, i.e. for an empty file.
    if header == '':
        return []

    return header.strip('\n').split('\t')

def get_types(filepath):
    """Record the dtype pandas infers for each column of a tab-separated file.

    Only the first three data rows are parsed, so the inferred dtypes reflect
    those rows alone; a column that looks numeric there may still contain
    non-numeric values further down the file.

    Parameters
    ----------
    filepath : str
        Path to a tab-separated file with a header line.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per input column; the cell in
        row 0 holds the dtype inferred for that column.
    """
    preview = pd.read_csv(filepath, nrows=3, sep='\t')

    # Wrap each dtype in a one-element list so the result is a one-row frame.
    return pd.DataFrame.from_dict(
        {col: [dtype] for col, dtype in preview.dtypes.items()}
    )

In [None]:
# One-row frame of the dtypes pandas infers from the first rows of the file.
df_types = get_types(gwas_path) #Recast these types later

In [None]:
# Override the inferred dtype recorded for 'chr' with a string dtype.
# Presumably because chromosome labels are not all numeric (later cells
# compare against 'X' and 'Y') -- confirm.
df_types['chr'] = np.dtype(str)

In [None]:
# Inspect what kind of object is stored for the 'ref' column in row 0.
type(df_types.loc[0, 'ref'])

In [None]:
# Header names of the summary-statistics file, in file order.
line1 = get_cols(gwas_path)
cols = line1
# Keep every column whose name contains none of the excluded substrings.
cols_to_keep = [
    name
    for name in cols
    if not any(tag in name for tag in ('AFR', 'CSA', 'meta', 'MID', 'EAS', 'AMR'))
]
# Read every column as a string.
dtype = dict.fromkeys(cols, str)

In [None]:
cols

In [None]:
cols_to_keep

In [None]:
# Stream the file in 100,000-row chunks. `usecols` makes the parser skip the
# columns that are not in cols_to_keep, so they are never materialised in
# memory only to be thrown away.
chunks = pd.read_csv(
    gwas_path,
    names=cols,
    header=0,  # the file's own header line is replaced by `names`
    usecols=cols_to_keep,
    dtype=dtype,
    sep='\t',
    chunksize=100000,
)
filtered_chunks = []
i = 0  # stays 0 if the file yields no chunks
for i, chunk in enumerate(chunks, start=1):
    # Re-select explicitly so the column order always follows cols_to_keep.
    filtered_chunks.append(chunk[cols_to_keep])
    print(f"Chunk {i} finished")

In [None]:
df = pd.concat(filtered_chunks)

In [None]:
df 

In [None]:
# Target col names: rsid,chr,pos,a0,a1,beta,beta_se,N,p

In [None]:
def rename_pattern_ldpred2(df, df_types):
    """Rename columns of ``df`` and ``df_types`` in place to the LDpred2 names.

    Both frames are modified in place and nothing is returned. Columns that
    are absent from a frame are silently left alone. Only the labels change:
    the 'p' column still carries whatever values 'neglog10_pval_EUR' held.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary-statistics frame to relabel.
    df_types : pandas.DataFrame
        Companion frame of per-column dtypes, relabelled the same way.
    """
    ldpred2_names = {
        'ref': 'a0',
        'alt': 'a1',
        'beta_EUR': 'beta',
        'se_EUR': 'beta_se',
        'neglog10_pval_EUR': 'p',
    }
    for frame in (df, df_types):
        frame.rename(columns=ldpred2_names, inplace=True)
    
def rename_pattern_prsice(df, df_types):
    """Rename columns of ``df`` and ``df_types`` in place to the PRSice names.

    Both frames are modified in place and nothing is returned. Columns that
    are absent from a frame are silently left alone. Only the labels change:
    the 'P' column still carries whatever values 'neglog10_pval_EUR' held.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary-statistics frame to relabel.
    df_types : pandas.DataFrame
        Companion frame of per-column dtypes, relabelled the same way.
    """
    prsice_names = {
        'chr': 'CHR',
        'pos': 'BP',
        'ref': 'ref',  # A2 (name kept as-is)
        'alt': 'alt',  # A1 (name kept as-is)
        'beta_EUR': 'BETA',
        'se_EUR': 'SE',
        'neglog10_pval_EUR': 'P',
    }
    for frame in (df, df_types):
        frame.rename(columns=prsice_names, inplace=True)

In [None]:
# Relabel df and df_types in place with the PRSice column names (returns None).
rename_pattern_prsice(df, df_types)

In [None]:
#Sample size per variant = 361,194 (http://www.nealelab.is/uk-biobank/faq)

In [None]:
# Attach a constant per-variant sample size and record its dtype.
# NOTE(review): 'N' is not among the columns selected in the next cell, so it
# is dropped from df again before anything is written -- confirm that is intended.
df['N'] = 361194
df_types['N'] = np.dtype(np.int64)

In [None]:
# Keep only the PRSice columns, in this order; every other column is discarded.
df = df[['CHR','BP','ref','alt','BETA','SE','P']]
# Equivalent selection for the LDpred2 naming scheme (currently disabled):
#df = df[['chr','pos', 'a0', 'a1', 'beta', 'beta_se','N', 'p']]

In [None]:
df

In [None]:
#Filter out non-snps

In [None]:
def filter_snps_ldpred2(df):
    """Keep single-base variants that are not on chromosome X or Y.

    A row is kept when both 'a0' and 'a1' are exactly one character long and
    'chr' is neither 'X' nor 'Y' (the same exclusions as
    ``filter_snps_prsice``). Rows with a missing allele are dropped because
    their length cannot equal 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'a0', 'a1' and 'chr'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows and a fresh 0..n-1 index; the
        input frame is left untouched.
    """
    single_base = (df['a0'].str.len() == 1) & (df['a1'].str.len() == 1)
    not_sex_chrom = ~df['chr'].isin(['X', 'Y'])

    # reset_index returns a new frame, avoiding in-place edits on a slice.
    return df[single_base & not_sex_chrom].reset_index(drop=True)
    
def filter_snps_prsice(df):
    """Keep single-base variants that are not on chromosome X or Y.

    A row survives when 'ref' and 'alt' are each exactly one character long
    and 'CHR' is neither 'X' nor 'Y'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'ref', 'alt' and 'CHR'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    ref_is_single = df['ref'].str.len().eq(1)
    alt_is_single = df['alt'].str.len().eq(1)
    not_xy = ~df['CHR'].isin(['X', 'Y'])

    kept = df.loc[ref_is_single & alt_is_single & not_xy]
    return kept.reset_index(drop=True)

In [None]:
# Drop non-single-base variants and variants on chromosomes X/Y.
df = filter_snps_prsice(df)

In [None]:
df

In [None]:
# Remove positions that are missing a beta, beta_se or p value

In [None]:
def filter_missing_vals_ldpred2(df):
    """Drop rows that are missing an effect size, its standard error or a p value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'beta', 'beta_se' and 'p'.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only rows where all three columns are present
        (non-NaN), with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the three columns is absent from ``df``.
    """
    # All three statistics are required, not just 'beta'.
    complete = df.dropna(subset=['beta', 'beta_se', 'p'])
    return complete.reset_index(drop=True)
    
def filter_missing_vals_prsice(df):
    """Drop rows that are missing an effect size, its standard error or a p value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'BETA', 'SE' and 'P'.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only rows where all three columns are present
        (non-NaN), with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the three columns is absent from ``df``.
    """
    # All three statistics are required, not just 'BETA'.
    complete = df.dropna(subset=['BETA', 'SE', 'P'])
    return complete.reset_index(drop=True)

In [None]:
# Drop rows with missing effect statistics (see filter_missing_vals_prsice).
df = filter_missing_vals_prsice(df)

In [None]:
# Count rows with (True) and without (False) a standard error.
mask = df['SE'].notna()
mask.value_counts()

In [None]:
# Count rows with (True) and without (False) a value in the P column.
mask = df['P'].notna()
mask.value_counts()

In [None]:
def remove_dups_prsice(df):
    """Drop every row whose (CHR, BP) position occurs more than once.

    All copies of a repeated position are removed, not just the later ones.
    CHR and BP are compared as a pair, so chromosome '1' position '12345' is
    never confused with chromosome '11' position '2345' (which would happen
    if the two strings were simply concatenated into one key).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'CHR' and 'BP'.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the positions that appear exactly once and a
        fresh 0..n-1 index. The input frame is not modified and no helper
        columns are added to it.
    """
    # keep=False flags every member of a duplicated group.
    repeated = df.duplicated(subset=['CHR', 'BP'], keep=False)
    return df[~repeated].reset_index(drop=True)

In [None]:
# Drop positions (CHR/BP) that occur more than once.
df = remove_dups_prsice(df)

In [None]:
def handle_sci_notation(df, p_is_neglog10=True):
    """Convert the string-typed BETA, SE and P columns to floats.

    Values written in scientific notation (e.g. '1.5e-03') are parsed by the
    float conversion. In this notebook the 'P' column is the renamed
    'neglog10_pval_EUR' column, i.e. it holds -log10(p) rather than p itself,
    so by default it is converted back to a real p value with 10 ** (-x).
    Without that step, filtering on ``P <= 1.0`` would discard every variant
    with p < 0.1 and the output file would label -log10(p) as P.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'BETA', 'SE' and 'P' holding numeric strings.
    p_is_neglog10 : bool, default True
        When True, 'P' is treated as -log10(p) and converted to p. Pass
        False if 'P' already contains plain p values.

    Returns
    -------
    pandas.DataFrame
        A new frame with float 'BETA', 'SE' and 'P' columns; the input frame
        is left untouched.

    Notes
    -----
    Extremely small p values (roughly -log10(p) above 323) underflow to 0.0
    in double precision.
    """
    # assign() builds a new frame, so a filtered slice is never edited in place.
    numeric = df.assign(
        BETA=df['BETA'].astype(float),
        SE=df['SE'].astype(float),
        P=df['P'].astype(float),
    )
    if p_is_neglog10:
        numeric['P'] = 10.0 ** (-numeric['P'])
    return numeric

In [None]:
# Convert the string-typed statistic columns to floats.
df = handle_sci_notation(df)

In [None]:
def remove_p_vals(df):
    """Keep only the rows whose 'P' value is at most 1.0.

    Rows where 'P' is NaN fail the comparison and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'P' column.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    within_range = df['P'] <= 1.0
    return df.loc[within_range].reset_index(drop=True)

In [None]:
# Keep only rows with P <= 1.0.
df = remove_p_vals(df)

In [None]:
#Recast types
#for col in df.columns:
    #df[col].astype(df_types[col].values[0])

In [None]:
df.head(3)

In [None]:
def write_gwas(df, condition_name, outdir):
    """Write ``df`` as a tab-separated file named after the condition.

    The file is called 'UKB_GWAS_SumStats_<condition_name>_processed.txt'
    and is written without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    condition_name : str
        Label inserted into the output file name.
    outdir : str
        Directory in which the file is created.
    """
    outpath = os.path.join(outdir, f'UKB_GWAS_SumStats_{condition_name}_processed.txt')
    df.to_csv(outpath, sep='\t', index=False)

In [None]:
# Write the processed table next to the input files.
# NOTE(review): outdir is an absolute path on one user's machine.
outdir = '/Users/jerenolsen/Desktop/GWAS_Summary_Stats'
# condition_name becomes part of the output file name.
condition_name = 'GERD'
write_gwas(df, condition_name, outdir)