In [1]:
# notes

# # divide index 0 by index 1 for first row to find proportion of sequences methylated
# (cancer_beta[0:1, 0] / cancer_beta[0:1, 1])[0]

# # create a dict of outputs from prop_meth function
# proportion_methylated = {
#     'cancer': prop_meth(cancer_beta_df),
#     'control': prop_meth(control_beta_df),
#     'cancerM': prop_meth(cancerM_beta_df)
#     }

# # create a column in cancer_beta_df that is the proportion of methylated sequences out of total sequences
# cancer_beta_df['prop_meth'] = cancer_beta_df['num_meth'].div(cancer_beta_df['total_reads'])

In [3]:
import numpy as np
import pandas as pd

# Locations of the per-sample wgbstools .beta files (cancer, cancerM, control).
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a single configurable data directory so the notebook runs elsewhere.
cancer = "/Users/meghansleeper/Desktop/farm-files/data/tissue-samples/271/merged/wgbstools-out/cancer-SRX381569-merged.beta"
cancerM = "/Users/meghansleeper/Desktop/farm-files/data/tissue-samples/271/merged/wgbstools-out/cancerM-SRX381585-merged.beta"
control = "/Users/meghansleeper/Desktop/farm-files/data/tissue-samples/271/merged/wgbstools-out/control-SRX381553-merged.beta"

# Read every file as a flat stream of uint8 values and fold it into
# two-value rows; the samples are read in the order cancer, cancerM, control.
cancer_beta, cancerM_beta, control_beta = (
    np.fromfile(beta_path, dtype=np.uint8).reshape((-1, 2))
    for beta_path in (cancer, cancerM, control)
)

# Wrap each array in a DataFrame so the two values per row are addressable by name.
cancer_beta_df, cancerM_beta_df, control_beta_df = (
    pd.DataFrame(counts, columns=['num_meth', 'total_reads'])
    for counts in (cancer_beta, cancerM_beta, control_beta)
)


In [4]:
def calc_beta_info(beta):
    """Print a coverage report for a beta array and return the statistics.

    Parameters
    ----------
    beta : numpy.ndarray
        Two-dimensional array indexed as ``beta[:, 1]``. The printed report
        labels the columns as [# of methylated sequences, # of sequences
        total]; every statistic below is computed on the second column.

    Returns
    -------
    tuple
        ``(length, mean_coverage, median_coverage, std_coverage, quantile_1,
        quantile_2, quantile_3, max_coverage, min_coverage)`` where ``length``
        is ``len(beta)`` and the quantiles are the 0.25, 0.50 and 0.75
        quantiles of the second column.
    """
    # Slice the coverage column once instead of once per statistic.
    coverage = beta[:, 1]

    length = len(beta)
    mean_coverage = np.mean(coverage)
    median_coverage = np.median(coverage)
    std_coverage = np.std(coverage)
    # A single call computes all three quantiles over the column.
    quantile_1, quantile_2, quantile_3 = np.quantile(coverage, [0.25, 0.50, 0.75])
    max_coverage = np.max(coverage)
    min_coverage = np.min(coverage)

    print("BETA SUMMARY: \n Rows: {0} (each is a CpG site)".format(length))
    print(" Columns: [# of methylated sequences, # of sequences total] \n", beta)
    print(" \n STATS FOR SEQ COVERAGE BY CPG SITE: \n   Mean:", mean_coverage)
    print("   Median:", median_coverage)
    print("   Standard deviation:", std_coverage)
    print("   25th, 50th, and 75th percentiles: {0}, {1}, {2}.".format(quantile_1, quantile_2, quantile_3))
    # The dtype name is "uint8"; the message previously misspelled it.
    print("   Maximum: {} (limited to 255 by uint8 format of beta file)".format(max_coverage))
    print("   Minimum:", min_coverage)
    print("\n")

    return length, mean_coverage, median_coverage, std_coverage, quantile_1, quantile_2, quantile_3, max_coverage, min_coverage
    

In [5]:
# Summarise each sample's array; calc_beta_info also prints its report as it runs.
file_info = {
    sample: calc_beta_info(counts)
    for sample, counts in (
        ('cancer', cancer_beta),
        ('control', control_beta),
        ('cancerM', cancerM_beta),
    )
}

# One row per sample, one column per value in the tuple returned above.
beta_info_df = pd.DataFrame.from_dict(
    file_info,
    orient='index',
    columns=[
        'length', 'mean_cov', 'median_cov', 'std_cov',
        'quant_1', 'quant_2', 'quant_3', 'max_cov', 'min_cov',
    ],
)

BETA SUMMARY: 
 Rows: 29152891 (each is a CpG site)
 Columns: [# of methylated sequences, # of sequences total] 
 [[ 50  92]
 [ 55  71]
 [ 61  71]
 ...
 [  0 255]
 [  0 255]
 [  0   0]]
 
 STATS FOR SEQ COVERAGE BY CPG SITE: 
   Mean: 42.06582606164171
   Median: 41.0
   Standard deviation: 26.127430377592113
   25th, 50th, and 75th percentiles: 23.0, 41.0, 59.0.
   Maximum: 255 (limited to 255 by unit8 format of beta file)
   Minimum: 0


BETA SUMMARY: 
 Rows: 29152891 (each is a CpG site)
 Columns: [# of methylated sequences, # of sequences total] 
 [[ 53 102]
 [ 56  75]
 [ 67  73]
 ...
 [  0 255]
 [  1 255]
 [  0   0]]
 
 STATS FOR SEQ COVERAGE BY CPG SITE: 
   Mean: 42.95211140466309
   Median: 38.0
   Standard deviation: 33.71693101392165
   25th, 50th, and 75th percentiles: 15.0, 38.0, 65.0.
   Maximum: 255 (limited to 255 by unit8 format of beta file)
   Minimum: 0


BETA SUMMARY: 
 Rows: 29152891 (each is a CpG site)
 Columns: [# of methylated sequences, # of sequences total] 


In [6]:
# proportion methylated = methylated sequences / total sequences, per row
def prop_meth(beta_df):
    """Add a 'prop_meth' column to ``beta_df`` and return the same frame.

    The new column is 'num_meth' divided element-wise by 'total_reads'.
    The frame passed in is modified in place (no copy is made), and rows
    whose 'total_reads' is 0 are not special-cased here.
    """
    methylated = beta_df['num_meth']
    total = beta_df['total_reads']
    beta_df['prop_meth'] = methylated / total
    return beta_df


In [51]:
# Run prop_meth on each sample's frame (cancer, control, cancerM, in that order)
# and rebind the names to whatever it returns.
cancer_beta_df, control_beta_df, cancerM_beta_df = (
    prop_meth(frame)
    for frame in (cancer_beta_df, control_beta_df, cancerM_beta_df)
)

In [55]:
# Place the three per-sample frames side by side; the keys become the outer
# level of the column labels so each sample's columns stay distinguishable.
beta_df = pd.concat(
    [cancer_beta_df, control_beta_df, cancerM_beta_df],
    keys=['cancer', 'control', 'cancerM'],
    axis='columns',
)

beta_df.head()


Unnamed: 0_level_0,cancer,cancer,cancer,control,control,control,cancerM,cancerM,cancerM
Unnamed: 0_level_1,num_meth,total_reads,prop_meth,num_meth,total_reads,prop_meth,num_meth,total_reads,prop_meth
0,50,92,0.543478,53,102,0.519608,47,87,0.54023
1,55,71,0.774648,56,75,0.746667,58,69,0.84058
2,61,71,0.859155,67,73,0.917808,60,70,0.857143
3,68,72,0.944444,66,72,0.916667,65,72,0.902778
4,62,73,0.849315,58,68,0.852941,63,71,0.887324
