# using the PON coverage, calculate the coverage difference of tumor samples 

In [1]:
# get the code
import sys
sys.path.append('../code')
from coverage import get_coverage

### prepare the files

In [11]:
# set the paths
home = '/Users/mahtin'
home = '/Users/martinscience'
testdata = f"{home}/Dropbox/Icke/Work/somVar/testdata"
tooldata = f"{home}/Dropbox/Icke/Work/somVar/tooldata"
cnvdata = f"{tooldata}/myCNVdata"
shell_path = "../shell"
static_path = f"{home}/Dropbox/Icke/Work/static"
bed_path = f"{static_path}/bed_files/SureSelect/hg38"

In [45]:
cnvdata

'/Users/martinscience/Dropbox/Icke/Work/somVar/tooldata/myCNVdata'

### make the config

In [12]:
# shell paths
mawk_tools = ['bamCoverage', 'filterBed', 'rollingCoverage']
mawk_tool_dict = {tool:f"{shell_path}/{tool}.mawk" for tool in mawk_tools}
config = {
    'rollingWindowSize': 100,
    'bedfile': f"{bed_path}/SS_HAEv7_hg38_Covered.bed",
    'q': 20
}
# add tool shell paths to config
config.update(mawk_tool_dict)


In [4]:
ls f"{testdata}/bam"

ls: f/Users/martinscience/Dropbox/Icke/Work/somVar/testdata/bam: No such file or directory


In [85]:
bam_file = "003_A.bam"

chrom = "chr7"
bam_chrom_file = f"{testdata}/bam/" + bam_file.replace(".bam", f".{chrom}.bam")

bam_df = get_coverage(bam_chrom_file, chrom, config)

[1m$ samtools view -q 20 /Users/martinscience/Dropbox/Icke/Work/somVar/testdata/bam/003_A.chr7.bam chr7 | ../shell/bamCoverage.mawk | ../shell/rollingCoverage.mawk 100 | ../shell/filterBed.mawk /Users/martinscience/Dropbox/Icke/Work/static/bed_files/SureSelect/hg38/SS_HAEv7_hg38_Covered.bed chr7 1[0m


In [87]:
cov_df = bam_df.copy()

In [88]:
cov_df['normCov'] = cov_df['Coverage'] / cov_df['Coverage'].mean() * 250
cov_df = cov_df.iloc[:,:5]
cov_df

Unnamed: 0,Chr,Pos,Coverage,ExonPos,normCov
0,chr7,35350,24.26,967,9.991828
1,chr7,35400,22.36,1017,9.209286
2,chr7,35450,12.33,1067,5.078287
3,chr7,36200,1.58,1112,0.650746
4,chr7,36250,4.58,1162,1.886338
...,...,...,...,...,...
46496,chr7,159144650,473.44,2423334,194.993032
46497,chr7,159144700,357.99,2423384,147.443299
46498,chr7,159144750,271.64,2423434,111.878817
46499,chr7,159144800,212.69,2423484,87.599417


In [89]:
chrom = "chr7"
chrom_cov_file = f"{cnvdata}/chromCov/{chrom}.bedCov"
chrom_cov_df = pd.read_csv(chrom_cov_file, sep='\t').rename(columns={'meanCov':'PONmeanCov', 'std': 'PONstd'})
chrom_cov_df

Unnamed: 0,Chr,Pos,ExonPos,PONmeanCov,PONstd
0,chr7,35350,967,7.714566,8.019915
1,chr7,35400,1017,11.643514,13.194251
2,chr7,35450,1067,12.112316,14.815315
3,chr7,36200,1112,7.552480,10.131597
4,chr7,36250,1162,6.778501,8.852529
...,...,...,...,...,...
47121,chr7,159144650,2423334,154.801458,64.090454
47122,chr7,159144700,2423384,123.519995,50.822211
47123,chr7,159144750,2423434,102.960494,41.161186
47124,chr7,159144800,2423484,81.492084,33.819413


### merge cov_df with chrom_cov_df for quantification of stuff

In [90]:
cov_df = cov_df.merge(chrom_cov_df, on=['Chr', 'Pos', 'ExonPos'], how="outer").fillna(0)
cov_df

Unnamed: 0,Chr,Pos,Coverage,ExonPos,normCov,PONmeanCov,PONstd
0,chr7,35350,24.26,967,9.991828,7.714566,8.019915
1,chr7,35400,22.36,1017,9.209286,11.643514,13.194251
2,chr7,35450,12.33,1067,5.078287,12.112316,14.815315
3,chr7,36200,1.58,1112,0.650746,7.552480,10.131597
4,chr7,36250,4.58,1162,1.886338,6.778501,8.852529
...,...,...,...,...,...,...,...
47128,chr7,148383800,0.00,2178409,0.000000,1.056953,2.937661
47129,chr7,148383850,0.00,2178459,0.000000,0.476071,1.419659
47130,chr7,158316850,0.00,2405137,0.000000,0.722697,1.465440
47131,chr7,158316900,0.00,2405187,0.000000,0.591153,1.423600


In [91]:
cov_df['ploidy'] = cov_df['normCov'] / cov_df['PONmeanCov'] * 2
cov_df.query('ploidy > 100')

Unnamed: 0,Chr,Pos,Coverage,ExonPos,normCov,PONmeanCov,PONstd,ploidy
14637,chr7,63558600,0.28,734206,0.115322,0.0,0.0,inf
14638,chr7,63558650,1.27,734256,0.523068,0.0,0.0,inf
17583,chr7,74736650,0.28,890591,0.115322,0.0,0.0,inf
29288,chr7,102560750,0.1,1492453,0.041186,0.0,0.0,inf
29307,chr7,102567950,0.47,1493407,0.193576,0.0,0.0,inf
29447,chr7,102690450,0.93,1504604,0.383034,0.0,0.0,inf
39830,chr7,142038500,1.88,2030881,0.774305,0.009261,0.04722,167.224732
40138,chr7,142870450,0.06,2084479,0.024712,0.0,0.0,inf
41061,chr7,144183600,0.77,2149014,0.317136,0.003504,0.017865,181.034222


### filter out certain stuff before applying rolling average

In [98]:
select_df = cov_df.query('ploidy < 100')
select_df['rollingPloidy'] = select_df['ploidy'].rolling(500).mean()
select_df.to_csv(f"{cnvdata}/output/{bam_file}.diff.csv", sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  select_df['rollingPloidy'] = select_df['ploidy'].rolling(500).mean()
