In [3]:
import gzip

import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

Read LOH positions

In [4]:
# Load the gzip-compressed, tab-separated input table into `df`,
# then display the number of rows that were read.
df = pd.read_csv(
    "../intersect_mmml_onek1k_with_header_adapted.vcf.gz",
    delimiter="\t",
    compression="gzip",
    low_memory=False,
)
df.shape[0]

18889487

In [5]:
# Remove the listed columns from the working frame.
# Note: `df` is rebound here, so re-running this cell without reloading raises a KeyError.
df = df.drop(
    labels=["Unnamed: 29", "position_dummy", "#CHR", "POS", "POS_DUMMY"],
    axis="columns",
)

In [6]:
# Build a "<#CHROM>-<pos>" string key for every row.
df["position_code"] = df["#CHROM"].astype(str).str.cat(df["pos"].astype(str), sep="-")

In [7]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,#CHROM,pos,REF,ALT,INFO,sample_control,sample_tumor,start,end,genotype,TCN,PID,normal_genotype,tumor_genotype,quality_score,reads_normal,reads_tumor,CELL_TYPE,RSID,GENE,GENE_ID,A1,A2,A2_FREQ_ONEK1K,A2_FREQ_HRC,position_code
0,9,21448448,G,A,BRF=0.13;FR=0.5;HP=1;HapScore=2;MGOF=18;MMLQ=3...,"1/0:-38.46,0,-91.36:8:99:38:13","1/0:-32.21,0,-45.31:18:99:27:11",21341573,21731170,1:0,1.0,4100049,11,10,22.796598,38,27,"Naïve/Immature B Cell,Memory B Cell,CD4 Effect...",rs7864960,"{'PTPLAD2', 'RP11-70L8.4', 'RP11-408N14.1'}","{'ENSG00000188921', 'ENSG00000265194', 'ENSG00...",G,A,0.49054,0.493363,9-21448448
1,9,21456776,G,A,BRF=0.18;FR=0.5;HP=2;HapScore=1;MGOF=9;MMLQ=37...,"1/0:-69.42,0,-74.92:5:99:46:23","1/0:-9.51,0,-59.71:9:95:23:4",21341573,21731170,1:0,1.0,4100049,11,10,68.389793,46,23,"Naïve/Immature B Cell,Memory B Cell,CD4 Effect...",rs10491569,"{'PTPLAD2', 'RP11-70L8.4', 'RP11-408N14.1'}","{'ENSG00000188921', 'ENSG00000265194', 'ENSG00...",G,A,0.28734,0.273914,9-21456776
2,9,21471499,C,T,BRF=0.1;FR=0.5;HP=2;HapScore=2;MGOF=11;MMLQ=37...,"0/1:-71.26,0,-46.26:11:99:35:20","0/1:-17.58,0,-59.78:11:99:25:5",21341573,21731170,1:0,1.0,4100049,11,10,68.389793,35,25,"Naïve/Immature B Cell,Memory B Cell,CD4 Effect...",rs1412395,"{'PTPLAD2', 'RP11-70L8.4', 'RP11-408N14.1'}","{'ENSG00000188921', 'ENSG00000265194', 'ENSG00...",C,T,0.60961,0.612796,9-21471499
3,9,21471726,C,T,BRF=0.17;FR=0.5;HP=1;HapScore=2;MGOF=12;MMLQ=4...,"0/1:-72.59,0,-64.49:7:99:42:22","0/1:-10.98,0,-55.68:12:99:24:6",21341573,21731170,1:0,1.0,4100049,11,10,54.711835,42,24,"Naïve/Immature B Cell,Memory B Cell,CD4 Effect...",rs2383192,"{'PTPLAD2', 'RP11-70L8.4', 'RP11-408N14.1'}","{'ENSG00000188921', 'ENSG00000265194', 'ENSG00...",C,T,0.53553,0.531583,9-21471726
4,9,21473817,A,G,BRF=0.14;FR=0.3913;HP=4;HapScore=1;MGOF=12;MML...,"0/1:-85.27,0,-104.07:12:99:63:28","0/1:-0.31,0,-95.51:3:5:29:2",21341573,21731170,1:0,1.0,4100049,11,10,99.0,63,29,"Naïve/Immature B Cell,Memory B Cell,CD4 Effect...",rs10965014,"{'PTPLAD2', 'RP11-70L8.4', 'RP11-408N14.1'}","{'ENSG00000188921', 'ENSG00000265194', 'ENSG00...",A,G,0.6105,0.60425,9-21473817


Analyze LOH positions

In [8]:
# Number of distinct position keys (missing values are not counted).
no_distinct_positions = df["position_code"].nunique(dropna=True)
no_distinct_positions

4115418

In [9]:
# Number of distinct values in the #CHROM column.
df.loc[:, "#CHROM"].nunique()

22

In [10]:
# Number of distinct PID values (missing values are not counted).
no_distinct_patients = df["PID"].nunique(dropna=True)
no_distinct_patients

233

In [11]:
# Count distinct (PID, position_code) pairs: total rows minus the rows that
# repeat a pair already seen earlier in the frame.
unique_combinations = len(df) - int(
    df.duplicated(subset=["PID", "position_code"], keep="first").sum()
)
unique_combinations

18672344

Merge position count

In [12]:
# Read the position -> count mapping from JSON as a Series, then turn it into a
# two-column frame: the Series index becomes "pos", the values "position_count".
#
# dtype=False tells read_json not to coerce the parsed values. A set literal such
# as {str} is not a documented dtype form (read_json accepts a bool or a dict);
# pandas cannot apply it and silently leaves the data unconverted. The counts must
# stay numeric because they are compared against an integer threshold later on.
ser = pd.read_json(
    "../intersect_position_count_adapted.json",
    typ="series",
    dtype=False,
)
df_position_count = ser.to_frame("position_count").reset_index(names="pos")
df_position_count.shape

(4115418, 2)

In [13]:
# Preview the first five rows of the position-count table.
df_position_count.head(n=5)

Unnamed: 0,pos,position_count
0,9-21448448,12
1,9-21456776,13
2,9-21471499,15
3,9-21471726,17
4,9-21473817,14


In [14]:
# Attach the per-position count to every row by matching df["position_code"]
# against df_position_count["pos"] (inner join: rows without a match are dropped).
#
# validate="many_to_one" makes pandas raise a MergeError if a key appears more than
# once in df_position_count, instead of silently multiplying rows in `df`.
#
# NOTE(review): both frames contain a column named "pos" (an integer position on the
# left, the join key on the right). Because the join uses left_on/right_on, pandas
# keeps both and applies its default suffixes, so the merged frame carries "pos_x"
# and "pos_y" rather than "pos". Column names are left as-is here because files
# written from this frame may already be consumed with those names.
df = df.merge(
    df_position_count,
    how="inner",
    left_on="position_code",
    right_on="pos",
    validate="many_to_one",
)
len(df)

18889487

Set threshold

In [27]:
# Keep only the rows whose position_count is at least `threshold`,
# then display how many rows remain.
threshold = 20
df_thresholded = df.loc[df["position_count"].ge(threshold)]
df_thresholded.shape[0]

2550487

Analyze thresholded dataframe

In [28]:
# Number of distinct PID values remaining after thresholding.
df_thresholded.loc[:, "PID"].nunique()

224

In [29]:
# Number of distinct #CHROM values remaining after thresholding.
df_thresholded.loc[:, "#CHROM"].nunique()

15

In [30]:
# The distinct #CHROM values remaining after thresholding, in order of first appearance.
df_thresholded.loc[:, "#CHROM"].unique()

array([ 9, 14,  3,  1,  6, 17, 22, 10, 13, 15, 16,  2, 18,  8, 19])

In [31]:
# Number of distinct position keys remaining after thresholding.
df_thresholded.loc[:, "position_code"].nunique()

105490

Write to gzipped TSV

In [32]:
# Write the thresholded rows as a gzip-compressed, tab-separated file with a header
# row and without the DataFrame index; the threshold value is part of the file name.
df_thresholded.to_csv(
    f"../LOH_pos_thresholded_adapted/LOH_positions_threshold_{threshold}.tsv.gz",
    sep="\t",
    header=True,
    index=False,
    compression="gzip",
)