In [1]:
import gzip
from pathlib import Path

import pandas as pd

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

Read LOH positions

In [2]:
# The column names live one per line in a sidecar text file; strip the
# trailing newline from each and keep only the first 18 entries.
with open("../mmml_onek1k_all_patients_columns.txt", mode="rt") as columns_file:
    columns_list = [line.rstrip("\n") for line in columns_file][:18]

# The second column is referred to as "pos" throughout the rest of the notebook.
columns_list[1] = "pos"
columns_list

['#CHROM',
 'pos',
 'position_dummy',
 'REF',
 'ALT',
 'INFO',
 'sample_control',
 'sample_tumor',
 'start',
 'end',
 'genotype',
 'TCN',
 'PID',
 'normal_genotype',
 'tumor_genotype',
 'quality_score',
 'reads_normal',
 'reads_tumor']

In [3]:
# Load the gzip-compressed, tab-separated LOH table. The file has no header
# row, so the column names prepared above are supplied explicitly.
df = pd.read_csv(
    "../LOH_positions_mmml.vcf.gz",
    sep="\t",
    header=None,
    names=columns_list,
    compression="gzip",
    low_memory=False,
)
df.shape[0]

28270385

In [4]:
# Preview the first five records of the loaded table.
df.head(n=5)

Unnamed: 0,#CHROM,pos,position_dummy,REF,ALT,INFO,sample_control,sample_tumor,start,end,genotype,TCN,PID,normal_genotype,tumor_genotype,quality_score,reads_normal,reads_tumor
0,9,21353407,21353408,G,T,BRF=0.15;FR=0.4997;HP=20;HapScore=1;MGOF=48;MM...,"0/1:-3.69,0,-93.84:30:37:34:5","0/1:-3.12,0,-74.9:48:31:29:4",21341573,21731170,1:0,1.0,4100049,11,10,95.745711,34.0,29
1,9,21361365,21361366,T,A,BRF=0.16;FR=0.5;HP=2;HapScore=3;MGOF=43;MMLQ=3...,"1/0:-34.53,0,-35.16:38:99:49:10","1/0:-5.09,0,-30.15:43:51:27:2",21341573,21731170,1:0,1.0,4100049,11,10,99.0,49.0,27
2,9,21361367,21361368,A,T,BRF=0.16;FR=0.5;HP=2;HapScore=3;MGOF=43;MMLQ=3...,"1/0:-26.24,0,-48.75:38:99:49:8","1/0:-5.09,0,-39.67:43:51:27:1",21341573,21731170,1:0,1.0,4100049,11,10,99.0,49.0,27
3,9,21376039,21376040,T,C,BRF=0.09;FR=0.5;HP=2;HapScore=1;MGOF=7;MMLQ=42...,"1/0:-77.69,0,-82.49:6:99:48:23","1/0:-61.18,0,-6.38:7:64:24:20",21341573,21731170,1:0,1.0,4100049,11,1,72.949113,48.0,24
4,9,21395908,21395909,T,C,BRF=0.34;FR=0.5;HP=4;HapScore=2;MGOF=14;MMLQ=3...,"1/0:-61.8,0,-61.4:7:99:37:18","1/0:-22.98,0,-54.38:14:99:27:10",21341573,21731170,1:0,1.0,4100049,11,10,31.915237,37.0,27


In [5]:
# Remove the "position_dummy" column; it is not used in the rest of the notebook.
df = df.drop(columns="position_dummy")

Analyze LOH positions

In [6]:
# Build a "<chromosome>-<position>" string key for every record.
df["position_code"] = df["#CHROM"].astype(str).str.cat(df["pos"].astype(str), sep="-")

In [7]:
# Number of distinct chromosome-position keys in the table.
no_distinct_positions = df["position_code"].nunique(dropna=True)
no_distinct_positions

7636988

In [8]:
# Number of distinct chromosomes present in the table.
df["#CHROM"].nunique(dropna=True)

22

In [9]:
# Number of distinct PID values in the table.
no_distinct_patients = df["PID"].nunique(dropna=True)
no_distinct_patients

242

Merge position count

In [10]:
# Read the position -> count mapping from JSON as a Series, then reshape it
# into a two-column frame: the Series index becomes "pos", the values become
# "position_count".
# NOTE(review): dtype={str} is a set literal, whereas pandas documents the
# read_json dtype argument as a bool or a dict — confirm what was intended.
ser = pd.read_json(
    "../LOH_position_count_mmml_all_patients_extra_columns.json",
    typ="series",
    dtype={str},
)
df_position_count = ser.to_frame("position_count").reset_index(names="pos")
df_position_count.shape

(7636988, 2)

In [11]:
# Preview the first five rows of the position-count lookup table.
df_position_count.iloc[:5]

Unnamed: 0,pos,position_count
0,9-21353407,2
1,9-21361365,16
2,9-21361367,8
3,9-21376039,1
4,9-21395908,1


In [12]:
# Attach the per-position count to every LOH record.
#
# Both frames contain a column named "pos": in `df` it is the numeric
# coordinate, in `df_position_count` it is the "<chrom>-<pos>" string key.
# Joining with left_on="position_code" / right_on="pos" makes pandas suffix the
# clashing names to "pos_x" / "pos_y" and keeps a redundant copy of the key in
# the result. Renaming the lookup key to "position_code" and joining on that
# single shared column keeps "pos" unsuffixed and avoids the duplicate column.
df = df.merge(
    df_position_count.rename(columns={"pos": "position_code"}),
    how="inner",
    on="position_code",
)
# The inner join keeps only records that have a count; the displayed length
# shows whether any records were dropped.
len(df)

28270385

Set threshold

In [25]:
# Keep only the records whose position_count is at least `threshold`
# (the comparison is inclusive).
threshold = 20
df_thresholded = df.loc[df["position_count"].ge(threshold)]
df_thresholded.shape[0]

3610841

Analyze thresholded dataframe

In [26]:
# Number of distinct PID values remaining after thresholding.
df_thresholded["PID"].nunique(dropna=True)

237

In [27]:
# Number of distinct chromosomes remaining after thresholding.
df_thresholded["#CHROM"].nunique(dropna=True)

12

In [28]:
# The distinct chromosomes remaining after thresholding, in order of first appearance.
pd.unique(df_thresholded["#CHROM"])

array([ 9, 14,  1,  6, 16, 17, 22,  2, 10, 15,  8, 19])

In [29]:
# Number of distinct chromosome-position keys remaining after thresholding.
df_thresholded["position_code"].nunique(dropna=True)

145064

Write to gzipped TSV

In [30]:
# Write the thresholded records to a gzip-compressed, tab-separated file whose
# name encodes the threshold that was applied.
output_path = Path(f"../LOH_pos_mmml_thresholded/LOH_positions_threshold_{threshold}.tsv.gz")

# to_csv does not create missing directories; make sure the target directory
# exists so the export cannot fail after all the preceding steps have run.
output_path.parent.mkdir(parents=True, exist_ok=True)

df_thresholded.to_csv(output_path, sep="\t", header=True, index=False, compression="gzip")