In [None]:
import gzip
from pathlib import Path

import pandas as pd

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

Read LOH positions

In [None]:
# Load the column names for the LOH table: one name per line of the text file,
# with the trailing newline removed from each.
with open("../mmml_onek1k_all_patients_columns.txt", mode="rt") as columns_file:
    columns_list = [line.rstrip("\n") for line in columns_file]

# The second column is renamed to "pos"; only the first 18 names are kept
# (presumably the number of columns in the LOH file — TODO confirm).
columns_list[1] = "pos"
del columns_list[18:]
columns_list

In [None]:
# Parse the gzip-compressed, tab-separated LOH table. header=None makes pandas
# treat the first line as data, so the column names come from columns_list.
df = pd.read_csv(
    "../LOH_positions_mmml.vcf.gz",
    sep="\t",
    header=None,
    names=columns_list,
    compression="gzip",
    low_memory=False,
)
len(df)

In [None]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

In [None]:
# Remove the "position_dummy" column. Because this cell reassigns `df`,
# re-running it would raise KeyError once the column is already gone;
# errors="ignore" makes the cell safe to execute more than once.
df = df.drop(columns=["position_dummy"], errors="ignore")

Analyze LOH positions

In [None]:
# Build a "<chromosome>-<position>" key for every row by joining the two
# columns as text.
chrom_text = df["#CHROM"].astype(str)
pos_text = df["pos"].astype(str)
df["position_code"] = chrom_text + "-" + pos_text

In [None]:
# Number of distinct chromosome-position keys in the LOH table
# (missing values are not counted).
no_distinct_positions = df["position_code"].nunique(dropna=True)
no_distinct_positions

In [None]:
# Number of distinct values in the #CHROM column (missing values are not counted).
df["#CHROM"].nunique(dropna=True)

In [None]:
# Number of distinct PID values in the LOH table (missing values are not counted).
no_distinct_patients = df["PID"].nunique(dropna=True)
no_distinct_patients

Merge position count

In [None]:
# Load the per-position counts as a Series (presumably a JSON object mapping
# position code -> count; verify against the file).
#
# The previous call passed dtype={str}. That is a *set* literal, which is not a
# dtype value read_json documents (it expects a bool or a dict), so it has been
# removed. The counts must stay numeric for the threshold comparison later on.
# convert_axes=False keeps the JSON keys exactly as written instead of letting
# pandas try to coerce them to numbers or dates — presumably what the `str`
# hint was meant to achieve.
ser = pd.read_json(
    "../LOH_position_count_mmml_all_patients_extra_columns.json",
    typ="series",
    convert_axes=False,
)

# Turn the Series into a two-column frame: the former index becomes "pos",
# the values become "position_count".
df_position_count = ser.to_frame("position_count")
df_position_count = df_position_count.reset_index(names="pos")
df_position_count.shape

In [None]:
# Preview the first five position/count pairs.
df_position_count.head(n=5)

In [None]:
# Attach the per-position count to every LOH row. The inner join keeps only
# rows whose position_code has a matching "pos" entry in df_position_count.
# NOTE(review): both frames carry a column named "pos", so pandas should
# suffix them in the result ("pos_x" from the LOH table, "pos_y" from the
# count table) — confirm downstream consumers expect these names.
df = pd.merge(
    df,
    df_position_count,
    how="inner",
    left_on="position_code",
    right_on="pos",
)
len(df)

Set threshold

In [None]:
# Minimum position_count a row needs in order to be kept (inclusive).
threshold = 10
keep_mask = df["position_count"].ge(threshold)
df_thresholded = df.loc[keep_mask]
len(df_thresholded)

Analyze thresholded dataframe

In [None]:
# Number of distinct PID values remaining after thresholding.
df_thresholded["PID"].nunique(dropna=True)

In [None]:
# Number of distinct #CHROM values remaining after thresholding.
df_thresholded["#CHROM"].nunique(dropna=True)

In [None]:
# The distinct #CHROM values themselves, in order of first appearance.
df_thresholded.loc[:, "#CHROM"].unique()

In [None]:
# Number of distinct chromosome-position keys remaining after thresholding.
df_thresholded["position_code"].nunique(dropna=True)

Write thresholded positions to a gzipped TSV

In [None]:
# Write the thresholded rows as a gzip-compressed, tab-separated file; the
# threshold value is encoded in the file name.
output_path = Path(f"../LOH_pos_mmml_thresholded/LOH_positions_threshold_{threshold}.tsv.gz")

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

df_thresholded.to_csv(output_path, sep="\t", header=True, index=False, compression="gzip")