# Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# !pip install pyensembl
from pyensembl import EnsemblRelease
from scipy.stats import poisson
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt

# Load Files

In [2]:
# Read the two tab-separated input tables from the shared data directory
DATA_PATH = Path('../../Data')
metadata_file = DATA_PATH / 'processed/processed_metadata.txt'
mutations_file = DATA_PATH / 'Original/TCGA.HNSC.mutations.txt'

# Every missing cell in either table is replaced by the literal string "NaN".
# NOTE(review): filling the whole frame with a string turns any numeric column
# that has gaps into object dtype — confirm the numeric columns used later
# (e.g. n_depth) contain no missing values.
metadata_processed = pd.read_csv(metadata_file, sep='\t').fillna("NaN")
mutations = pd.read_csv(mutations_file, sep='\t').fillna("NaN")

# Patient list

In [3]:
# Patient IDs whose tumor_site is one of the selected sites.
# NOTE(review): the labels are mixed-case ("Larynx" vs "mouth"/"tongue") and
# isin() is case-sensitive — confirm they match the values stored in tumor_site.
selected_sites = ["mouth","Larynx","tongue"]
site_mask = metadata_processed['tumor_site'].isin(selected_sites)
patients = metadata_processed.loc[site_mask, 'patient_id'].tolist()

In [4]:
# Persist the selected patient IDs to patients.txt, one ID per line
with open("patients.txt", "w") as f:
    for patient in patients:
        print(patient, file=f)

# Mutation processing

In [5]:
def classify_dn_ds(vclass):
    """Label a variant classification as synonymous or nonsynonymous.

    Args:
        vclass: Variant classification value; it is compared against the
            string 'Silent' (case-sensitive).

    Returns:
        'synonymous' when ``vclass`` equals 'Silent', otherwise
        'nonsynonymous'. No distinction is made between any other classes,
        so every non-'Silent' value receives the 'nonsynonymous' label.
    """
    return 'synonymous' if vclass == 'Silent' else 'nonsynonymous'

In [6]:
# 1. filter to pass: keep high-quality mutations only
# .copy() makes the filtered table independent of the loaded frame, so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
mutations = mutations[mutations['FILTER'] == 'PASS'].copy()

# 2. remove deprecated variant callers and missing callers
# Series.replace(to_replace=..., value=...) without regex=True only replaces
# cells whose WHOLE value equals the pattern, so it never strips 'RADIA' or
# 'SOMATICSNIPER' out of a '|'-separated caller list. The substring-level
# .str.replace is used instead; regex=False keeps '*' a literal character.
callers_clean = mutations['CALLERS']
for unwanted in ('*', 'RADIA', 'SOMATICSNIPER'):
    callers_clean = callers_clean.str.replace(unwanted, '', regex=False)
# A removed caller leaves adjacent or leading/trailing '|' behind. Collapse each
# run to a single separator (replacing it with '' would fuse the neighbouring
# caller names) and trim separators at the edges so they are not counted.
callers_clean = callers_clean.str.replace(r'\|{2,}', '|', regex=True).str.strip('|')
mutations['CALLERS'] = callers_clean
# count the number of '|' in the cleaned CALLERS string and store it in a new
# column 'CALLERS_COUNT_ADJUSTED' (separators = remaining callers - 1)
mutations['CALLERS_COUNT_ADJUSTED'] = mutations['CALLERS'].str.count(r'\|')

# 3. Filter to consensus mutations
# NOTE(review): more than one separator means at least three remaining callers;
# confirm this is the intended consensus threshold (vs. at least two callers).
mutations = mutations[mutations['CALLERS_COUNT_ADJUSTED'] > 1]

# 4. Keep mutations whose n_depth is at least the minimum depth
MIN_N_DEPTH = 10
has_enough_depth = mutations['n_depth'] >= MIN_N_DEPTH
mutations = mutations.loc[has_enough_depth]

# 5. Drop hypermutators: patients with more than HYPERMUTATOR_CUTOFF mutations
# (counted on the table as filtered so far)
HYPERMUTATOR_CUTOFF = 1000
mutations_per_patient = mutations['patient_id'].value_counts()
hypermutators = mutations_per_patient[mutations_per_patient > HYPERMUTATOR_CUTOFF].index
from_hypermutator = mutations['patient_id'].isin(hypermutators)
mutations = mutations.loc[~from_hypermutator]

# 6. Only SNP mutations are analysed
is_snp = mutations['Variant_Type'] == 'SNP'
mutations = mutations.loc[is_snp]

# 7. additional columns on mutation type and trinucleotide context
context = mutations['CONTEXT']

# flanking bases: characters at positions 4 and 6 of CONTEXT, joined by '_'
flank_pair = context.str[4] + '_' + context.str[6]

# substitution written as "<reference>-><tumor allele 2>" (vectorised; an
# earlier row-wise get_substitution(...) call was replaced by this).
# NOTE(review): the alleles are used exactly as stored, so complementary changes
# are not folded together — confirm whether the "6 substitution" / 96-class
# labels are expected to be strand-collapsed.
base_change = mutations['Reference_Allele'] + "->" + mutations['Tumor_Seq_Allele2']

# assign() returns a new frame with the columns appended in the order listed
mutations = mutations.assign(
    # dn/ds mutation label derived from Variant_Classification
    _dn_ds_mutation_type=mutations['Variant_Classification'].apply(classify_dn_ds),
    # trinucleotide: characters 4..6 of CONTEXT
    _trinucleotide=context.str.slice(4, 7),
    # 16 type context
    _16_type_context=flank_pair,
    _substitution=base_change,
    # context and substitution combined into one class label
    _96_class=flank_pair + '__' + base_change,
)

# Variant classifications removed from the analysis set: intronic, untranslated
# (UTR), RNA and flanking regions
excluded_classes = ["Intron", "3'UTR", "5'UTR", "RNA", "3'Flank", "5'Flank"]

# Keep rows that are labelled nonsynonymous, are not in an excluded class,
# and belong to one of the selected patients
is_nonsynonymous = mutations['_dn_ds_mutation_type'] == "nonsynonymous"
in_excluded_class = mutations['Variant_Classification'].isin(excluded_classes)
from_selected_patient = mutations['patient_id'].isin(patients)
mutations = mutations.loc[is_nonsynonymous & ~in_excluded_class & from_selected_patient]

# Save the filtered table, keeping the row index as the first CSV column
mutations.to_csv("mutations_filtered.csv", index=True)

# Identify high-impact mutations based on PolyPhen score

In [7]:
# Step 1: pull the number enclosed in parentheses out of the PolyPhen annotation;
# rows without a parenthesised number get a missing score
polyphen_raw = mutations['PolyPhen'].str.extract(r"\(([\d.]+)\)", expand=False)
mutations = mutations.assign(polyphen_score=polyphen_raw.astype(float))

# Step 2: keep only mutations whose score is strictly above 0.85
# (rows with a missing score fail the comparison and are left out)
is_high_impact = mutations['polyphen_score'] > 0.85
high_impact = mutations.loc[is_high_impact]