In [1]:
import pandas as pd
import numpy as np
from SigProfilerExtractor import sigpro as sig

In [2]:
def classify_dn_ds(vclass):
    """Label a variant classification for the dN/dS analysis.

    Parameters
    ----------
    vclass : value compared against the string 'Silent'.

    Returns
    -------
    str
        'synonymous' when ``vclass`` equals 'Silent'; 'nonsynonymous' for
        every other value (including missing values, which never equal
        'Silent').
    """
    return 'synonymous' if vclass == 'Silent' else 'nonsynonymous'

def get_substitution(ref, alt):
    """Return the pyrimidine-normalised substitution label for a SNV.

    The label has the form ``"<ref>><alt>"``. When the reference base is a
    purine (A or G) both alleles are complemented so that the reported
    reference base is always C or T.

    Parameters
    ----------
    ref : reference allele; expected to be one of 'A', 'C', 'G', 'T'.
    alt : alternate (tumour) allele; expected to be one of 'A', 'C', 'G', 'T'.

    Returns
    -------
    str or None
        One of the six pyrimidine-centred labels (e.g. 'C>T'), or ``None``
        when either allele is not a single upper-case A/C/G/T base or when
        ``ref`` equals ``alt`` (which is not a substitution).
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    # Anything that is not a plain base (indel alleles, 'N', '-', NaN, ...)
    # cannot be placed into one of the six substitution classes.
    if ref not in complement or alt not in complement:
        return None

    # Identical alleles would otherwise yield labels such as 'C>C', which
    # would show up downstream as a spurious extra mutation class.
    if ref == alt:
        return None

    if ref in ('C', 'T'):
        return ref + ">" + alt

    # Purine reference: report the change as seen on the opposite strand.
    return complement[ref] + ">" + complement[alt]


In [3]:
# Load the cleaned TCGA HNSC mutation table (tab-delimited text).
# NOTE(review): absolute, user-specific path — the notebook only runs where this location exists.
mutations = pd.read_csv(
    '/cellar/users/dhalmos/class/BNFO285/BNFO285_Projects/project_2/project_2_data/TCGA_HNSC_mutations_cleaned.txt',
    sep='\t',
)

In [4]:
# Keep single-nucleotide variants only. The explicit .copy() detaches the
# filtered frame from its parent so the column assignments below operate on an
# independent object (no SettingWithCopyWarning, no ambiguity about views).
mutations = mutations[mutations['Variant_Type'] == 'SNP'].copy()

# 7. additional columns on mutation type and trinucleotide context
# dn/ds mutation
mutations['_dn_ds_mutation_type'] = mutations['Variant_Classification'].apply(classify_dn_ds)

# trinucleotide: CONTEXT characters 4..6 as given (reference strand)
# NOTE(review): assumes the mutated base sits at CONTEXT index 5 — confirm against the MAF producer.
mutations['_trinucleotide'] = mutations['CONTEXT'].str.slice(4, 7)

# 16 type context: 5' and 3' neighbours as given in CONTEXT (reference strand)
five_prime = mutations['CONTEXT'].str[4]
three_prime = mutations['CONTEXT'].str[6]
mutations['_16_type_context'] = five_prime + '_' + three_prime

# pyrimidine-normalised substitution (None when the alleles are not plain bases)
mutations['_substitution'] = [
    get_substitution(ref, alt)
    for ref, alt in zip(mutations['Reference_Allele'], mutations['Tumor_Seq_Allele2'])
]

# 96 class: 5'[ref>alt]3' expressed on the pyrimidine strand.
# get_substitution complements the alleles when the reference base is a purine,
# so the flanking bases must be moved to the same strand: reading the opposite
# strand 5'->3' swaps the two neighbours and complements each of them
# (e.g. reference context C-G-T with G>A is A[C>T]G, not C[C>T]T).
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
purine_ref = mutations['Reference_Allele'].isin(['A', 'G'])
# Flanks that are not A/C/G/T map to NaN here, which makes the class NaN for
# those purine-reference rows.
left_flank = five_prime.mask(purine_ref, three_prime.map(complement))
right_flank = three_prime.mask(purine_ref, five_prime.map(complement))
mutations['_96_class'] = left_flank + '[' + mutations['_substitution'] + ']' + right_flank

In [5]:
# Count mutations per (96-class, patient) pair, pivot patients into columns,
# and fill class/patient combinations that never occur with 0.
mutation_matrix = (
    mutations[['patient_id', '_96_class']]
    .groupby(['_96_class', 'patient_id'])
    .size()
    .unstack()
    .fillna(0)
)

In [6]:
# Persist the class-by-patient count matrix. The file is tab-delimited even
# though it carries a .csv extension.
mutation_matrix.to_csv(path_or_buf='./mutation_matrix.csv', sep='\t')

In [7]:
# NOTE(review): presumably the path to the example matrix shipped with
# SigProfilerExtractor; it is not used by the call below — confirm it is needed.
path_to_example_table = sig.importdata("matrix")

# Run the extraction on the matrix written by the previous cell, scanning
# solutions with 2 through 10 signatures; results go under "example_output".
data = './mutation_matrix.csv'
sig.sigProfilerExtractor(
    "matrix",
    "example_output",
    data,
    opportunity_genome="GRCh37",
    gpu=True,
    exome=True,
    minimum_signatures=2,
    maximum_signatures=10,
)


************** Reported Current Memory Use: 0.79 GB *****************

Extracting signature 2 for mutation type 96
The matrix normalizing cutoff is 9600


process 2 continues please wait... 
execution time: 41 seconds 

process 2 continues please wait... 
execution time: 35 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 41 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 35 seconds 

process 2 continues please wait... 
execution time: 41 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 24 seconds 

process 2 continues please wait... 
execution time: 35 seconds 

process 2 continues please wait... 
execution time: 29 seconds 
