# In [None]:
# %%
from cbio_py import cbio_mod as cb

# Fetch every study known to the cBioPortal API.
studies = cb.getAllStudies()
print(f"Total number of studies: {len(studies)}")

# Rank the studies by sample count, largest first, and list the five biggest.
sorted_samples = sorted(studies, key=lambda x: x['allSampleCount'], reverse=True)
print("Top 5 studies with the most samples:")
# A dedicated loop variable keeps the full `studies` list intact.
for study in sorted_samples[:5]:
    print(f"Study ID: {study['studyId']}, Sample Count: {study['allSampleCount']}, Cancer Type: {study['cancerTypeId']}")

# The label announces the total over all studies, so sum the whole list
# rather than only the five entries printed above.
print("Overall sample count across all studies:", sum(int(study['allSampleCount']) for study in sorted_samples))

# %%
# Keep the studies whose cancerTypeId is 'coad', 'read' or 'coadread',
# preserving the sample-count ordering of sorted_samples.
wanted_type_ids = ('coad', 'read', 'coadread')
col_studies = [entry for entry in sorted_samples if entry['cancerTypeId'] in wanted_type_ids]

for entry in col_studies:
    print(f"Study ID: {entry['studyId']}, Sample Count: {entry['allSampleCount']}, Cancer Type: {entry['cancerTypeId']}")

count = len(col_studies)
col_sample_total = sum(int(entry['allSampleCount']) for entry in col_studies)

print(f"Total number of studies with 'coad', 'read', or 'coadread' cancer types: {count}")
print(f"Total sample count for these studies: {col_sample_total}")

# %%
from bravado.client import SwaggerClient
# Build a bravado client from the cBioPortal Swagger spec with request,
# response and spec validation all switched off.
swagger_config = {
    "validate_requests": False,
    "validate_responses": False,
    "validate_swagger_spec": False,
}
cbioportal = SwaggerClient.from_url('https://www.cbioportal.org/api/v2/api-docs', config=swagger_config)

gene_mut = []
total_mutations = 0

# For each of the five largest studies, request the mutations of the
# "<studyId>_mutations" profile restricted to the "<studyId>_all" sample list.
for study in sorted_samples[:5]:
    study_id = study['studyId']
    request = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
        molecularProfileId=f"{study_id}_mutations",
        sampleListId=f"{study_id}_all",
    )
    sample_list = request.result()
    print(len(sample_list), "mutations found for study:", study_id)
    total_mutations += len(sample_list)
    gene_mut.extend(sample_list)

# Disabled alternative: run the same download over col_studies instead.
# for study in col_studies:
#     sample_list = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
#         molecularProfileId=f"{study['studyId']}_mutations",
#         sampleListId=f"{study['studyId']}_all"
#     ).result()
#     print(len(sample_list), "mutations found for study:", study['studyId'])
#     total_mutations += len(sample_list)
#     gene_mut.extend(sample_list)

print("Total mutations considered:", total_mutations)

# %%
# Index the gene records by their Entrez gene id for direct lookup.
gene_list = cb.getAllGenes(return_type='dict')
gene_dict = dict((entry['entrezGeneId'], entry) for entry in gene_list)

# %%
# Write one "<gene symbol>\t<sample id>" line per mutation and remember the
# Entrez ids of the genes that were written.
gene_set = set()
with open('../data/top_5_study_mutations.txt', 'w') as f:
    for mutation in gene_mut:
        gene_record = gene_dict.get(mutation.entrezGeneId)
        if gene_record is None:
            # Missing or unknown Entrez id: report the record and skip it
            # instead of failing on the symbol lookup.
            print(mutation)
            continue
        gene = gene_record['hugoGeneSymbol']
        sample_id = mutation.sampleId
        f.write(f"{gene}\t{sample_id}\n")
        gene_set.add(mutation.entrezGeneId)

# Write the symbol of every distinct mutated gene, one per line.
with open('../data/unique_genes_mutated.txt', 'w') as f:
    for gene in gene_set:
        f.write(f"{gene_dict[gene]['hugoGeneSymbol']}\n")

# %%
# create a dataframe of the mutations
import pandas as pd
# Load the tab-separated gene/sample pairs; the file has no header row, so
# the column names are supplied here.
mut_df = pd.read_csv('../data/top_5_study_mutations.txt', sep='\t', header=None, names=['Gene', 'Sample'])
mut_df

# %%
# Map every sample id to the set of gene symbols mutated in that sample.
# Walking the two columns in lockstep avoids building a row object for each
# field access, which is what positional row indexing inside a loop does.
gene_mut_dict = {}
for sample_id, gene_symbol in zip(mut_df['Sample'], mut_df['Gene']):
    gene_mut_dict.setdefault(sample_id, set()).add(gene_symbol)

gene_mut_dict

# %%
# Row labels are the sample ids; column labels are the sorted union of all
# gene symbols seen in any sample.
samples = list(gene_mut_dict)
genes = sorted(set().union(*gene_mut_dict.values()))

# Start from an all-zero sample-by-gene table ...
df = pd.DataFrame(0, index=samples, columns=genes)

# ... and flag with 1 every gene that is mutated in a given sample.
for sample_id, gene_symbols in gene_mut_dict.items():
    df.loc[sample_id, list(gene_symbols)] = 1

df

# %%
# data analysis
# top 20 genes with the most mutations
import matplotlib.pyplot as plt

# Column sums of the matrix, largest first, limited to the 20 highest genes.
top_genes = df.sum().sort_values(ascending=False).head(20)
total_samples = len(df)  # one matrix row per sample

# Horizontal bar chart of the per-gene counts.
ax = top_genes.plot(kind='barh', figsize=(10, 6))
ax.set_title('Top 20 Genes with the Most Mutations')
# Annotate each bar with "<gene count>/<total samples>".
for index, value in enumerate(top_genes):
    ax.text(value, index, f"{value}/{total_samples}", va='center')
plt.xticks(rotation=45)
ax.invert_yaxis()  # invert y axis to have the highest count on top
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.set_xlabel('Number of Mutations')
ax.set_ylabel('Gene')
plt.tight_layout()
plt.savefig('../data/top_20_genes_mutations.png')
plt.show()


# %%
# Save the matrix. The index holds the sample ids, so give it an explicit
# 'Sample_ID' header; an unlabeled index is read back as 'Unnamed: 0'.
df.to_csv('../data/mutation_matrix.csv', index=True, index_label='Sample_ID')

# %%
# Clinical records of the five largest studies, grouped by sample id.
sample_cd = {}

for study in sorted_samples[:5]:
    study_id = study['studyId']
    print(f"Getting clinical data for study: {study_id}")
    data = cb.getAllClinicalDataInStudy(studyId=study_id)
    print(f"Number of clinical data samples in study {study_id}: {len(data)}")

    # Append each record to the list kept for its sample id.
    for record in data:
        sample_cd.setdefault(record['sampleId'], []).append(record)

    print(f"Total number of clinical data for now: {len(sample_cd)}")

# Dump one tab-separated line per record:
# sample id, clinical attribute id, study id, value.
with open('../data/clinical_data.txt', 'w') as f:
    for sample_id, records in sample_cd.items():
        f.writelines(
            f"{sample_id}\t{rec['clinicalAttributeId']}\t{rec['studyId']}\t{rec['value']}\n"
            for rec in records
        )

# %%
# Reload the saved matrix; the sample ids are read from its 'Sample_ID' column.
mutation_matrix = pd.read_csv('../data/mutation_matrix.csv')
# Report every matrix sample that has no entry in the clinical data.
for sampleid in mutation_matrix['Sample_ID'].values.tolist():
    if sampleid not in sample_cd:
        print(f"Sample ID {sampleid} not found in clinical data for the top 5 studies.")

# Add empty MSI and Stage columns. They are created with object dtype because
# they are later filled with clinical attribute values (presumably strings);
# assigning such values into a float64 column is deprecated in pandas.
mutation_matrix['MSI'] = pd.Series(float('nan'), index=mutation_matrix.index, dtype=object)
mutation_matrix['Stage'] = pd.Series(float('nan'), index=mutation_matrix.index, dtype=object)
mutation_matrix

# %%
total_case = len(mutation_matrix)

# Collect the stage and MSI value of each sample in a single pass over its
# clinical records, then write both columns in one step. This avoids building
# a full-column boolean mask for every clinical attribute.
stage_attribute_ids = ('PATHOLOGICAL_GROUP', 'CLINICAL_STAGE')
stage_by_sample = {}
msi_by_sample = {}

for sample_id in mutation_matrix['Sample_ID'].values.tolist():
    # Samples without any clinical record are skipped instead of raising KeyError.
    for cd_data in sample_cd.get(sample_id, []):
        attribute_id = cd_data['clinicalAttributeId']
        if attribute_id in stage_attribute_ids:
            # Either attribute is used as the stage; the last one seen wins.
            stage_by_sample[sample_id] = cd_data['value']
        if attribute_id == 'MSI_TYPE':
            msi_by_sample[sample_id] = cd_data['value']

# Sample ids absent from a mapping get NaN in the corresponding column.
mutation_matrix['MSI'] = mutation_matrix['Sample_ID'].map(msi_by_sample)
mutation_matrix['Stage'] = mutation_matrix['Sample_ID'].map(stage_by_sample)

# Count samples, not attributes: a sample carrying both stage attributes
# is counted once.
stage_count = len(stage_by_sample)
msi_count = len(msi_by_sample)

print(f"Total samples with stage data: {stage_count}/{total_case}")
print(f"Total samples with msi data: {msi_count}/{total_case}")

# # sampleId=P-0001534-T01-IM3&studyId=msk_chord_2024
# cb.getAllClinicalDataOfSampleInStudy(studyId='msk_chord_2024', sampleId='P-0001534-T01-IM3')

# %%
mutation_matrix.to_csv('../data/mutation_matrix_with_clinical_data.csv', index=False)

# %%
from bravado.client import SwaggerClient
# Recreate the bravado client for the cBioPortal API with request, response
# and spec validation switched off.
swagger_config = {
    "validate_requests": False,
    "validate_responses": False,
    "validate_swagger_spec": False,
}
cbioportal = SwaggerClient.from_url('https://www.cbioportal.org/api/v2/api-docs', config=swagger_config)

gene_mut = []
total_mutations = 0

# Request the mutations of the "msk_chord_2024_mutations" profile for the
# "msk_chord_2024_all" sample list.
msk_study_id = 'msk_chord_2024'
msk_request = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
    molecularProfileId=msk_study_id + "_mutations",
    sampleListId=msk_study_id + "_all",
)
sample_list = msk_request.result()
print(len(sample_list), "mutations found for study:", msk_study_id)
total_mutations += len(sample_list)
gene_mut.extend(sample_list)

print("Total mutations considered:", total_mutations)

# %%
# Show the first mutation record to inspect its fields.
gene_mut[0]

# %%
# Index the gene records by their Entrez gene id for direct lookup.
gene_list = cb.getAllGenes(return_type = 'dict')
gene_dict = {gene['entrezGeneId']: gene for gene in gene_list}

# Mutation attributes exported after the Gene/Sample/Patient columns; the
# column header of each is the attribute name itself.
mutation_fields = [
    'mutationType', 'variantType', 'proteinChange', 'chr',
    'startPosition', 'endPosition', 'referenceAllele', 'variantAllele',
    'proteinPosStart', 'proteinPosEnd', 'mutationStatus',
    'tumorAltCount', 'tumorRefCount', 'normalAltCount', 'normalRefCount',
]

# output the mutations to a tab-separated file with a header row
gene_set = set()
with open('../data/msk_2024_mutations.txt', 'w') as f:
    f.write('\t'.join(['Gene', 'Sample', 'Patient'] + mutation_fields) + '\n')
    for mutation in gene_mut:
        gene_record = gene_dict.get(mutation.entrezGeneId)
        if gene_record is None:
            # Missing or unknown Entrez id: report the record and skip it
            # instead of failing on the symbol lookup.
            print(mutation)
            continue
        row = [gene_record['hugoGeneSymbol'], mutation.sampleId, mutation.patientId]
        row.extend(getattr(mutation, field) for field in mutation_fields)
        f.write('\t'.join(str(value) for value in row) + '\n')
        gene_set.add(mutation.entrezGeneId)

# %%
# Exploratory lookups against one study; each cell shows the raw API result.
msk_study_id = 'msk_chord_2024'

# All clinical data of the study; only the first 50 entries are displayed.
data = cb.getAllClinicalDataInStudy(studyId=msk_study_id)
data[:50]


# %%
# Clinical attributes available for the study.
data_1 = cb.getClinicalAttributesByStudyId(studyId=msk_study_id)
data_1

# %%
# Patients of the study.
data_2 = cb.getAllPatientsInStudy(studyId=msk_study_id)
data_2

# %%
# The 'OS_MONTHS' clinical attribute of the study.
data3 = cb.getClinicalAttributeInStudy(studyId=msk_study_id, clinicalAttributeId='OS_MONTHS')
data3

# %%
# Clinical data of a single patient.
data_4 = cb.getAllClinicalDataOfPatientInStudy(studyId=msk_study_id, patientId='P-0002773')
data_4

# %%
import pandas as pd

# Load the detailed mutation export (tab-separated, with a header row).
df = pd.read_csv('../data/msk_2024_mutations.txt', sep='\t')
df.head()

all_sample_ids = df['Sample'].unique()
all_patient_ids = df['Patient'].unique()
print(len(all_sample_ids), "unique samples in the mutation data")
print(len(all_patient_ids), "unique patients in the mutation data")

# Number of distinct samples recorded for each patient.
patient_sample_count = df.groupby('Patient')['Sample'].nunique()
has_many = patient_sample_count > 1
has_one = patient_sample_count == 1
has_none = patient_sample_count < 1
print("Number of patients with more than 1 sample:")
print(has_many.sum())
print("Number of patients with only 1 sample:")
print(has_one.sum())
print("Number of patients with less 1 sample:")
print(has_none.sum())

# Keep only the mutations of patients that have exactly one sample.
single_sample_patients = patient_sample_count.index[has_one]
df_single = df.loc[df['Patient'].isin(single_sample_patients)]
kept_sample_ids = df_single['Sample'].unique()
kept_patient_ids = df_single['Patient'].unique()
print(len(kept_sample_ids), "unique samples in the mutation data after excluding patients with more than 1 sample")
print(len(kept_patient_ids), "unique patients in the mutation data after excluding patients with more than 1 sample")

# %%
# Print the distinct values of each descriptive column and how many there are.
report_columns = [
    ("Mutation Type", 'mutationType'),
    ("Variant Type", 'variantType'),
    ("Protein Change", 'proteinChange'),
    ("Chromosome", 'chr'),
    ("Start Position", 'startPosition'),
    ("End Position", 'endPosition'),
    ("Reference Allele", 'referenceAllele'),
    ("Variant Allele", 'variantAllele'),
    ("Protein Position Start", 'proteinPosStart'),
    ("Protein Position End", 'proteinPosEnd'),
    ("Mutation Status", 'mutationStatus'),
    ("Tumor Alt Count", 'tumorAltCount'),
    ("Tumor Ref Count", 'tumorRefCount'),
    ("Normal Alt Count", 'normalAltCount'),
    ("Normal Ref Count", 'normalRefCount'),
]
for label, column in report_columns:
    distinct_values = df_single[column].unique()
    print(f"{label}: {distinct_values} ({len(distinct_values)})")

df_single

# %%
# Derive allele frequencies and export a reduced table:
#   tumorAlleleFreq  = tumorAltCount  / (tumorAltCount  + tumorRefCount)
#   normalAlleleFreq = normalAltCount / (normalAltCount + normalRefCount)
# df_single is a filtered selection of df; take an explicit copy first so that
# adding columns does not trigger pandas' SettingWithCopyWarning.
df_single = df_single.copy()
tumor_depth = df_single['tumorAltCount'] + df_single['tumorRefCount']
normal_depth = df_single['normalAltCount'] + df_single['normalRefCount']
df_single['tumorAlleleFreq'] = df_single['tumorAltCount'] / tumor_depth
df_single['normalAlleleFreq'] = df_single['normalAltCount'] / normal_depth

# Columns kept in the cleaned export.
df_clean = df_single[['Gene', 'Sample', 'Patient', 'mutationType', 'variantType', 'chr', 'startPosition', 'endPosition',
                       'proteinPosStart', 'proteinPosEnd', 'tumorAlleleFreq', 'normalAlleleFreq']]
df_clean.to_csv('../data/msk_2024_mutations_clean.csv', index=False)
df_clean.head()

# %%
# One-hot encode the categorical columns of df. 'chr' is converted to str
# first so that numeric chromosome labels are handled as categories too.
categorical_cols = ['mutationType', 'variantType', 'chr']
chr_as_text = df['chr'].astype(str)
df['chr'] = chr_as_text
df_encoded = pd.get_dummies(data=df, columns=categorical_cols)


