In [10]:
import pandas as pd
import numpy as np
#--------------------------------------------------
# Cleaning TCGA phenotype data
#--------------------------------------------------
# Read data
clinical_phenotype_df = pd.read_csv(
    "/Users/lynettewilson/Desktop/mRNA_Cancer_Prediction_Model/TCGA_data/phenotype_data", 
    sep="\t"
)

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
print(f"Original shape: {clinical_phenotype_df.shape}")

# Filter patients: Adults (18+) with Primary Tumors
clinical_phenotype_df = clinical_phenotype_df[
    (clinical_phenotype_df['age_at_index.demographic'] >= 18) &  
    (clinical_phenotype_df['sample_type.samples'] == 'Primary Tumor')
]

# Define missing value indicators
missing_values = [
    'not reported', 'Not Reported', 'NOT REPORTED',
    'unknown', 'Unknown', 'UNKNOWN',
    'not available', 'Not Available', 'N/A', 'NA', 'n/a',
    'not applicable', 'Not Applicable',
    '--', '---', 'null', 'NULL', 'None',
    '[Not Available]', '[Not Applicable]', '[Unknown]',
    'Not Evaluated', 'not evaluated',
    '', ' ', '  '
]

# Replace with NaN
clinical_phenotype_df.replace(missing_values, np.nan, inplace=True)

# Remove columns with >90% missing
threshold = 0.1 * len(clinical_phenotype_df)
clinical_phenotype_df = clinical_phenotype_df.dropna(axis=1, thresh=threshold)

print(f"Shape before treatment cleaning: {clinical_phenotype_df.shape}")

# Cleaning up treatment types
clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].replace("['yes', 'yes']", 'RT/TMZ', inplace=True)
clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].replace("['no', 'no']", 'no treatment', inplace=True)
clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].replace("['no', 'yes']", 'RT', inplace=True)
clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].replace("['yes', 'no']", 'TMZ', inplace=True)

# Remove rows where treatment is still NaN (was "not reported")
clinical_phenotype_df = clinical_phenotype_df.dropna(
    subset=['treatment_or_therapy.treatments.diagnoses']
)

print(f"Shape after removing 'not reported' treatments: {clinical_phenotype_df.shape}")
print(f"\nUnique treatment values:")
print(clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].unique())
print(f"\nTreatment distribution:")
print(clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].value_counts())

RT_df = clinical_phenotype_df[clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'] == 'RT']
RT_and_TMZ_df = clinical_phenotype_df[clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'] == 'RT/TMZ']
TMZ_df = clinical_phenotype_df[clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'] == 'TMZ']
No_treatment_df = clinical_phenotype_df[clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'] == 'no treatment']

# Save cleaned data
output_path = '/Users/lynettewilson/Desktop/mRNA_Cancer_Prediction_Model/TCGA_data/clean_phenotype_data.csv'
clinical_phenotype_df.to_csv(output_path, index=False)

print(f"\nFinal shape: {clinical_phenotype_df.shape}")
print("Data saved successfully!")

Original shape: (636, 82)
Shape before treatment cleaning: (600, 69)
Shape after removing 'not reported' treatments: (600, 69)

Unique treatment values:
['RT/TMZ' "['not reported', 'yes']" 'RT' 'no treatment'
 "['not reported', 'not reported']" "['yes', 'not reported']" 'TMZ']

Treatment distribution:
treatment_or_therapy.treatments.diagnoses
RT/TMZ                              450
no treatment                         45
RT                                   27
['not reported', 'not reported']     26
TMZ                                  23
['yes', 'not reported']              15
['not reported', 'yes']              14
Name: count, dtype: int64

Final shape: (600, 69)
Data saved successfully!


  clinical_phenotype_df.replace(missing_values, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinical_phenotype_df['treatment_or_therapy.treatments.diagnoses'].replace("['yes', 'yes']", 'RT/TMZ', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinical_phenotype_df['treatment_or_therapy.t

In [43]:
#--------------------------------------------------
# Cleaning CGGA clinical data
#--------------------------------------------------

clinical_cgga_df = pd.read_csv(
    "/Users/lynettewilson/Desktop/mRNA_Cancer_Prediction_Model/CGGA_data/TCGA_clinical.txt", 
    sep="\t")

print(clinical_cgga_df.shape)

clinical_cgga_df = clinical_cgga_df[(clinical_cgga_df['Age'] >= 18) 
                                    &(clinical_cgga_df['Histology'] == 'GBM')
                                    &(clinical_cgga_df['IDH_mutation_status'] == 'Wildtype')]
print(clinical_cgga_df.shape)
clinical_cgga_df.replace(missing_values, np.nan, inplace=True)
print(clinical_cgga_df.shape)
output_path_2 = '/Users/lynettewilson/Desktop/mRNA_Cancer_Prediction_Model/TCGA_data/clean_cgga_clinical_data.csv'
clinical_cgga_df.to_csv(output_path_2, index=False)





(702, 9)
(139, 9)
(139, 9)
