In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = 'dataset/METABRIC_RNA_Mutation.csv'  # Dataset source
data = pd.read_csv(file_path)

# Display basic information about the datasets
print("Dataset Shape:", data.shape)
print("\nFirst 5 Rows:\n", data.head())
print("\nData Types:\n", data.dtypes)
print("\nBasic Statistics:\n", data.describe())
print("\nMissing Values:\n", data.isnull().sum())


  data = pd.read_csv(file_path)


Dataset Shape: (1904, 693)

First 5 Rows:
    patient_id  age_at_diagnosis type_of_breast_surgery    cancer_type  \
0           0             75.65             MASTECTOMY  Breast Cancer   
1           2             43.19      BREAST CONSERVING  Breast Cancer   
2           5             48.87             MASTECTOMY  Breast Cancer   
3           6             47.68             MASTECTOMY  Breast Cancer   
4           8             76.97             MASTECTOMY  Breast Cancer   

                        cancer_type_detailed cellularity  chemotherapy  \
0           Breast Invasive Ductal Carcinoma         NaN             0   
1           Breast Invasive Ductal Carcinoma        High             0   
2           Breast Invasive Ductal Carcinoma        High             1   
3  Breast Mixed Ductal and Lobular Carcinoma    Moderate             1   
4  Breast Mixed Ductal and Lobular Carcinoma        High             1   

  pam50_+_claudin-low_subtype  cohort er_status_measured_by_ihc  ... mtap

In [4]:
miss_val = data.columns[data.isnull().any()]
print(miss_val)



Index(['type_of_breast_surgery', 'cancer_type_detailed', 'cellularity',
       'er_status_measured_by_ihc', 'neoplasm_histologic_grade',
       'tumor_other_histologic_subtype', 'primary_tumor_laterality',
       'mutation_count', 'oncotree_code', '3-gene_classifier_subtype',
       'tumor_size', 'tumor_stage', 'death_from_cancer'],
      dtype='object')


In [5]:
name_of_columns_with_missing_values = data.columns[data.isna().any()].tolist()
name_of_columns_with_missing_values

['type_of_breast_surgery',
 'cancer_type_detailed',
 'cellularity',
 'er_status_measured_by_ihc',
 'neoplasm_histologic_grade',
 'tumor_other_histologic_subtype',
 'primary_tumor_laterality',
 'mutation_count',
 'oncotree_code',
 '3-gene_classifier_subtype',
 'tumor_size',
 'tumor_stage',
 'death_from_cancer']

In [25]:
missing_columns = data.isnull().sum() # display missing value summary and status of all columns in the dataset 
missing_columns = missing_columns [missing_columns > 0] # filter only columns that have missing values with their count of missing entries
missing_columns

type_of_breast_surgery             22
cancer_type_detailed               15
cellularity                        54
er_status_measured_by_ihc          30
neoplasm_histologic_grade          72
tumor_other_histologic_subtype     15
primary_tumor_laterality          106
mutation_count                     45
oncotree_code                      15
3-gene_classifier_subtype         204
tumor_size                         20
tumor_stage                       501
death_from_cancer                   1
dtype: int64

In [7]:

metabric_data = data

# Filling missing values with mode for categorical columns
categorical_columns = [
    'type_of_breast_surgery', 'cancer_type_detailed', 'cellularity', 
    'er_status_measured_by_ihc', 'tumor_other_histologic_subtype', 
    'primary_tumor_laterality', 'oncotree_code', '3-gene_classifier_subtype', 
    'death_from_cancer'
]

for column in categorical_columns:
    mode_value = metabric_data[column].mode()[0]
    metabric_data[column].fillna(mode_value, inplace=True)

# Filling missing values with median for numerical columns
numerical_columns = ['mutation_count', 'tumor_size']

for column in numerical_columns:
    median_value = metabric_data[column].median()
    metabric_data[column].fillna(median_value, inplace=True)

# For 'neoplasm_histologic_grade' and 'tumor_stage', using mode to fill missing values
ordinal_columns = ['neoplasm_histologic_grade', 'tumor_stage']

for column in ordinal_columns:
    mode_value = metabric_data[column].mode()[0]
    metabric_data[column].fillna(mode_value, inplace=True)


metabric_data.to_csv('Missing_values_2.csv')


In [8]:
name_of_columns_with_missing_values = data.columns[data.isna().any()].tolist()
name_of_columns_with_missing_values # No missing value in the dataset


[]

In [76]:
metabric_data.columns.tolist()


['patient_id',
 'age_at_diagnosis',
 'type_of_breast_surgery',
 'cancer_type',
 'cancer_type_detailed',
 'cellularity',
 'chemotherapy',
 'pam50_+_claudin-low_subtype',
 'cohort',
 'er_status_measured_by_ihc',
 'er_status',
 'neoplasm_histologic_grade',
 'her2_status_measured_by_snp6',
 'her2_status',
 'tumor_other_histologic_subtype',
 'hormone_therapy',
 'inferred_menopausal_state',
 'integrative_cluster',
 'primary_tumor_laterality',
 'lymph_nodes_examined_positive',
 'mutation_count',
 'nottingham_prognostic_index',
 'oncotree_code',
 'overall_survival_months',
 'overall_survival',
 'pr_status',
 'radio_therapy',
 '3-gene_classifier_subtype',
 'tumor_size',
 'tumor_stage',
 'death_from_cancer',
 'brca1',
 'brca2',
 'palb2',
 'pten',
 'tp53',
 'atm',
 'cdh1',
 'chek2',
 'nbn',
 'nf1',
 'stk11',
 'bard1',
 'mlh1',
 'msh2',
 'msh6',
 'pms2',
 'epcam',
 'rad51c',
 'rad51d',
 'rad50',
 'rb1',
 'rbl1',
 'rbl2',
 'ccna1',
 'ccnb1',
 'cdk1',
 'ccne1',
 'cdk2',
 'cdc25a',
 'ccnd1',
 'cdk4',

In [39]:
columns_ending_with_mut = [col for col in metabric_data.columns if col.endswith('_mut')]
print("Number of mutation-related columns(end with '_mut:')", len(columns_ending_with_mut), "\nMutation columns are:")
print (columns_ending_with_mut)



Number of mutation-related columns(end with '_mut:') 173 
Mutation columns are:
['pik3ca_mut', 'tp53_mut', 'muc16_mut', 'ahnak2_mut', 'kmt2c_mut', 'syne1_mut', 'gata3_mut', 'map3k1_mut', 'ahnak_mut', 'dnah11_mut', 'cdh1_mut', 'dnah2_mut', 'kmt2d_mut', 'ush2a_mut', 'ryr2_mut', 'dnah5_mut', 'herc2_mut', 'pde4dip_mut', 'akap9_mut', 'tg_mut', 'birc6_mut', 'utrn_mut', 'tbx3_mut', 'col6a3_mut', 'arid1a_mut', 'lama2_mut', 'notch1_mut', 'cbfb_mut', 'ncor2_mut', 'col12a1_mut', 'col22a1_mut', 'pten_mut', 'akt1_mut', 'atr_mut', 'thada_mut', 'ncor1_mut', 'stab2_mut', 'myh9_mut', 'runx1_mut', 'nf1_mut', 'map2k4_mut', 'ros1_mut', 'lamb3_mut', 'arid1b_mut', 'erbb2_mut', 'sf3b1_mut', 'shank2_mut', 'ep300_mut', 'ptprd_mut', 'usp9x_mut', 'setd2_mut', 'setd1a_mut', 'thsd7a_mut', 'afdn_mut', 'erbb3_mut', 'rb1_mut', 'myo1a_mut', 'alk_mut', 'fanca_mut', 'adgra2_mut', 'ubr5_mut', 'pik3r1_mut', 'myo3a_mut', 'asxl2_mut', 'apc_mut', 'ctcf_mut', 'asxl1_mut', 'fancd2_mut', 'taf1_mut', 'kdm6a_mut', 'ctnna3_mut', '

In [54]:
# Select columns that end with '_mut'
columns_with_mut = metabric_data.filter(regex='_mut$')

# Save these columns to a new CSV file
columns_with_mut.to_csv('mutation_related_columns.csv', index=False)

#columns_with_mut.info()
#columns_with_mut.head(15)

# Re-trying to load and review the mutation-related data
file_path = 'mutation_related_columns.csv'

try:
    mutation_data = pd.read_csv(file_path)

    # Display basic information and the first few rows
    mutation_data_info = mutation_data.info()
    first_five_rows = mutation_data.head()
except Exception as e:
    mutation_data_info = str(e)
    first_five_rows = "Error loading data."

mutation_data_info, first_five_rows



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1904 entries, 0 to 1903
Columns: 173 entries, pik3ca_mut to siah1_mut
dtypes: object(173)
memory usage: 2.5+ MB


(None,
   pik3ca_mut tp53_mut muc16_mut ahnak2_mut kmt2c_mut syne1_mut gata3_mut  \
 0          0        0         0          0         0         0         0   
 1          0    H178P         0          0         0         0         0   
 2     H1047R        0         0          0         0         0         0   
 3      E542K        0         0          0         0         0         0   
 4          0    S241F         0          0         0         0         0   
 
   map3k1_mut ahnak_mut dnah11_mut  ... mtap_mut ppp2cb_mut smarcd1_mut  \
 0          0         0          0  ...        0          0           0   
 1          0         0          0  ...        0          0           0   
 2          0         0          0  ...        0          0           0   
 3          0         0          0  ...        0          0           0   
 4          0         0          0  ...        0          0           0   
 
   nras_mut ndfip1_mut hras_mut prps2_mut smarcb1_mut stmn2_mut siah1_mut  
 

In [None]:
# Modified approach to count the occurrences of each mutation
#mutation_counts = mutation_data.apply(lambda x: x.value_counts()).fillna(0)

#mutation_counts.head()
mutation_counts = pd.read_csv('sample_mut.csv')

#print(mutation_counts.sum())
print(mutation_counts.info())

In [62]:
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Data Cleaning
# Removing duplicate rows
metabric_data = metabric_data.drop_duplicates()

metabric_data.to_csv('duplicate_rows.csv')

print (metabric_data.info())
''' 
# Data Transformation
# One-hot encoding for categorical variables
categorical_features = ['categorical_column1', 'categorical_column2']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder='passthrough')
transformed_data = transformer.fit_transform(metabric_data)

# Data Normalization/Standardization
# Standardizing numerical features
numerical_features = metabric_data.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
metabric_data[numerical_features] = scaler.fit_transform(metabric_data[numerical_features])

# Checking the transformed data
print(metabric_data.head())

'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1904 entries, 0 to 1903
Columns: 693 entries, patient_id to siah1_mut
dtypes: float64(498), int64(5), object(190)
memory usage: 10.1+ MB
None


' \n# Data Transformation\n# One-hot encoding for categorical variables\ncategorical_features = [\'categorical_column1\', \'categorical_column2\']\none_hot = OneHotEncoder()\ntransformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder=\'passthrough\')\ntransformed_data = transformer.fit_transform(metabric_data)\n\n# Data Normalization/Standardization\n# Standardizing numerical features\nnumerical_features = metabric_data.select_dtypes(include=[\'int64\', \'float64\']).columns\nscaler = StandardScaler()\nmetabric_data[numerical_features] = scaler.fit_transform(metabric_data[numerical_features])\n\n# Checking the transformed data\nprint(metabric_data.head())\n\n'

In [16]:

# Loading the dataset after missing values
file_path = 'dataset/METABRIC_RNA_Mutation_After_Missing_Values_2.csv'
metabric_data = pd.read_csv(file_path)

# Filtering out columns that end with '_mut'
columns_to_remove = [col for col in metabric_data.columns if col.endswith('_mut')]
metabric_data_filtered = metabric_data.drop(columns=columns_to_remove)

# Checking the new shape of the dataset and displaying the first few rows
metabric_data_filtered.to_csv('METABRIC_RNA_Mutation_without_Mutation_Data_3.csv', index=False)

print (metabric_data_filtered.shape)
print (metabric_data_filtered.head(3))
print (metabric_data_filtered.info())



  metabric_data = pd.read_csv(file_path)


(1904, 520)
   patient_id  age_at_diagnosis type_of_breast_surgery    cancer_type  \
0           0             75.65             MASTECTOMY  Breast Cancer   
1           2             43.19      BREAST CONSERVING  Breast Cancer   
2           5             48.87             MASTECTOMY  Breast Cancer   

               cancer_type_detailed cellularity  chemotherapy  \
0  Breast Invasive Ductal Carcinoma        High             0   
1  Breast Invasive Ductal Carcinoma        High             0   
2  Breast Invasive Ductal Carcinoma        High             1   

  pam50_+_claudin-low_subtype  cohort er_status_measured_by_ihc  ...  srd5a1  \
0                 claudin-low     1.0                   Positve  ... -1.1877   
1                        LumA     1.0                   Positve  ... -0.4412   
2                        LumB     1.0                   Positve  ... -0.5381   

   srd5a2  srd5a3     st7    star    tnk2   tulp4 ugt2b15 ugt2b17  ugt2b7  
0 -0.0194 -1.6345 -0.2142 -0.5698 -1.

In [85]:
# Check to see no missing value in the dataset
file_path = 'dataset/METABRIC_RNA_Mutation_without_Mutation_Data_3.csv'
metabric_data_filtered = pd.read_csv(file_path)

name_of_columns_with_missing_values = metabric_data_filtered.columns[metabric_data_filtered.isna().any()].tolist()

print('There is NO missing values: ', name_of_columns_with_missing_values)

There is NO missing values:  []
