In [1]:
# read data
import kagglehub
import os
# scientific computing
import pandas as pd
import numpy as np
# data processing - pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer

In [2]:
# read the data
def read_raw_data():
    """Download the WHO life-expectancy dataset and load it as a data frame.

    The dataset is fetched with ``kagglehub.dataset_download`` and the first
    CSV file (in sorted name order) found in the download directory is read.

    Returns:
        pd.DataFrame: the contents of the selected CSV file.

    Raises:
        FileNotFoundError: if the download directory contains no ``.csv`` file.
    """
    path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")

    # os.listdir returns entries in arbitrary order and may include non-CSV
    # files, so filter by extension and sort to make the choice deterministic.
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )
    if not csv_files:
        raise FileNotFoundError(
            f"No CSV file found in downloaded dataset directory: {path}"
        )

    raw_data = pd.read_csv(os.path.join(path, csv_files[0]))

    return raw_data

In [3]:
# Fetch the raw table and name its row index "index" so that the CSV written
# below carries a labelled index column.
raw_data = read_raw_data().rename_axis("index")

# Persist the untouched raw data in the data directory.
raw_data.to_csv("../data/raw_data.csv")

# Column-by-column summary: dtypes and non-null counts.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [None]:
# split the original data into train and test data sets
def split_train_test(raw_data, train_frac=0.8, random_state=42, store_path="../data"):
    """Randomly split a data frame into train and test sets and save both as CSV.

    The train set is a random sample of ``train_frac`` of the rows; the test
    set is every row whose index label was not sampled. The files are written
    to ``<store_path>/raw_data_train.csv`` and ``<store_path>/raw_data_test.csv``
    with the index included.

    Args:
        raw_data (pd.DataFrame): the data frame to split. Index labels should
            be unique, because the test set is built by dropping the sampled
            labels.
        train_frac (float): fraction of rows assigned to the train set.
        random_state (int): seed passed to ``DataFrame.sample``.
        store_path (str): directory the two CSV files are written to; it is
            created if it does not exist.

    Returns:
        None

    Raises:
        ValueError: if ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_frac < 1.0:
        raise ValueError(f"train_frac must be in (0, 1), got {train_frac}")

    raw_data_train = raw_data.sample(frac=train_frac, random_state=random_state)
    raw_data_test = raw_data.drop(raw_data_train.index)

    # Writing fails if the target directory is missing, so create it up front.
    os.makedirs(store_path, exist_ok=True)
    raw_data_train.to_csv(os.path.join(store_path, "raw_data_train.csv"))
    raw_data_test.to_csv(os.path.join(store_path, "raw_data_test.csv"))

    print(f"The test and train data sets are stored into path: {store_path}")

    return None

In [5]:
# Write the train/test CSV files for the raw data; the function returns None,
# so only its printed confirmation appears below.
split_train_test(raw_data)

The test and train data sets are stored into path: ../data


## Data Cleaning

**From this section, all the operations will be performed on the train data set.**

**data = raw_data_train**

The test data set will be set aside to simulate a real-world situation in which unseen data is not available during analysis.

After computing statistics, calculating estimates and training models on the train data, we can repeat these steps or apply the fitted models to the test data.

In [6]:
# Load the training split from ../data/raw_data_train.csv, using the column
# named "index" as the row index. Missing values are handled in later cells.
data = pd.read_csv("../data/raw_data_train.csv", index_col = "index")
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2350 entries, 2546 to 933
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2350 non-null   object 
 1   Year                             2350 non-null   int64  
 2   Status                           2350 non-null   object 
 3   Life expectancy                  2343 non-null   float64
 4   Adult Mortality                  2343 non-null   float64
 5   infant deaths                    2350 non-null   int64  
 6   Alcohol                          2194 non-null   float64
 7   percentage expenditure           2350 non-null   float64
 8   Hepatitis B                      1925 non-null   float64
 9   Measles                          2350 non-null   int64  
 10   BMI                             2322 non-null   float64
 11  under-five deaths                2350 non-null   int64  
 12  Polio                  

In [7]:
# categorize numerical and categorical variables
# Automatically detect column types in a data frame

def cols_categorize(dataframe, max_categories=50):
    """Return the numerical and categorical column names of a data frame.

    Numerical columns are those with dtype ``int64`` or ``float64``.
    Categorical columns are ``object`` columns with fewer than
    ``max_categories`` distinct values; object columns with more distinct
    values are left out of both lists.

    Args:
        dataframe (pd.DataFrame): the data frame whose columns are classified.
        max_categories (int): exclusive upper bound on the number of distinct
            values an object column may have to count as categorical.

    Returns:
        tuple[list, list]: ``(numerical_cols, categorical_cols)``.
    """
    # Inspect the argument itself rather than a notebook-level variable, so the
    # function works for any frame that is passed in.
    numerical_cols = dataframe.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = [
        col
        for col in dataframe.select_dtypes(include=["object"]).columns
        if dataframe[col].nunique() < max_categories
    ]
    return numerical_cols, categorical_cols

In [8]:
# Column-name lists for the training frame; they select the columns each
# imputation pipeline is applied to in the next cell.
num_cols, cat_cols = cols_categorize(data)

In [9]:
# Candidate strategies for filling missing numerical values, each wrapped in
# a single-step pipeline.
num_imputer_1 = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
# Median instead of mean.
num_imputer_2 = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# Constant fill: every NaN becomes -1.
num_imputer_3 = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value=-1))])
# Fill from the 3 nearest neighbours.
num_imputer_4 = Pipeline(steps=[("imputer", KNNImputer(n_neighbors=3))])
# Iterative (regression-based) imputation.
num_imputer_5 = Pipeline(steps=[("imputer", IterativeImputer(max_iter=10, random_state=42))])

num_imputers = [num_imputer_1, num_imputer_2, num_imputer_3, num_imputer_4, num_imputer_5]

# Candidate strategies for categorical values: impute first, then one-hot
# encode into a dense array, ignoring categories unseen during fitting.
cat_imputer_1 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
# Same encoder, but NaN is replaced by the literal category "Unknown".
cat_imputer_2 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

cat_imputers = [cat_imputer_1, cat_imputer_2]

# Every (categorical, numerical) combination becomes one ColumnTransformer,
# keyed by its position: the categorical strategy varies slowest, so keys 0-4
# use cat_imputer_1 and keys 5-9 use cat_imputer_2.
imputers = dict(enumerate(
    ColumnTransformer([("num", num_imputer, num_cols), ("cat", cat_imputer, cat_cols)])
    for cat_imputer in cat_imputers
    for num_imputer in num_imputers
))
imputers

{0: ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer())]),
                                  ['Year', 'Life expectancy ', 'Adult Mortality',
                                   'infant deaths', 'Alcohol',
                                   'percentage expenditure', 'Hepatitis B',
                                   'Measles ', ' BMI ', 'under-five deaths ',
                                   'Polio', 'Total expenditure', 'Diphtheria ',
                                   ' HIV/AIDS', 'GDP', 'Population',
                                   ' thinness  1-19 years',
                                   ' thinness 5-9 years',
                                   'Income composition of resources',
                                   'Schooling']),
                                 ('cat',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='most_frequ

In [None]:
# Integer identifiers of the imputation strategies built above.
imputers.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [43]:
import pandas as pd

# Load datasets
# Each file is read as tab-separated text, with its first column used as the
# row index.
gene = pd.read_csv("../data/LUNG_Gene_Expression.txt", sep="\t", index_col=0)
methy = pd.read_csv("../data/LUNG_Methy_Expression.txt", sep="\t", index_col=0)
mirna = pd.read_csv("../data/LUNG_Mirna_Expression.txt", sep="\t", index_col=0)
survival = pd.read_csv("../data/LUNG_Survival.txt", sep="\t", index_col=0)


In [None]:
# sample p proportion of the data and use patient ID as index
# NOTE(review): gene and methy are sampled with frac = p - 0.05 while mirna
# uses frac = p, so "p proportion" only holds for mirna — confirm this is intended.
p = 0.1
gene = gene.sample(frac=p-0.05, random_state=42)
methy = methy.sample(frac=p-0.05, random_state=42)
mirna = mirna.sample(frac=p, random_state=42)

# transpose the data
# NOTE(review): this cell rebinds gene/methy/mirna, so running it a second
# time samples and transposes the already-transposed frames.
gene = gene.transpose()
methy = methy.transpose()
mirna = mirna.transpose()

In [46]:
# check the shape of these datasets
# Each shape is (rows, columns); after the transpose above, the rows of
# gene/methy/mirna are the columns of the original files.
print(f"Gene shape: {gene.shape}")
print(f"Methy shape: {methy.shape}")
print(f"Mirna shape: {mirna.shape}")
print(f"Survival shape: {survival.shape}")

Gene shape: (107, 602)
Methy shape: (107, 1154)
Mirna shape: (107, 35)
Survival shape: (106, 2)


In [52]:
# Preview the first five rows of the transposed miRNA table.
mirna[:5]

Unnamed: 0,hsa-mir-376b,hsa-mir-199a-2,hsa-mir-130b,hsa-mir-27a,hsa-mir-195,hsa-mir-205,hsa-mir-127,hsa-mir-139,hsa-mir-3940,hsa-mir-148b,...,hsa-mir-1226,hsa-mir-451,hsa-let-7g,hsa-mir-182,hsa-mir-181d,hsa-mir-99a,hsa-mir-26a-2,hsa-mir-3647,hsa-mir-10a,hsa-mir-197
,-0.334876,-0.172313,-0.679937,1.364239,0.382543,0.006666,-0.219129,-0.146419,1.930457,-0.05753,...,-0.466766,0.817659,0.75392,-1.268195,-0.323121,-0.504417,1.982942,0.616435,-0.485722,-1.059811
TCGA-18-3406-01A-01T-0981-13,-0.049197,1.414835,-0.506159,0.654084,3.175154,1.152833,0.101844,0.475922,-0.721311,0.070081,...,-0.674236,-0.684296,1.249058,-0.532685,-0.479207,-0.379486,0.844008,1.181708,0.002314,-0.09813
TCGA-18-3407-01A-01T-0981-13,-0.246955,-0.309156,0.271615,0.746274,-0.238343,0.627218,-0.359151,1.375691,-0.652345,1.399179,...,-0.705804,-0.460552,0.903193,-0.656288,-0.403529,-0.691073,0.940709,0.101999,-0.380596,-0.203658
TCGA-18-3408-01A-01T-0981-13,-0.312226,-0.357375,0.115522,2.337679,-0.022776,0.921633,-0.331414,0.102169,0.268803,2.302225,...,-0.495321,-0.806928,2.39332,2.922956,-0.259557,2.605881,0.115956,1.263532,-0.761069,0.246568
TCGA-18-3410-01A-01T-0981-13,-0.206257,0.009395,0.127858,3.123905,-0.365761,-0.040441,-0.279726,-0.676467,-0.702698,0.257927,...,-0.727189,-0.644856,0.526165,-0.272331,-0.086198,-0.621801,0.220237,0.415309,-0.355353,-0.342209


In [53]:
# connect the datasets
# Merge gene, methy, and mirna datasets
def merge_data(gene, methy, mirna, survival):
    """Inner-join the three omics tables and the survival table on their index.

    The tables are merged left to right (gene, methy, mirna, survival) with
    ``pd.merge`` on the row index, so only index labels present in all four
    tables are kept.

    Args:
        gene (pd.DataFrame): gene expression table.
        methy (pd.DataFrame): methylation table.
        mirna (pd.DataFrame): miRNA expression table.
        survival (pd.DataFrame): survival table.

    Returns:
        pd.DataFrame: one row per shared index label, with the columns of all
        four inputs.
    """
    merged_data = gene
    # Fold the remaining tables in one at a time. The survival table is part
    # of the returned frame; previously its merge result was assigned to a
    # different name and discarded.
    for table in (methy, mirna, survival):
        merged_data = pd.merge(
            merged_data, table, left_index=True, right_index=True, how="inner"
        )
    return merged_data

In [57]:
# Row labels of the survival table and of the gene table, as arrays.
sur_index = survival.index.values
gene_index = gene.index.values

In [54]:
# Bind the result to its own name: assigning it to `merge_data` would replace
# the function with a DataFrame, and this cell could not be run a second time.
merged_data = merge_data(gene, methy, mirna, survival)

## Use another data set to implement iClusterVB on it

In [61]:
import kagglehub
import os
import pandas as pd
# Download latest version
# `path` is the local directory holding the downloaded files; it is printed
# and its contents listed so the file name can be checked before loading.
path = kagglehub.dataset_download("yasserh/breast-cancer-dataset")

print("Path to dataset files:", path)
os.listdir(path)

Path to dataset files: /Users/gufeng/.cache/kagglehub/datasets/yasserh/breast-cancer-dataset/versions/1


['breast-cancer.csv']

In [62]:
# Load the first file listed in the download directory. The path is built with
# os.path.join, as in read_raw_data, instead of string concatenation.
# NOTE: this rebinds `data`, which earlier cells used for the training split.
data = pd.read_csv(os.path.join(path, os.listdir(path)[0]))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [67]:
from scipy.stats import skew, kurtosis, normaltest
import numpy as np

# check the distribution of the variables in the data
def evaluate_distribution(dataframe):
    """Compute skewness, kurtosis and a normality-test p-value per numeric column.

    Missing values are dropped column by column before the statistics are
    computed. Non-numeric columns are ignored.

    Args:
        dataframe (pd.DataFrame): input data frame.

    Returns:
        pd.DataFrame: one row per numeric column with the columns
        ``Variable``, ``Skewness``, ``Kurtosis`` and ``Normality_p_value``.
        A statistic is NaN when the column has too few non-missing values for
        it: none at all for skewness/kurtosis, fewer than 8 for the normality
        test. The frame has these four columns even when it has no rows.
    """
    result_columns = ["Variable", "Skewness", "Kurtosis", "Normality_p_value"]
    # normaltest relies on scipy's skewness test, which needs at least 8
    # observations; smaller samples are reported as NaN instead of failing.
    min_obs_for_normaltest = 8

    stats = []
    for col in dataframe.select_dtypes(include=[np.number]).columns:
        col_data = dataframe[col].dropna()
        n_obs = len(col_data)

        skewness = skew(col_data) if n_obs > 0 else np.nan
        kurt = kurtosis(col_data) if n_obs > 0 else np.nan
        if n_obs >= min_obs_for_normaltest:
            _, p_value = normaltest(col_data)
        else:
            p_value = np.nan

        stats.append({"Variable": col, "Skewness": skewness, "Kurtosis": kurt, "Normality_p_value": p_value})

    return pd.DataFrame(stats, columns=result_columns)

# Example usage
# Per-column statistics for the numeric columns of `data`; only the first
# five rows are displayed here.
distribution_stats = evaluate_distribution(data)
distribution_stats.head()

Unnamed: 0,Variable,Skewness,Kurtosis,Normality_p_value
0,id,6.456673,41.812813,4.156149e-155
1,radius_mean,0.939893,0.827584,1.286172e-16
2,texture_mean,0.648734,0.741145,4.685883e-10
3,perimeter_mean,0.988037,0.953165,3.595463e-18
4,area_mean,1.641391,3.609761,2.38604e-42


In [68]:
# Display the full table of distribution statistics.
distribution_stats

Unnamed: 0,Variable,Skewness,Kurtosis,Normality_p_value
0,id,6.456673,41.812813,4.156149e-155
1,radius_mean,0.939893,0.827584,1.286172e-16
2,texture_mean,0.648734,0.741145,4.685883e-10
3,perimeter_mean,0.988037,0.953165,3.595463e-18
4,area_mean,1.641391,3.609761,2.38604e-42
5,smoothness_mean,0.45512,0.837945,6.925619e-07
6,compactness_mean,1.186983,1.62514,2.733343e-25
7,concavity_mean,1.397483,1.970592,1.870652e-31
8,concave points_mean,1.16809,1.04668,1.120701e-22
9,symmetry_mean,0.723695,1.266117,1.58984e-13


In [2]:
# Load the simulated data set, using the first CSV column as the row index.
# NOTE(review): the recorded run of this cell failed with a NameError on `pd`,
# i.e. it was executed before pandas was imported in that kernel session.
sim_data = pd.read_csv("../data/sim_data.csv", index_col=0)
sim_data.info()

NameError: name 'pd' is not defined

In [None]:
# adjust the column names to make it easier to work with
# Every literal '.' in a column name is replaced by '_' (regex=False, so the
# dot is not treated as a wildcard); sim_data's columns are updated in place.
column_names = sim_data.columns.str.replace('.', '_', regex=False)
sim_data.columns = column_names

In [1]:
# Select 30 columns by position, drawn uniformly from 0..500 inclusive (the
# same position may be drawn more than once). np.random.random_integers is
# deprecated; a seeded Generator draws from the same closed interval and makes
# the selection reproducible.
# NOTE(review): assumes sim_data has at least 501 columns — confirm.
samples_sim_data = sim_data.iloc[:, np.random.default_rng(42).integers(0, 500, size=30, endpoint=True)]
samples_sim_data.describe()

NameError: name 'sim_data' is not defined