# Import Required Libraries
Import the necessary libraries, including scikit-learn, pandas, and numpy.

In [11]:
## Import Required Libraries
import sys

import torch

sys.path.append("../src")
import numpy as np

import pandas as pd

from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    BaggingRegressor,
)
import logging
from sklearn.linear_model import (
    BayesianRidge,
    ElasticNetCV,
    HuberRegressor,
    RidgeCV,
    LassoCV,
    LinearRegression,
)
import pandas as pd
from training.train_shallow import evaluate_shallow_model
from matplotlib import pyplot as plt

from utils.utils import load_config, split_data

# Load Config to ensure reproducibility and syncing with other scripts
config = load_config("../config.yaml")

# Set logging configurations
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(message)s",
)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from preprocess.preprocess import perform_pca
from visualizations.visualizations import plot_pca_variance, create_pca_biplot, create_tsne_plot, create_coefficients_visualization

# Load and Preprocess Dataset
Load the gene expression derived TF dataset and preprocess it for model training.

In [21]:
## Load, Split and Preprocess Dataset
# Load the dataset
tf_df = pd.read_csv(config["data_paths"]["preprocessed_tf_file"])
gene_df = pd.read_csv(config["data_paths"]["preprocessed_gene_file"])

# Only sample a subset of the data for faster training
tf_df = tf_df.sample(n=1000, random_state=42)
gene_df = gene_df.sample(n=1000, random_state=42)

logging.debug(f"TF data shape: {tf_df.shape}, Gene data shape: {gene_df.shape}")

# Split the data into train, validation and test sets as well as features and target
X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = split_data(
    tf_df, config, target_name="viability"
)

X_gene_train, y_gene_train, X_gene_val, y_gene_val, X_gene_test, y_gene_test = (
    split_data(gene_df, config, target_name="viability")
)

2024-12-11 10:36:39,704 - TF data shape: (1000, 683), Gene data shape: (1000, 978)


In [23]:
if config["preprocess"]["use_vt"]:
    # Separate features and target
    tf_features = tf_df.drop(columns=["viability"])
    tf_target = tf_df["viability"]
    gene_features = gene_df.drop(columns=["viability"])
    gene_target = gene_df["viability"]

    # Instantiate separate VarianceThreshold selectors for each dataset
    tf_selector = VarianceThreshold(threshold=config["preprocess"]["vt_threshold_tf"])
    gene_selector = VarianceThreshold(
        threshold=config["preprocess"]["vt_threshold_gene"]
    )

    # Apply VarianceThreshold to the features
    tf_features_selected = tf_selector.fit_transform(tf_features)
    gene_features_selected = gene_selector.fit_transform(gene_features)

    # Convert the results back to DataFrames
    tf_features_selected = pd.DataFrame(
        tf_features_selected, columns=tf_features.columns[tf_selector.get_support()]
    )
    gene_features_selected = pd.DataFrame(
        gene_features_selected,
        columns=gene_features.columns[gene_selector.get_support()],
    )

    # Concatenate the target column back to the selected features
    tf_df = pd.concat([tf_features_selected, tf_target.reset_index(drop=True)], axis=1)
    gene_df = pd.concat(
        [gene_features_selected, gene_target.reset_index(drop=True)], axis=1
    )

    # Log the shape of the datasets after applying VarianceThreshold
    logging.debug(
        f"TF data shape after VarianceThreshold: {tf_df.shape}, Gene data shape after VarianceThreshold: {gene_df.shape}"
    )

if config["preprocess"]["use_pca"]:
    # Step 1: Extract the original feature names
    tf_feature_names = X_tf_train.columns.to_list() 
    gene_feature_names = X_gene_train.columns.to_list()

    # Step 2: Perform PCA
    X_tf_train_pca, tf_pca = perform_pca(
        X_tf_train, config["preprocess"]["pca_var_tf"]
    )
    X_gene_train_pca, gene_pca = perform_pca(
        X_gene_train, config["preprocess"]["pca_var_gene"]
    )

    logging.debug(
        f"TF PCA Data shape: {X_tf_train_pca.shape}, Gene PCA Data shape: {X_gene_train_pca.shape}"
    )

    # Step 3: Scree Plot
    plot_pca_variance(tf_pca, "TF Data")
    plot_pca_variance(gene_pca, "Gene Data")

    # Step 4: 3D PCA Biplots
    create_pca_biplot(
        pca=tf_pca,
        X=X_tf_train_pca,
        Y=y_tf_train,
        features=tf_feature_names,
        dimension="2D",
        dataset_name="TF Dataset",
        top_n_loadings=10,
        sample_size=1000,
        loading_scale=10,
    )

    create_pca_biplot(
        pca=gene_pca,
        X=X_gene_train_pca,
        Y=y_gene_train,
        features=gene_feature_names,
        dimension="3D",
        dataset_name="Gene Dataset",
        top_n_loadings=10,
        sample_size=1000,
        loading_scale=10,
    )

    # Step 5: TSNE Plots
    create_tsne_plot(
        X_tf_train_pca,
        y_tf_train,
        target_column="viability",
        sample_size=1000,
        dimension="2D",
        dataset_name="TF Data TSNE",
    )

    create_tsne_plot(
        X_gene_train_pca,
        y_gene_train,
        target_column="viability",
        sample_size=1000,
        dimension="3D",
        dataset_name="Gene Data TSNE",
    )

2024-12-11 10:37:33,952 - TF PCA Data shape: (500, 349), Gene PCA Data shape: (500, 341)


In [14]:
# if config["preprocess"]["use_pca"]:	
#     # Set the X data variable to the PCA transformed data
#     X_tf_train = X_tf_train_pca
#     X_gene_train = X_gene_train_pca

#     # Also transform the validation and test data
#     X_tf_val = tf_pca.transform(X_tf_val)
#     X_tf_test = tf_pca.transform(X_tf_test)

#     X_gene_val = gene_pca.transform(X_gene_val)
#     X_gene_test = gene_pca.transform(X_gene_test)

# Implement Simple Regression Models
Implement the Ridge regression model and train it on the TF dataset.

In [15]:
from sklearn.base import clone
from visualizations.visualizations import create_coefficients_visualization
# Define models
models = {
    "Linear": (LinearRegression()),
    "Ridge": (RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)),
    "Lasso": (LassoCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)),
    "Elastic Net": (ElasticNetCV(l1_ratio=[0.2, 0.4, 0.6, 0.8], alphas=[0.01, 0.1, 1.0, 10.0], cv=5)),
    "Bayesian Ridge": (BayesianRidge()),
    "Huber": (HuberRegressor()),
}

# Define feature sets
feature_sets = {
    "TF Data": (X_tf_train, y_tf_train, X_tf_test, y_tf_test),
    "Gene Data": (X_gene_train, y_gene_train, X_gene_test, y_gene_test),
}

class ModelPipeline:
    def __init__(self, models, feature_sets):
        self.models = models 
        self.feature_sets = feature_sets
        self.trained_models = {}  # Dictionary to store trained models
        self.results = {}  # Dictionary to store evaluation results

    def train_and_evaluate(self):
        for feature_name, (
            X_train,
            y_train,
            X_test,
            y_test,
        ) in self.feature_sets.items():
            for model_name, base_model in self.models.items():
                logging.debug(f"Training {model_name} on {feature_name}...")

                # Clone the base model to ensure no overwriting
                model = clone(base_model)

                # Train the cloned model
                trained_model = model.fit(X_train, y_train)

                # Store the trained model with a unique key
                self.trained_models[(feature_name, model_name)] = trained_model

                # Evaluate the model
                test_loss, metrics = evaluate_shallow_model(
                    trained_model, X_test, y_test
                )
                self.results[(feature_name, model_name)] = metrics

    def get_results(self):
        results_df = pd.DataFrame.from_dict(self.results, orient="index")
        results_df.index = pd.MultiIndex.from_tuples(
            results_df.index, names=["Feature Set", "Regression Model Type"]
        )
        return results_df.sort_index()

    def visualize_coefficients(self, top_n=10):
        create_coefficients_visualization(
            self.trained_models, self.feature_sets, top_n
        )


# Usage
pipeline = ModelPipeline(models, feature_sets)
pipeline.train_and_evaluate()
results_df = pipeline.get_results()
pipeline.visualize_coefficients(top_n=10)

2024-12-11 10:32:49,771 - Training Linear on TF Data...
2024-12-11 10:32:52,441 - Training Ridge on TF Data...
2024-12-11 10:32:58,847 - Training Lasso on TF Data...
2024-12-11 10:33:01,589 - Training Elastic Net on TF Data...
2024-12-11 10:33:11,489 - Training Bayesian Ridge on TF Data...
2024-12-11 10:33:13,432 - Training Huber on TF Data...

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html

2024-12-11 10:33:28,792 - Training Linear on Gene Data...
2024-12-11 10:33:30,208 - Training Ridge on Gene Data...
2024-12-11 10:33:40,366 - Training Lasso on Gene Data...
2024-12-11 10:33:44,368 - Training Elastic Net on Gene Data...
2024-12-11 10:33:59,753 - Training Bayesian Ridge on Gene Data...
2024-12-11 10:34:02,803 - Training Huber on Gene Data...


In [16]:
results_df.style.format(precision=3).set_caption(
    "Regression Model Evaluation Metrics"
).highlight_max(subset=["R²", "Pearson Correlation", "Weighted Score"], color="lightgreen").highlight_min(
    subset=["MAE", "MSE"], color="lightgreen"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation,Weighted Score
Feature Set,Regression Model Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gene Data,Bayesian Ridge,0.036,0.127,0.403,0.635,0.519
Gene Data,Elastic Net,0.036,0.127,0.396,0.631,0.514
Gene Data,Huber,0.038,0.118,0.368,0.629,0.499
Gene Data,Lasso,0.039,0.134,0.343,0.596,0.469
Gene Data,Linear,0.036,0.128,0.398,0.631,0.515
Gene Data,Ridge,0.036,0.128,0.398,0.631,0.515
TF Data,Bayesian Ridge,0.034,0.128,0.426,0.653,0.54
TF Data,Elastic Net,0.036,0.13,0.395,0.63,0.513
TF Data,Huber,0.036,0.12,0.389,0.648,0.518
TF Data,Lasso,0.041,0.137,0.313,0.57,0.442


In [17]:
import plotly.express as px

# Bar plot for Weighted Score of Regression Models
fig = px.bar(
    results_df.reset_index(),
    x="Regression Model Type",
    y="Weighted Score",
    color="Feature Set",
    barmode="group",
    title="Weighted Score of Regression Models",
    labels={"Weighted Score": "Weighted Score", "Regression Model Type": "Models"},
)
fig.update_layout(
    xaxis=dict(title="Regression Models"),
    yaxis=dict(title="Weighted Score"),
    legend=dict(title="Feature Set"),
)
fig.show()

# Bar plot comparing feature sets for each model
fig = px.bar(
    results_df.reset_index(),
    x="Feature Set",
    y="Weighted Score",
    color="Regression Model Type",
    barmode="group",
    title="Weighted Score of Feature Sets by Models",
    labels={"Weighted Score": "Weighted Score", "Feature Set": "Feature Set"},
)
fig.update_layout(
    xaxis=dict(title="Feature Sets"),
    yaxis=dict(title="Weighted Score"),
    legend=dict(title="Regression Models"),
)
fig.show()

# Box plot for comparing feature sets across models
fig = px.box(
    results_df.reset_index(),
    x="Feature Set",
    y="Weighted Score",
    color="Feature Set",
    title="Weighted Score Distribution Across Feature Sets",
    labels={"Weighted Score": "Weighted Score", "Feature Set": "Feature Sets"},
    points="outliers", 
)
fig.update_layout(
    xaxis=dict(title="Feature Sets"),
    yaxis=dict(title="Weighted Score"),
    legend=dict(title="Feature Sets"),
)
fig.show()

In [18]:
# # Define ensemble models
# models = {
#     "Random Forest": RandomForestRegressor(random_state=42),
#     "Gradient Boosting": GradientBoostingRegressor(random_state=42),
#     "Bagging": BaggingRegressor(random_state=42),
#     "AdaBoost": AdaBoostRegressor(random_state=42),
#     "KNN": KNeighborsRegressor(),
#     "SVM": SVR(),
# }

# # Define hyperparameter distributions
# param_distributions = {
#     "Random Forest": {
#         "n_estimators": randint(50, 200).rvs(size=10),  # Generate iterable integers
#         "max_depth": randint(5, 20),
#         "min_samples_split": randint(2, 10),
#         "min_samples_leaf": randint(1, 10),
#     },
#     "Gradient Boosting": {
#         "n_estimators": randint(50, 200).rvs(size=10),
#         "learning_rate": uniform(0.01, 0.2),
#         "max_depth": randint(3, 15),
#         "min_samples_split": randint(2, 10),
#         "min_samples_leaf": randint(1, 10),
#     },
#     "Bagging": {
#         "n_estimators": randint(50, 200).rvs(size=10),
#         "max_samples": uniform(0.5, 1.0),
#         "max_features": uniform(0.5, 1.0),
#     },
#     "AdaBoost": {
#         "n_estimators": randint(50, 200).rvs(size=10),
#         "learning_rate": uniform(0.01, 1.0),
#         },
#     "KNN": {
#         "n_neighbors": randint(3, 20).rvs(size=10),  # Generate iterable neighbors
#         "weights": ["uniform", "distance"],
#         "p": [1, 2],  # 1: Manhattan, 2: Euclidean
#     },
#     "SVM": {
#         "C": uniform(0.1, 10.0).rvs(size=10),  # Generate random C values
#         "epsilon": uniform(0.01, 1.0).rvs(size=10),  # Generate random epsilon values
#         "kernel": ["linear", "rbf"],
#         "gamma": ["scale", "auto"],
#     },
# }

# # Initialize results dictionary
# results = {}

# # Model Training and Evaluation with RandomizedSearchCV
# for feature_name, (X_train, y_train, X_test, y_test) in feature_sets.items():
#     for model_name, base_model in models.items():
#         logging.debug(
#             f"Running RandomizedSearchCV for {model_name} on {feature_name}..."
#         )

#         # Define RandomizedSearchCV
#         search = RandomizedSearchCV(
#             estimator=base_model,
#             param_distributions=param_distributions[model_name],
#             n_iter=1,
#             scoring="neg_mean_squared_error",
#             cv=5,  # 5-fold cross-validation
#             random_state=42,
#             n_jobs=-1,  # Use all available cores
#         )

#         # Perform Randomized Search
#         search.fit(X_train, y_train)

#         # Get the best model and parameters
#         best_model = search.best_estimator_
#         best_params = search.best_params_

#         # Evaluate the model on the test set
#         test_loss, metrics = evaluate_shallow_model(best_model, X_test, y_test)

#         # Store evaluation results
#         results[(feature_name, model_name)] = {
#             **metrics,
#         }

# # Convert results into a DataFrame with a Multi-Index
# results_df = pd.DataFrame.from_dict(results, orient="index")
# results_df.index = pd.MultiIndex.from_tuples(
#     results_df.index, names=["Feature Set", "Regression Model Type"]
# )

# # Sort the index for better organization
# results_df = results_df.sort_index()

In [19]:
results_df.style.format(precision=3).set_caption(
    "Regression Model Evaluation Metrics"
).highlight_max(
    subset=["R²", "Pearson Correlation", "Weighted Score"], color="lightgreen"
).highlight_min(
    subset=["MAE", "MSE"], color="lightgreen"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation,Weighted Score
Feature Set,Regression Model Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gene Data,Bayesian Ridge,0.036,0.127,0.403,0.635,0.519
Gene Data,Elastic Net,0.036,0.127,0.396,0.631,0.514
Gene Data,Huber,0.038,0.118,0.368,0.629,0.499
Gene Data,Lasso,0.039,0.134,0.343,0.596,0.469
Gene Data,Linear,0.036,0.128,0.398,0.631,0.515
Gene Data,Ridge,0.036,0.128,0.398,0.631,0.515
TF Data,Bayesian Ridge,0.034,0.128,0.426,0.653,0.54
TF Data,Elastic Net,0.036,0.13,0.395,0.63,0.513
TF Data,Huber,0.036,0.12,0.389,0.648,0.518
TF Data,Lasso,0.041,0.137,0.313,0.57,0.442


# Compare Results with Earlier Research
Compare the results of the models with earlier research and report findings in a baseline results summary.