In [1]:
from main import *
import numpy as np

# parameter setup

In [2]:
# --- Input feature matrix and output location -------------------------------
target_feature = "merged_support3"
target_feature_suffix = "_matrix.npy"
save_data_path = "./results"

# --- Preliminary feature selection -------------------------------------------
# Alternative methods: "chi2", "f_classif"
pre_selection_methods = ["variance"]
# Alternative sizes: [2000000, 4000000, 8000000, 16000000, 32000000]
n_pre_select_list = [1000000]
# Pre-selections larger than this goal are randomly subsampled down to it.
n_pre_select_goal = 1000000

# --- Secondary feature selection ---------------------------------------------
# Alternative methods: "random", "xgb", "rf", "variance", "chi2", "f_classif"
# (also noted as candidates: Extra-trees, "mutual_info_classif")
select_methods = ["xgb"]
select_feature_from_cache = False
# Powers of two from 128 (2**7) up to 131072 (2**17).
n_select_list = [2 ** exponent for exponent in range(7, 18)]

# --- Dimensionality reduction and models -------------------------------------
# The list should always contain None so that training on the whole selected
# feature set is also performed.
n_dim_reduce_list = [128, 256, 512, 1024, None]
# Alternative models: "SVM", "XGB", "RF", "DT", "KNN"
ML_models = ["SVM"]

# One list of parameter sets per model name; every entry is tried in turn.
hyper_params = {
    "SVM": [
        # grid search params
        {
            'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            'kernel': ['linear'],
        },
    ],
    "RF": [
        # current best for 1048576 features
        {
            'n_estimators': 500,
            'max_features': 'sqrt',
            'max_depth': None,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
        },
    ],
    "XGB": [
        # current best for 1048576 features
        {
            'learning_rate': 0.1,
            'n_estimators': 1000,
            'max_depth': 3,
            'gamma': 0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_lambda': 1,
            'reg_alpha': 0,
        },
    ],
}

# Spreadsheet (inside save_data_path) that collects the metrics of every run.
save_result_file_name = target_feature + "_results.xlsx"

# data loading

In [None]:
# Resolve where the feature matrices and the sample annotation file live.
feature_data_path, sample_annotation_file = get_data_path(save_data_path)

# Build the loader for the configured feature matrix file.
feature_matrix_file = os.path.join(feature_data_path, target_feature + target_feature_suffix)
dataset = data_loader(feature_matrix_file, sample_annotation_file)

# get_data() yields three parts: (X, y_original, y), the three split index sets,
# and the label mapping.
features_and_labels, split_indices, label_mapping = dataset.get_data()
X, y_original, y = features_and_labels
train_indices, val_indices, test_indices = split_indices


# preliminary feature selection

In [None]:
# Collects one metrics dict per trained model; the training cell appends to it.
result_combined = []

for pre_feature_select_method in pre_selection_methods:
    # One select_feature call produces a matrix for every size in n_pre_select_list.
    try:
        X_pre_selected_list, perf_metric_preselect = select_feature(
            X = X, y = y, method = pre_feature_select_method, n_list = n_pre_select_list,
            train_idx = train_indices, val_idx = val_indices)
    except Exception as e:
        logging.error(f"An unexpected error occurred while pre_select_feature of {pre_feature_select_method}. {e.__class__.__name__}: {str(e)}")
        continue

    for X_pre_selected, n_pre_select in zip(X_pre_selected_list, n_pre_select_list):
        if n_pre_select <= n_pre_select_goal:
            # Already at or below the goal size: use the pre-selection unchanged.
            X_pre_selected_final = X_pre_selected
        else:
            # Too many features: randomly subsample down to n_pre_select_goal.
            try:
                X_pre_selected_final_list, _ = select_feature(
                    X = X_pre_selected, y = y, method = "random", n_list = [n_pre_select_goal],
                    train_idx = train_indices, val_idx = val_indices)
            except Exception as e:
                logging.error(f"An unexpected error occurred while random selection after pre_select_feature. {e.__class__.__name__}: {str(e)}")
                continue
            X_pre_selected_final = X_pre_selected_final_list[0]
            logging.info(f" - Further selecting feature by random from {n_pre_select} to {n_pre_select_goal} variants. X_pre_selected.shape = {X_pre_selected.shape}. X_pre_selected_final.shape = {X_pre_selected_final.shape}")

        logging.info(f" - '{pre_feature_select_method}' feature selection selected {min(n_pre_select, n_pre_select_goal)} variants. X_pre_selected_final.shape = {X_pre_selected_final.shape}. perf_metrics_selection: {perf_metric_preselect}")

# secondary feature selection

In [None]:
# NOTE: pre_feature_select_method, n_pre_select and X_pre_selected_final are the
# values left behind by the LAST iteration of the preliminary-selection loops.
# Likewise, after this loop X_selected_list / perf_metric_select / current_loop
# describe only the last entry of select_methods that succeeded.
for feature_select_method in select_methods:
    # Description of the current configuration; the training cell adds more keys.
    current_loop = dict(
        random_seed = RANDOM_SEED,
        pre_select_method = pre_feature_select_method,
        n_pre_select = n_pre_select,
        n_pre_select_goal = n_pre_select_goal,
        select_method = feature_select_method,
    )
    # Prefix handed to select_feature as cache_file_prefix.
    feature_importance_cache_file_prefix = f"{X.shape[1]}_seed{RANDOM_SEED}_{pre_feature_select_method}_{n_pre_select}_{n_pre_select_goal}_{feature_select_method}"

    logging.info(f"*************** current loop: {current_loop} ***************")

    try:
        X_selected_list, perf_metric_select = select_feature(
            X = X_pre_selected_final, y = y, method = feature_select_method, n_list = n_select_list,
            train_idx = train_indices, val_idx = val_indices,
            cache_file_prefix = feature_importance_cache_file_prefix,
            from_cache = select_feature_from_cache)
    except Exception as e:
        logging.error(f"An unexpected error occurred while select_feature of {current_loop}. {e.__class__.__name__}: {str(e)}")
        continue

# train the ML models (currently SVM only)

In [None]:
# Train and evaluate every configured model on each selected feature subset,
# optionally after dimensionality reduction, and record the metrics.
#
# This cell depends on names left behind by earlier cells: X_selected_list,
# perf_metric_select, feature_select_method and current_loop hold the values of
# the LAST iteration of the secondary-selection loop, and perf_metric_preselect
# comes from the preliminary-selection cell.
# NOTE(review): `pd` is used below but only imported near the end of the
# notebook — presumably it is provided by `from main import *`; confirm.
for X_selected, n_select in zip(X_selected_list, n_select_list):
    current_loop["select_n"] = n_select

    logging.info(f" - '{feature_select_method}' feature selection selected {n_select} variants. X_selected.shape = {X_selected.shape}. perf_metrics_selection: {perf_metric_select}")

    if len(X_selected.shape) == 3: # boolean encoding of SNP status
        X_selected = X_selected.reshape(X_selected.shape[0], -1) #flatten last feature dims
    # Split features and labels with the precomputed train/val/test indices.
    X_train, X_val, X_test = X_selected[train_indices], X_selected[val_indices], X_selected[test_indices]
    y_train, y_val, y_test = y[train_indices], y[val_indices], y[test_indices]
    
    for n_dim_reduced in n_dim_reduce_list:
        if (n_dim_reduced is None): # use whole feature
            current_loop["n_dim_reduced"] = n_select
            X_train_reduced, X_val_reduced, X_test_reduced = X_train, X_val, X_test 
            logging.info(f" - Using whole features for training: X_train.shape = {X_train_reduced.shape}")
        else:
            # Only reduce when the target dimension is smaller than the number
            # of selected features; otherwise this combination is skipped.
            if (n_dim_reduced < n_select):
                current_loop["n_dim_reduced"] = n_dim_reduced
                try:
                    # feature_transform is defined elsewhere; the log message below describes it as PCA.
                    X_train_reduced, X_val_reduced, X_test_reduced = feature_transform(X_train, X_val, X_test, n = n_dim_reduced)
                    logging.info(f" - Reduced to {n_dim_reduced} features using PCA: X_train_reduced.shape = {X_train_reduced.shape}")
                except Exception as e:
                    logging.error(f"An unexpected error occurred while feature_transform of {current_loop}. {e.__class__.__name__}: {str(e)}")
                    continue
            else:
                continue
            
    
        for train_model in ML_models:
            # Every parameter set listed for this model in hyper_params is trained separately.
            for hyper_param_index, current_hyper_param in enumerate(hyper_params[train_model]):
                current_loop["train_model"] = train_model

                logging.info(f" - Start {train_model} training: X_train.shape = {X_train_reduced.shape} X_test.shape = {X_test_reduced.shape} with hyper_param {current_hyper_param}")

                try:
                    (y_pred_train, y_pred_val, y_pred_test, train_params), perf_metric_train = train_ML(method = train_model, 
                                                                        X_train = X_train_reduced, y_train = y_train, 
                                                                        X_val = X_val_reduced, y_val = y_val, 
                                                                        X_test = X_test_reduced,
                                                                        params = current_hyper_param) 
                except Exception as e:
                    logging.error(f"An unexpected error occurred while train_ML of {current_loop}. {e.__class__.__name__}: {str(e)}")
                    continue 
                # NOTE(review): the path prefixes below do not contain n_dim_reduced, so runs that
                # differ only in the reduced dimension share the same prefix. If evaluate_performance
                # writes files under that prefix they would overwrite each other — confirm.
                eval_metrics_train = evaluate_performance(y_train, y_pred_train, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_train"))
                eval_metrics_val = evaluate_performance(y_val, y_pred_val, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_val"))
                eval_metrics_test = evaluate_performance(y_test, y_pred_test, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_test"))
                logging.info(f' - Train done with Accuracy: {eval_metrics_test["accuracy"]*100:.4f}%, perf_metrics_train: {perf_metric_train}')


                # Flatten everything known about this run into one record. Unpacking current_loop
                # copies its current values, so later mutations do not alter stored records.
                # Confusion matrices are left out of the tabular record.
                merged_metrics = {**current_loop,
                                "hyper_params" : str(current_hyper_param),
                                "model_params" : str(train_params),
                                **{f"preselect_{k}": v for k, v in perf_metric_preselect.items()},
                                **{f"select_{k}": v for k, v in perf_metric_select.items()},
                                **{f"train_{k}": v for k, v in perf_metric_train.items()},
                                **{f"testset_{k}": v for k, v in eval_metrics_test.items() if k != 'confusion_matrix'},
                                **{f"valset_{k}": v for k, v in eval_metrics_val.items() if k != 'confusion_matrix'},
                                **{f"trainset_{k}": v for k, v in eval_metrics_train.items() if k != 'confusion_matrix'},
                                }
                result_combined.append(merged_metrics)

                ## update the dataframe
                # The whole spreadsheet is rewritten after every trained model
                # (presumably so partial results survive an interrupted run).
                results_df = pd.DataFrame(result_combined)
                results_df.to_excel(os.path.join(save_data_path, save_result_file_name), index = False)

# save the selected features

In [12]:
# Files holding the per-feature scores that the cells below load with np.load.
preliminary_selection_file = "43034818_seed42_variance.npy"
secondary_selection_file = "43034818_seed42_variance_1000000_1000000_xgb_basic_feature_importance_mean.npy"

# Number of features kept by the preliminary (variance) and secondary (XGB) steps.
n_preliminary = n_pre_select_list[0]
n_secondary = 8192

# The feature count in the file name is derived from n_secondary so the name
# cannot drift from the actual number of saved features when n_secondary changes.
output_file_name = f"merged_support3_variance_1M_seed_42_xgb_{n_secondary}_matrix.npy"

In [4]:
# Reload the full feature matrix straight from disk. This rebinds X (and the
# path variables) independently of the data_loader-based cell earlier on.
feature_data_path, sample_annotation_file = get_data_path(save_data_path)
feature_file_name = os.path.join(
    feature_data_path,
    f"{target_feature}{target_feature_suffix}",
)
X = np.load(feature_file_name)
print(f"reading features from file {feature_file_name}. Data shape : {X.shape}")

reading features from file /home/jinhyun/data/1kGP/preprocessed/merged_support3_matrix.npy. Data shape : (3202, 43034818)


In [5]:
variances = np.load(preliminary_selection_file)

# There must be exactly one score per column of X; a shorter file would
# otherwise silently select the wrong columns.
assert variances.shape[0] == X.shape[1], (
    f"{preliminary_selection_file} has {variances.shape[0]} entries "
    f"but X has {X.shape[1]} features"
)

# Indices of the n_preliminary largest variances. The start offset is computed
# explicitly because a plain [-n:] slice selects EVERYTHING when n == 0.
variance_order = np.argsort(variances)
first_kept = max(len(variance_order) - n_preliminary, 0)
preliminary_selected_indices = variance_order[first_kept:]

# A boolean mask keeps the selected columns in their original column order.
# NOTE(review): the secondary importance file is indexed against this order —
# presumably the same order select_feature used; confirm.
boolean_mask = np.zeros(X.shape[1], dtype=bool)
boolean_mask[preliminary_selected_indices] = True

X_selected_preliminary = X[:, boolean_mask]

assert boolean_mask.sum() == X_selected_preliminary.shape[1]
print(f"Selected variants based on variance filter. Shape: {X_selected_preliminary.shape}")

Selected variants based on variance filter. Shape: (3202, 1000000)


In [10]:
secondary_feature_importance = np.load(secondary_selection_file)

# Validate BEFORE indexing: the importance vector must have one entry per
# preliminarily selected column, otherwise the indices below are meaningless.
assert secondary_feature_importance.shape[0] == X_selected_preliminary.shape[1], (
    f"{secondary_selection_file} has {secondary_feature_importance.shape[0]} entries "
    f"but the preliminary selection has {X_selected_preliminary.shape[1]} features"
)

# Indices of the n_secondary most important features, most important first.
# The start offset is computed explicitly because a plain [-n:] slice selects
# EVERYTHING when n == 0.
importance_order = np.argsort(secondary_feature_importance)
first_kept = max(len(importance_order) - n_secondary, 0)
secondary_selected_indices = importance_order[first_kept:][::-1]

X_selected_secondary = X_selected_preliminary[:, secondary_selected_indices]
print(f"Selected variants based on XGB. Shape: {X_selected_secondary.shape}")

Selected variants based on XGB. Shape: (3202, 8192)


In [13]:
# Write the final selected-feature matrix into the feature data directory.
output_file_path = os.path.join(feature_data_path, output_file_name)
np.save(output_file_path, X_selected_secondary)
print(f"Saved the result to file {output_file_name}")

Saved the result to file merged_support3_variance_1M_seed_42_xgb_8192_matrix.npy


# hyperparameter optimization

In [None]:
import optuna

def objective(trial, X_train, y_train, X_val, y_val, method):
    """Optuna objective: fit one model with sampled hyper-parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train, y_train: data the model is fitted on.
        X_val, y_val: data the returned score is computed on.
        method: model family to tune, one of "SVM", "RF" or "XGB".

    Returns:
        Accuracy on the validation data. The training accuracy is stored on
        the trial as the user attribute "train_accuracy".

    Raises:
        ValueError: if ``method`` is not one of the supported model names.
    """
    if method == "SVM":
        # Log-uniform search over C. suggest_float(..., log=True) replaces the
        # deprecated suggest_loguniform and samples from the same distribution.
        C = trial.suggest_float('C', 1e-6, 10, log=True)
        # Only the linear kernel is searched; gamma stays at 'scale'.
        kernel = "linear"
        gamma = 'scale'
        model = SVC(C=C, kernel=kernel, gamma=gamma, random_state=RANDOM_SEED)

    elif method == "RF":
        n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
        max_depth = trial.suggest_int('max_depth', 10, 100, step=10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
            random_state=RANDOM_SEED)

    elif method == "XGB":
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
        n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        gamma = trial.suggest_float("gamma", 0.1, 1.0, step=0.1)
        subsample = trial.suggest_float("subsample", 0.6, 1.0, step=0.1)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1)
        # NOTE(review): use_label_encoder appears to be deprecated in recent
        # XGBoost releases — confirm against the installed version.
        model = XGBClassifier(
            learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth,
            gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree,
            objective='multi:softmax', num_class=len(np.unique(y_train)),
            use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_SEED)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`.
        raise ValueError(
            f"Unsupported method '{method}'; expected one of 'SVM', 'RF', 'XGB'.")

    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    # Kept on the trial so train and validation accuracy can be compared later.
    trial.set_user_attr("train_accuracy", train_accuracy)
    return val_accuracy

def visualize_study(study):
    """Plot the optimization history, parallel coordinates and accuracy curves.

    Args:
        study: Optuna study whose trials stored a "train_accuracy" user
            attribute (as the ``objective`` function in this notebook does).

    Only completed trials that carry the "train_accuracy" attribute are used
    for the train-vs-validation plot, so failed or interrupted trials neither
    raise a KeyError nor contribute a ``None`` value.
    """
    optuna.visualization.matplotlib.plot_optimization_history(study)
    plt.show()

    optuna.visualization.matplotlib.plot_parallel_coordinate(study)
    plt.show()

    # Train vs validation accuracy per trial.
    finished_trials = [
        trial for trial in study.trials
        if trial.state == optuna.trial.TrialState.COMPLETE
        and "train_accuracy" in trial.user_attrs
    ]
    # Plot against the trial number so that skipped trials leave a gap rather
    # than shifting the x-axis; with no skipped trials this equals the index.
    trial_numbers = [trial.number for trial in finished_trials]
    train_accuracies = [trial.user_attrs["train_accuracy"] for trial in finished_trials]
    val_accuracies = [trial.value for trial in finished_trials]

    plt.figure(figsize=(10, 5))
    plt.plot(trial_numbers, train_accuracies, label='Train Accuracy')
    plt.plot(trial_numbers, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Trial')
    plt.ylabel('Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.legend()
    plt.show()

In [None]:
# X_train_reduced, X_val_reduced, y_train, y_val and train_model are the values
# left behind by the last iteration of the training cell's loops.
# The sampler is seeded so that the hyper-parameter search is reproducible
# (TPESampler is Optuna's default sampler, so the algorithm is unchanged).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(
    lambda trial: objective(trial, X_train_reduced, y_train, X_val_reduced, y_val, method=train_model),
    n_trials=50,
)

best_params = study.best_trial.params
best_rf_params = best_params  # legacy name, kept for any code that still reads it
# Label the output with the model that was actually tuned (not always RF).
print(f"Best {train_model} Params:", best_params)

In [None]:
# Render the diagnostic plots for the study created in the previous cell.
visualize_study(study)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the stored feature-importance values and order them from smallest to largest.
fi_unsorted = np.load("1048576_seed42_xgb_basic_feature_importance_mean.npy")
ascending_order = np.argsort(fi_unsorted)
fi = fi_unsorted[ascending_order]

In [None]:
# Display the last 50 entries of fi, i.e. the 50 largest values since fi is sorted ascending.
fi[-50:]

In [None]:
# Bar chart of the 100 largest importance values (fi is sorted ascending).
# NOTE(review): the labels mention MDI / mean decrease in impurity, while the
# values are loaded from an "xgb" importance file — confirm the labels fit.
top_importances = pd.Series(fi[-100:])

fig, ax = plt.subplots()
top_importances.plot.bar(ax=ax)
ax.set(
    title="Feature importances using MDI",
    ylabel="Mean decrease in impurity",
)
fig.tight_layout()

In [None]:
# Distribution of all importance values: 100 equal-width bins drawn as a step outline.
counts, bins = np.histogram(fi, bins = 100)
plt.stairs(counts, bins)

In [None]:
# Display the per-bin counts computed by np.histogram in the previous cell.
counts