In [1]:
from main import *

In [7]:
target_feature = "merged_support3_variance_0.1" # Real_data
# target_feature = "merged_random_1k" # Test_data
target_feature_suffix = "_matrix.npy"
# target_feature_suffix = "_matrix_onehot.npy"
save_data_path = "./results_test"


feature_select_method = "random" #["random", "xgb", "rf", "variance", "chi2", "f_classif", "mutual_info_classif"] # Extra-trees
n_select = 131072
n_dim_reduced = 128 #[128, 1024, None]  ## list should always contain None to perform whole feature training after selection
train_model = "SVM" #["SVM", "XGB", "RF", "DT", "KNN"]

In [3]:
feature_data_path, sample_annotation_file = get_data_path(save_data_path)

dataset = data_loader(os.path.join(feature_data_path, target_feature + target_feature_suffix), 
                        sample_annotation_file)
(X, y_original, y), (train_indices, val_indices, test_indices), label_mapping = dataset.get_data()

In [5]:
X_selected, perf_metric_select = select_feature(X = X, y = y, method = feature_select_method, n = n_select, train_idx = train_indices, val_idx = val_indices) 
print(f" - '{feature_select_method}' feature selection selected {n_select} variants. X_selected.shape = {X_selected.shape}. perf_metrics_selection: {perf_metric_select}")

if len(X_selected.shape) == 3: # boolean encoding of SNP status
    X_selected = X_selected.reshape(X_selected.shape[0], -1) #flatten last feature dims
X_train, X_val, X_test = X_selected[train_indices], X_selected[val_indices], X_selected[test_indices]
y_train, y_val, y_test = y[train_indices], y[val_indices], y[test_indices]


'random' feature selection selected 131072 variants. X_selected.shape = (3201, 131072). perf_metrics_selection: {'cpu_time': 0.005127016333333737, 'wall_time': 0.007326344648996989, 'memory_usage': 15.81478500366211}


In [6]:
if (n_dim_reduced is None): # use whole feature
    n_dim_reduced = n_select
    X_train_reduced, X_val_reduced, X_test_reduced = X_train, X_val, X_test 
    print(f" - Using whole features for training: X_train.shape = {X_train_reduced.shape}")
else:
    if (n_dim_reduced < n_select):
        X_train_reduced, X_val_reduced, X_test_reduced = feature_transform(X_train, X_val, X_test, n = n_dim_reduced)
        print(f" - Reduced to {n_dim_reduced} features using PCA: X_train_reduced.shape = {X_train_reduced.shape}")
    else:
        raise ValueError

 - Reduced to 128 features using PCA: X_train_reduced.shape = (1920, 128)


# implement your changes

In [8]:
import optuna

def objective(trial, X_train, y_train, X_val, y_val, method):
    if method == "SVM":
        C = trial.suggest_loguniform('C', 1e-4, 10)  # Log-uniform distribution for C
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid'])  # Categorical distribution for kernel type
        if kernel == 'rbf' or kernel == 'sigmoid':
            gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])  # Categorical for gamma if relevant
        else:
            gamma = 'scale'  # Default to 'scale' for linear to avoid irrelevance
        model = SVC(C=C, kernel=kernel, gamma=gamma, random_state=RANDOM_SEED)
        
    elif method == "RF":
        n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
        max_depth = trial.suggest_int('max_depth', 10, 100, step=10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
            random_state=RANDOM_SEED)
    elif method == "XGB":
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
        n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        gamma = trial.suggest_float("gamma", 0.1, 1.0, step=0.1)
        subsample = trial.suggest_float("subsample", 0.6, 1.0, step=0.1)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.1)
        model = XGBClassifier(
            learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth,
            gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree,
            objective='multi:softmax', num_class=len(np.unique(y_train)),
            use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_SEED)
    
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    trial.set_user_attr("train_accuracy", train_accuracy)
    return val_accuracy

def visualize_study(study):
    optuna.visualization.matplotlib.plot_optimization_history(study)
    plt.show()

    optuna.visualization.matplotlib.plot_parallel_coordinate(study)
    plt.show()

    # Additional plotting can be customized based on user's needs.
    # For instance, plot train vs validation accuracy:
    train_accuracies = [trial.user_attrs["train_accuracy"] for trial in study.trials]
    val_accuracies = [trial.value for trial in study.trials]

    plt.figure(figsize=(10, 5))
    plt.plot(train_accuracies, label='Train Accuracy')
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.xlabel('Trial')
    plt.ylabel('Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.legend()
    plt.show()

In [9]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, method=train_model), n_trials=50)

best_rf_params = study.best_trial.params
print("Best RF Params:", best_rf_params)

[I 2024-04-24 12:37:47,859] A new study created in memory with name: no-name-10e48293-9b3a-42b4-b7e5-be11f740b59b
[I 2024-04-24 12:54:06,727] Trial 0 finished with value: 0.165625 and parameters: {'C': 0.00012219443950894319, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.165625.
[I 2024-04-24 12:58:21,940] Trial 1 finished with value: 0.9125 and parameters: {'C': 0.29029634172358115, 'kernel': 'linear'}. Best is trial 1 with value: 0.9125.
[I 2024-04-24 13:24:39,951] Trial 2 finished with value: 0.165625 and parameters: {'C': 0.006921880356972901, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 1 with value: 0.9125.


In [None]:
visualize_study(study)

In [None]:
print(f" - Start {train_model} training: X_train.shape = {X_train_reduced.shape} X_test.shape = {X_test_reduced.shape} with hyper_param {current_hyper_param}")

(y_pred_train, y_pred_val, y_pred_test, train_params), perf_metric_train = train_ML(method = train_model, 
                                                    X_train = X_train_reduced, y_train = y_train, 
                                                    X_val = X_val_reduced, y_val = y_val, 
                                                    X_test = X_test_reduced,
                                                    params = current_hyper_param) 

eval_metrics_train = evaluate_performance(y_train, y_pred_train, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_train"))
eval_metrics_val = evaluate_performance(y_val, y_pred_val, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_val"))
eval_metrics_test = evaluate_performance(y_test, y_pred_test, label_mapping, os.path.join(save_data_path, f"{feature_select_method}_{n_select}_{train_model}_{hyper_param_index}_test"))
logging.info(f' - Train done with Accuracy: {eval_metrics_test["accuracy"]*100:.4f}%, perf_metrics_train: {perf_metric_train}')
