In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

import warnings
#warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)

In [16]:
data = pd.read_csv('3old_df_FP_VAF_all.csv',nrows=100)

In [17]:
def train_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc=False):
    grid_search = GridSearchCV(model, param_grid, cv=cv,
                               scoring='neg_mean_absolute_error', n_jobs=5, return_train_score=True)
    grid_search.fit(X_train, y_train)
 
    best_model = grid_search.best_estimator_
 
    print(f"\nResults for {model.__class__.__name__} - Cross Validation:")
    print("Best Parameters:", grid_search.best_params_)
    
    predictions = best_model.predict(X_test)
 
    print("\nResults for", model.__class__.__name__, "- Testing:")
    print("Mean Absolute Error:", mean_absolute_error(y_test, predictions))
    print("Mean Squared Error:", mean_squared_error(y_test, predictions))
    print("R^2 Score:", r2_score(y_test, predictions))
    
def random_forest_regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4, 8, 16, 20]
    }
 
    rf_model = RandomForestRegressor()
    train_and_evaluate(rf_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def KNeighbors_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
    }
 
    knn_model = KNeighborsRegressor()
    train_and_evaluate(knn_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def XGB_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
 
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
    train_and_evaluate(xgb_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def Extra_Trees_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
 
    et_model = ExtraTreesRegressor()
    train_and_evaluate(et_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def Gradient_Boosting_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
 
    gbr_model = GradientBoostingRegressor()
    train_and_evaluate(gbr_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def Decision_Tree_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
 
    dt_model = DecisionTreeRegressor()
    train_and_evaluate(dt_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)
    
def Bayesian_Ridge(X_train, y_train, X_test, y_test, cv, save_auroc=False):
    param_grid = {
    }
 
    br_model = BayesianRidge()
    train_and_evaluate(br_model, param_grid, X_train, y_train, X_test, y_test, cv, save_auroc)

In [18]:
#data = pd.read_csv('/home/users/ntu/leiz0003/NTU_Tri3/df_FP_VAF_all.csv')

fp_score_columns = [col for col in data.columns if 'fp_score' in col]
vaf_columns = [col for col in data.columns if 'VAF' in col]
X = data[vaf_columns]
y = data['2GAMBDQ_sample01_fp_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

random_forest_regressor(X_train, y_train, X_test, y_test, cv, save_auroc=True)
KNeighbors_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=True)
XGB_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=True)
Extra_Trees_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=True)
Gradient_Boosting_Regressor(X_train, y_train, X_test, y_test, cv, save_auroc=True)
Bayesian_Ridge(X_train, y_train, X_test, y_test, cv, save_auroc=True)


Results for RandomForestRegressor - Cross Validation:
Best Parameters: {'max_depth': None, 'min_samples_leaf': 20, 'min_samples_split': 10, 'n_estimators': 100}

Results for RandomForestRegressor - Testing:
Mean Absolute Error: 0.09826012919565937
Mean Squared Error: 0.03888910386868828
R^2 Score: -0.0077706431943598275

Results for KNeighborsRegressor - Cross Validation:
Best Parameters: {'n_neighbors': 9, 'weights': 'distance'}

Results for KNeighborsRegressor - Testing:
Mean Absolute Error: 0.12244740280189319
Mean Squared Error: 0.03801532998590394
R^2 Score: 0.014872297379259325


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  if is_sparse(data):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_s


Results for XGBRegressor - Cross Validation:
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}

Results for XGBRegressor - Testing:
Mean Absolute Error: 0.09386591445560118
Mean Squared Error: 0.03425065204294163
R^2 Score: 0.11243000724084973

Results for ExtraTreesRegressor - Cross Validation:
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}

Results for ExtraTreesRegressor - Testing:
Mean Absolute Error: 0.08465043513193128
Mean Squared Error: 0.031692879578041816
R^2 Score: 0.17871201802722614

Results for GradientBoostingRegressor - Cross Validation:
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}

Results for GradientBoostingRegressor - Testing:
Mean Absolute Error: 0.10280168160803041
Mean Squared Error: 0.038615506030725465
R^2 Score: -0.0006806400389418599

Results for BayesianRidge - Cross Validation:
Best Parameters: {}

Results for BayesianRidge - Testing:
Mean Ab