In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from collections import OrderedDict
import pickle


# Create a synthetic dataset
with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
    X = pickle.load(handle)
    
# X = np.loadtxt('boruta_10.txt', delimiter=',')
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

In [4]:
# Parameters for the Random Forest
params = OrderedDict(
    [('learning_rate', 0.023318165476353517), 
     ('max_depth', 20), 
     ('n_estimators', 42),
     ('subsample', 0.5)])

# Create the Random Forest model with the specified parameters
model = RandomForestClassifier(
    max_depth=params['max_depth'],
    min_samples_split=params['min_samples_split'],
    n_estimators=params['n_estimators'],
    random_state=42
)

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print cross-validation scores and mean score
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))


Cross-validation scores:  [0.674 0.675 0.644 0.657 0.644]
Mean cross-validation score:  0.6588


In [8]:
# Parameters for the XGBoost
params = OrderedDict(
    [('learning_rate', 0.0027923018344754888), 
     ('max_depth', 100), 
     ('n_estimators', 128), 
     ('subsample', 0.1448290233931525)])

# Create the Random Forest model with the specified parameters
xgb_model = XGBClassifier(
    learning_rate=params['learning_rate'],
    max_depth=params['max_depth'],
    n_estimators=params['n_estimators'],
    subsample=params['subsample'],
    use_label_encoder=False, 
    eval_metric='logloss'
)

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print cross-validation scores and mean score
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))


Cross-validation scores:  [0.674 0.675 0.644 0.657 0.644]
Mean cross-validation score:  0.6588
