In [24]:
import numpy as np
import pandas as pd
import pickle
import warnings
from sklearn.metrics import recall_score
warnings.filterwarnings('ignore')

In [25]:
# Load the feature matrix from disk (presumably the Boruta/CFS-selected
# features, judging by the file name).
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('X_boruta_cfs.pickle', mode='rb') as pickled_features:
    X = pickle.load(pickled_features)

In [26]:
# Target values used by every model below; presumably one label per row of X
# (TODO confirm against the data file).
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

In [27]:
# Sanity check on the loaded feature matrix: (rows, columns).
X.shape

(5000, 5)

In [31]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tqdm import tqdm

# Fixed seed so the stochastic estimators produce the same scores on every
# "Restart & Run All"; without it the printed recalls change between runs.
RANDOM_STATE = 42

# Candidate classifiers to compare. GaussianNB is left unseeded because it has
# no random_state parameter.
models = [
    LinearSVC(dual=True, random_state=RANDOM_STATE),
    GaussianNB(),
    RandomForestClassifier(random_state=RANDOM_STATE),
    MLPClassifier(random_state=RANDOM_STATE),
]

def test_models(models, X, y):
    """Print two 10-fold recall estimates for each estimator in ``models``.

    The first figure comes from a hand-written loop over an unshuffled
    ``KFold``; the second from ``cross_val_score(..., cv=10, scoring="recall")``.
    ``cross_val_score`` chooses its own folds (stratified for classifiers,
    according to the scikit-learn documentation), so the two numbers are not
    expected to match exactly.

    Parameters
    ----------
    models : sequence of estimators
        Objects exposing ``fit`` and ``predict``. Each one is fitted in place,
        so the caller's instances are left trained on the last fold.
    X : array-like
        Feature matrix; must support indexing with an integer index array
        (e.g. a NumPy array).
    y : array-like
        Labels aligned with the rows of ``X``; indexed the same way.

    Returns
    -------
    None
        Results are only printed, one line per model.
    """
    # The splitter has no random state (shuffle=False), so one instance can be
    # shared by every model and yields the same folds each time.
    splitter = KFold(n_splits=10, shuffle=False)

    per_model_recalls = []
    for estimator in tqdm(models):
        fold_recalls = []
        for fit_idx, eval_idx in splitter.split(X):
            estimator.fit(X[fit_idx], y[fit_idx])
            predictions = estimator.predict(X[eval_idx])
            fold_recalls.append(recall_score(y[eval_idx], predictions))
        per_model_recalls.append(fold_recalls)

    # Reporting happens only after every manual fit has finished, matching the
    # order in which the estimators consume randomness.
    for estimator, fold_recalls in zip(models, per_model_recalls):
        manual_mean = np.mean(fold_recalls)
        library_mean = np.mean(cross_val_score(estimator, X, y, cv=10, scoring="recall"))
        print(f'model: {type(estimator).__name__},\t recall: {manual_mean},\t cross val score: {library_mean}')

# Print the recall comparison for every candidate model on the loaded X / y.
test_models(models, X, y)

100%|██████████| 4/4 [00:47<00:00, 11.82s/it]


model: LinearSVC,	 recall: 0.4773262698036257,	 cross val score: 0.48317269076305225
model: GaussianNB,	 recall: 0.5837566284878676,	 cross val score: 0.5853220883534136
model: RandomForestClassifier,	 recall: 0.6248336565882711,	 cross val score: 0.6181783132530121
model: MLPClassifier,	 recall: 0.6163773363000794,	 cross val score: 0.5845285140562249


In [32]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
    )


# Wrap both splits in xgboost's DMatrix container, which is what xgb.train
# and Booster.predict are given below.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# NOTE(review): no `evals` list is passed to xgb.train below, so the
# eval_metric is presumably never reported during training -- confirm.
params = {
    'objective': 'binary:logistic',  # Objective: binary classification
    'max_depth': 5,  # Maximum tree depth
    'eta': 0.1,  # Learning rate
    'eval_metric': 'logloss'  # Evaluation metric: log loss
}

# Number of boosting rounds, passed positionally to xgb.train.
num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

# Turn the booster's scores into hard 0/1 labels with a 0.5 cut-off (with the
# 'binary:logistic' objective these scores are probabilities).
y_pred_proba = bst.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)

# Score the hard predictions on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Test Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')


Test Accuracy: 0.6640
Recall: 0.6280
