In [1]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import numpy as np


In [2]:
x_train_path = "../data/x_train.txt"
y_train_path = "../data/y_train.txt"

x_data = np.loadtxt(x_train_path, delimiter=' ')
y_data = np.loadtxt(y_train_path, delimiter=' ')

print("X shape:", x_data.shape)
print("Y shape:", y_data.shape)

standard_scaler = MinMaxScaler(feature_range=(0, 1))
x_data_standardized = standard_scaler.fit_transform(x_data, y_data)

polunomial_builder = PolynomialFeatures()
x_df_poly = polunomial_builder.fit_transform(x_data_standardized)

print("X polynomialed shape:", x_df_poly.shape)

x_train_poly, x_test_poly, y_train, y_test = train_test_split(
    x_df_poly, y_data, test_size=0.2, random_state=42
    )

feature_names = [f"feature {i}" for i in range(x_df_poly.shape[1])]

with open("the_best_features_2.txt", "r") as f:
    saved_features = [
        line.strip()
        for line in f.readlines()
        if line.strip() != ""
        ]

print(f"X end shape: ({x_data.shape[0]}, {len(saved_features)})")


X shape: (5000, 500)
Y shape: (5000,)
X polynomialed shape: (5000, 125751)
X end shape: (5000, 10)


In [3]:
top_indices = [feature_names.index(feature) for feature in saved_features]

x_train_top = x_train_poly[:, top_indices]
x_test_top = x_test_poly[:, top_indices]

dtrain = xgb.DMatrix(x_train_top, label=y_train)
dtest = xgb.DMatrix(x_test_top, label=y_test)


In [4]:
params = {
    'objective': 'binary:logistic',  # Cel: klasyfikacja binarna
    'max_depth': 5,  # Maksymalna głębokość drzewa
    'eta': 0.1,  # Learning rate
    'eval_metric': 'logloss'  # Metryka ewaluacyjna: log loss
}

num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

y_pred_proba = bst.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.6890
