In [1]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Load the data: the feature matrix and the target labels are read from
# space-delimited text files located relative to the working directory.
x_train_path = "../data/x_train.txt"
y_train_path = "../data/y_train.txt"

x_data = np.loadtxt(x_train_path, delimiter=" ")
y_data = np.loadtxt(y_train_path, delimiter=" ")

# Print the array dimensions as a quick sanity check on what was loaded.
print("X shape:", x_data.shape)
print("Y shape:", y_data.shape)

# Split the data into a training set and a test set; test_size is the
# fraction of samples held out for testing.
# NOTE(review): no random_state is passed, so the shuffled split (and every
# profit computed from it) will differ from run to run.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=test_size, shuffle=True
)

def get_model(model_type="random_forest"):
    """Build a new, unfitted classifier for the requested model name.

    Args:
        model_type: One of "random_forest", "gradient_boosting", "svm",
            "logistic_regression" or "knn".

    Returns:
        A freshly constructed scikit-learn estimator:
        RandomForestClassifier(n_estimators=100),
        GradientBoostingClassifier(n_estimators=100), SVC(kernel='linear'),
        LogisticRegression(max_iter=1000) or
        KNeighborsClassifier(n_neighbors=5) respectively.

    Raises:
        ValueError: If model_type is not one of the supported names.
    """
    # Each entry pairs a model name with a zero-argument factory, so an
    # estimator is only instantiated for the name that actually matches.
    registry = (
        ("random_forest", lambda: RandomForestClassifier(n_estimators=100)),
        ("gradient_boosting", lambda: GradientBoostingClassifier(n_estimators=100)),
        ("svm", lambda: SVC(kernel='linear')),
        ("logistic_regression", lambda: LogisticRegression(max_iter=1000)),
        ("knn", lambda: KNeighborsClassifier(n_neighbors=5)),
    )

    for name, build in registry:
        if model_type == name:
            return build()

    raise ValueError(f"Unknown model type: {model_type}")

def test_model(X_train, X_test, y_train, y_test, selected_features, model_type="random_forest"):
    """Fit a model on a subset of columns and score it with the profit formula.

    Args:
        X_train: Training feature array, indexed as X_train[:, columns].
        X_test: Test feature array, indexed the same way.
        y_train: Training labels passed to the model's fit().
        y_test: Test labels compared against the predictions.
        selected_features: Column indices to keep from both feature arrays.
        model_type: Model name forwarded to get_model().

    Returns:
        The profit: 20 for every test sample whose true label and predicted
        label are both 1, minus 200 for every selected feature.
    """
    # Restrict both feature arrays to the chosen columns before building the
    # model, matching the order in which failures would surface.
    train_columns = X_train[:, selected_features]
    test_columns = X_test[:, selected_features]

    classifier = get_model(model_type)
    classifier.fit(train_columns, y_train)
    predictions = classifier.predict(test_columns)

    # Only correctly predicted positives earn revenue.
    is_true_positive = (y_test == 1) & (predictions == 1)
    revenue = np.sum(is_true_positive) * 20
    feature_cost = len(selected_features) * 200
    return revenue - feature_cost

def random_column_selection(num_features, min_cols=3, max_cols=8):
    """Pick a random set of distinct column indices.

    The number of columns is drawn uniformly from [min_cols, max_cols] and
    the indices are then sampled without replacement from
    range(num_features).

    Both bounds are clamped to num_features so that a dataset with fewer
    columns than max_cols (or even min_cols) never asks random.sample for
    more items than exist; in that case at most all columns are returned.
    When num_features >= max_cols the draw is identical to an unclamped one.

    Args:
        num_features: Total number of available columns.
        min_cols: Smallest number of columns to select.
        max_cols: Largest number of columns to select.

    Returns:
        A list of distinct column indices in random order.

    Raises:
        ValueError: If min_cols > max_cols (empty range for random.randint)
            or if the resulting sample size is negative.
    """
    # Never request more columns than are available.
    upper = min(max_cols, num_features)
    lower = min(min_cols, upper)

    num_selected_features = random.randint(lower, upper)
    return random.sample(range(num_features), num_selected_features)

def find_best_columns(X_train, X_test, y_train, y_test, iterations=100, model_type="random_forest"):
    """Random search for the column subset with the highest profit.

    On every iteration a random subset of columns is drawn with
    random_column_selection() and scored with test_model(); the subset with
    the strictly highest profit seen so far is remembered. Progress is
    printed once per iteration, preceded by a blank line whenever a new
    best is found.

    Args:
        X_train: Training feature array; its second dimension gives the
            number of available columns.
        X_test: Test feature array.
        y_train: Training labels.
        y_test: Test labels.
        iterations: Number of random subsets to evaluate.
        model_type: Model name forwarded to test_model().

    Returns:
        A tuple (best_features, best_profit). With zero iterations this is
        (None, -np.inf).
    """
    column_count = X_train.shape[1]
    best_profit, best_features = -np.inf, None

    for i in range(iterations):
        candidate = random_column_selection(column_count)
        profit = test_model(X_train, X_test, y_train, y_test, candidate, model_type)

        improved = profit > best_profit
        if improved:
            best_features, best_profit = candidate, profit
            # Blank line visually separates each new best from earlier output.
            print()

        print(f"Iteration {i+1}/{iterations}, Profit: {profit}, Best Profit: {best_profit}")

    return best_features, best_profit

# Settings for the random search.
iterations = 1_000  # Number of random column-selection iterations
model_type = "random_forest"

# Find the best columns: run the random search on the split created above.
best_features, best_profit = find_best_columns(X_train, X_test, y_train, y_test, iterations, model_type)

# Report the winning column indices and the profit they achieved.
print(f"Best features: {best_features}")
print(f"Best profit: {best_profit}")


X shape: (5000, 500)
Y shape: (5000,)
Iteration 1/1000, Profit: 3240, Best Profit: 3240
Iteration 2/1000, Profit: 3320, Best Profit: 3320
Iteration 3/1000, Profit: 3340, Best Profit: 3340
Iteration 4/1000, Profit: 3240, Best Profit: 3340
Iteration 5/1000, Profit: 4020, Best Profit: 4020
Iteration 6/1000, Profit: 4120, Best Profit: 4120
Iteration 7/1000, Profit: 4100, Best Profit: 4120
Iteration 8/1000, Profit: 4000, Best Profit: 4120
Iteration 9/1000, Profit: 3920, Best Profit: 4120
Iteration 10/1000, Profit: 4140, Best Profit: 4140
Iteration 11/1000, Profit: 3040, Best Profit: 4140
Iteration 12/1000, Profit: 3360, Best Profit: 4140
Iteration 13/1000, Profit: 2920, Best Profit: 4140
Iteration 14/1000, Profit: 3960, Best Profit: 4140
Iteration 15/1000, Profit: 3780, Best Profit: 4140
Iteration 16/1000, Profit: 3260, Best Profit: 4140
Iteration 17/1000, Profit: 4320, Best Profit: 4320
Iteration 18/1000, Profit: 4060, Best Profit: 4320
Iteration 19/1000, Profit: 4200, Best Profit: 4320
It