In [None]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Helper that renders an elapsed duration for the progress messages
def format_time(seconds):
    """Return a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Each field is converted with ``int()``, so fractional seconds are dropped.
    The hour field is not wrapped at 24, so long durations show e.g. ``25:00:00``.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    fields = (whole_hours, whole_minutes, leftover_seconds)
    return ":".join(f"{int(field):02d}" for field in fields)



# Read the train and test tables from the Kaggle input directory.
print("Wczytywanie danych...")
train = pd.read_csv('/kaggle/input/test-train-all-data/train.csv', sep=',')
test = pd.read_csv('/kaggle/input/test-train-all-data/test.csv', sep=',')

# The 'Kategoria_mocy' column is removed from both tables before modelling.
train = train.drop('Kategoria_mocy', axis=1)
test = test.drop('Kategoria_mocy', axis=1)

# Sanity report: total number of missing cells per table, then table dimensions.
train_missing = train.isna().sum().sum()
print(f"Brakujące wartości w train: {train_missing}")
test_missing = test.isna().sum().sum()
print(f"Brakujące wartości w test: {test_missing}")

print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")

# Columns with object/category dtype are the ones passed to CatBoost as categorical.
categorical_features = list(
    train.select_dtypes(include=['object', 'category']).columns
)
print(f"Zmienne kategoryczne: {categorical_features}")

# 'Cena' is the regression target; every remaining column is a feature.
y = train['Cena']
X = train.drop('Cena', axis=1)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CatBoost pools are built once and reused by every Optuna trial.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=categorical_features)

# Optuna logs at INFO level during the search.
optuna.logging.set_verbosity(optuna.logging.INFO)

def objective(trial):
    """Optuna objective: fit one CatBoost regressor and score it on the hold-out split.

    Args:
        trial: Optuna trial used to draw the tunable hyperparameters.

    Returns:
        RMSE of the fitted model's predictions for ``X_val`` against ``y_val``.
    """
    # Hyperparameters explored by the study (drawn in a fixed order).
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.08, log=True),
        'depth': trial.suggest_int('depth', 6, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 20.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'max_leaves': trial.suggest_int('max_leaves', 31, 64),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
    }

    # Settings that are identical for every trial.
    fixed = {
        'iterations': 3000,
        'grow_policy': 'Lossguide',
        'early_stopping_rounds': 200,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'thread_count': -1,
        'random_seed': 42,
        'verbose': 0,
        'task_type': 'GPU',
        'devices': '0',
    }

    regressor = CatBoostRegressor(**fixed, **searched)
    # The validation pool is the eval set, so early stopping is driven by it.
    regressor.fit(train_pool, eval_set=val_pool, verbose=False)

    val_predictions = regressor.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_predictions))


print("\nRozpoczynanie optymalizacji hiperparametrów z Optuna...")
start_time = time.time()
# Seed the sampler so the sequence of suggested hyperparameters does not vary
# from run to run. TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"\nOptymalizacja zakończona po {format_time(time.time() - start_time)}")
print(f"Najlepszy RMSE: {study.best_value}")
print("Najlepsze parametry:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

print("\nTworzenie wizualizacji wyników optymalizacji...")
# Each plot is attempted independently, so one failing plot does not prevent
# the others from being written. Plotting stays best-effort: errors are
# reported and the notebook keeps running.
plot_targets = (
    ('plot_optimization_history', 'optimization_history.png'),
    ('plot_param_importances', 'param_importances.png'),
    ('plot_slice', 'param_slices.png'),
)
for plot_name, image_path in plot_targets:
    try:
        fig = getattr(optuna.visualization, plot_name)(study)
        fig.write_image(image_path)
    except Exception as e:
        print(f"Błąd podczas tworzenia wizualizacji: {e}")


In [None]:
print("\nTrenowanie finalnego modelu z najlepszymi parametrami...")
# study.best_params contains only the values drawn through trial.suggest_*,
# so every fixed setting used during the search has to be supplied again here.
best_params = study.best_params.copy()
best_params['iterations'] = 5000  # more boosting rounds than the 3000 used in the search
best_params['verbose'] = 100
best_params['early_stopping_rounds'] = 300
best_params['task_type'] = 'GPU'
best_params['devices'] = '0'
best_params['grow_policy'] = 'Lossguide'
# Keep the objective, metric and seed identical to the search trials.
best_params['loss_function'] = 'RMSE'
best_params['eval_metric'] = 'RMSE'
best_params['random_seed'] = 42

final_model = CatBoostRegressor(**best_params)

print("Rozpoczęcie trenowania finalnego modelu...")
start_time = time.time()
# Early stopping needs a validation set to monitor; without eval_set the
# early_stopping_rounds setting above has nothing to act on.
final_model.fit(train_pool, eval_set=val_pool)
print(f"Trenowanie zakończone po {format_time(time.time() - start_time)}")

# Evaluate the final model on the validation set.
# NOTE: this split was also used for hyperparameter selection and early
# stopping, so these scores are optimistic estimates of unseen-data error.
val_preds = final_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
r2 = r2_score(y_val, val_preds)
print(f"RMSE na zbiorze walidacyjnym: {rmse}")
print(f"R2 na zbiorze walidacyjnym: {r2}")


In [None]:
# Generate predictions for the test set
print("\nGenerowanie predykcji dla zbioru testowego...")
# Select exactly the training feature columns, in training order, so the pool
# matches what the model was fitted on even if test.csv has extra or
# reordered columns. A missing feature column raises a KeyError here.
test_pool = Pool(test[X.columns], cat_features=categorical_features)
predictions = final_model.predict(test_pool)

# Save the results
result = pd.DataFrame({"ID": test["ID"], "Cena": predictions})
result.to_csv("CatBoost_result_2.csv", index=False)
print("Wyniki zapisane do pliku CatBoost_result_2.csv")

# Feature-importance chart
print("\nTworzenie wizualizacji ważności cech...")
feature_importance = final_model.get_feature_importance()
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)

top_features = importance_df.head(20)
importance_fig, importance_ax = plt.subplots(figsize=(12, 10))
importance_ax.barh(top_features['Feature'], top_features['Importance'])
# barh places the first row at the bottom; flip the axis so the most
# important feature is at the top of the chart.
importance_ax.invert_yaxis()
importance_ax.set_title('Top 20 najważniejszych cech')
importance_fig.tight_layout()
importance_fig.savefig('feature_importance.png')
print("Wizualizacja ważności cech zapisana do pliku feature_importance.png")

print("\nCały proces zakończony!")