<a href="https://colab.research.google.com/github/JakubMarac/LEARN/blob/main/kaggle_predict_calorie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# 1. Load the data

# Single place to edit when the competition files live somewhere else.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount cell is visible in this notebook; confirm before Run All.
DATA_DIR = '/content/drive/MyDrive/kaggle/playground-series-s5e5'

sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df  = pd.read_csv(f'{DATA_DIR}/test.csv')

# Quick sanity check of the loaded frames.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (750000, 9)
Test shape: (250000, 8)


In [None]:
# 2. Encode the 'Sex' column as integer labels

# The encoder is fitted on the training column only and then applied to both
# frames, so train and test share one label -> integer mapping.
le = LabelEncoder().fit(train_df['Sex'])
for frame in (train_df, test_df):
    frame['Sex_Encoded'] = le.transform(frame['Sex'])


In [None]:
# 3. Build the feature matrix X and the target y

# The raw 'Sex' strings are dropped because 'Sex_Encoded' carries that information.
# NOTE(review): every other column stays in X — presumably including 'id', which is
# then used as a model feature. If it is removed here, the test-set features used
# for the Kaggle prediction must be changed the same way.
y = train_df["Calories"]
X = train_df.drop(["Calories", "Sex"], axis=1)

In [None]:
# 4. Train / validation split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)

X_train: (600000, 8) X_valid: (150000, 8)


In [42]:
# 5. Model: XGBoost (XGBRegressor) tuned with RandomizedSearchCV

# Base regressor; the hyperparameters listed in param_dist are set per candidate
# by the search.
xgb = XGBRegressor(
    n_jobs=1,                       # important: only one thread per model
    tree_method="hist",             # faster and less memory-hungry
    random_state=42,
    objective="reg:squarederror",
)

# Candidate values sampled by the random search (kept deliberately small).
param_dist = {
    "n_estimators": [300, 500],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.03, 0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_weight": [1, 3],
    "reg_lambda": [1.0, 5.0, 10.0],
}

# 10 sampled combinations x 3 folds = 30 fits, ranked by negative MSE.
xgb_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    random_state=42,
    verbose=2,
    n_jobs=1,                       # cross-validation also runs on a single worker
)
xgb_search.fit(X_train, y_train)

print("\n Najlepsze parametry XGBoost")
print(xgb_search.best_params_)

# Best candidate found by the search; used for evaluation and prediction below.
best_model = xgb_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=8, min_child_weight=3, n_estimators=300, reg_lambda=10.0, subsample=0.8; total time=  23.6s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=8, min_child_weight=3, n_estimators=300, reg_lambda=10.0, subsample=0.8; total time=  16.0s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=8, min_child_weight=3, n_estimators=300, reg_lambda=10.0, subsample=0.8; total time=  15.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300, reg_lambda=5.0, subsample=1.0; total time=   7.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300, reg_lambda=5.0, subsample=1.0; total time=   9.0s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, min_child_weight=1, n_estimators=300, reg_lambda=5.0, subsample=1.0; total time=   8.3s
[CV] END colsample_bytree=0

In [45]:
# 6. Model evaluation (train / validation)

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))`` element by
    element (positionally).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; expected to be non-negative.
    y_pred : array-like
        Predicted values. Negative predictions are clipped to 0 first, because
        ``log1p`` is undefined below -1 and a single such value would turn the
        whole score into NaN.

    Returns
    -------
    float
        The RMSLE score (lower is better).
    """
    y_true = np.asarray(y_true)
    # A squared-error regressor may emit negative values; floor them at 0.
    y_pred = np.clip(np.asarray(y_pred), 0, None)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# Predictions of the tuned model on both splits.
# NOTE: run this cell before best_model is refitted on the full data; after that
# refit the model has seen the validation rows and these scores are optimistic.
y_train_pred = best_model.predict(X_train)
y_valid_pred = best_model.predict(X_valid)

# RMSE on train vs. validation exposes the generalisation gap.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_valid_pred))

mae_valid  = mean_absolute_error(y_valid, y_valid_pred)
r2_valid   = r2_score(y_valid, y_valid_pred)

rmsle_valid = rmsle(y_valid, y_valid_pred)

print("\n=== Wyniki modelu XGBoost ===")
print(f"Valid RMSLE: {rmsle_valid:.4f}")   # label fixed: was misspelled "RMLSE"
print(f"Train RMSE: {rmse_train:.4f}")
print(f"Valid RMSE: {rmse_valid:.4f}")
print(f"Valid MAE : {mae_valid:.4f}")
print(f"Valid R^2 : {r2_valid:.4f}")



=== Wyniki modelu XGBoost ===
Valid RMLSE: 0.0590
Train RMSE: 3.3473
Valid RMSE: 3.3298
Valid MAE : 2.0956
Valid R^2 : 0.9971


In [46]:
# 7. Final training on the whole train_df (for the submission)

# Refit the tuned estimator in place on every training row (train + validation).
# From here on, scores computed on X_valid are no longer an honest hold-out estimate.
best_model.fit(X=X, y=y)

In [47]:
# 8. Predictions for test_df (Kaggle)

# Select exactly the columns the model was trained on, in the same order, so the
# test features cannot silently drift from the training features.
X_kaggle = test_df[X.columns]

# The squared-error model can output negative values; this notebook evaluates with
# RMSLE, which relies on log1p and breaks on negative inputs, so floor them at 0.
y_kaggle_pred = np.clip(best_model.predict(X_kaggle), 0, None)

In [48]:
# 9. Submission

# Two-column frame: the test ids plus the predicted Calories.
submission = test_df[["id"]].assign(Calories=y_kaggle_pred)

# Written to the current working directory without the index column.
submission.to_csv("submission_rf.csv", index=False)
print("\nPlik 'submission_rf.csv' zapisany.")


Plik 'submission_rf.csv' zapisany.


In [None]:
#cross,