In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e25/sample_submission.csv
/kaggle/input/playground-series-s3e25/train.csv
/kaggle/input/playground-series-s3e25/test.csv


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import median_absolute_error
import lightgbm as lgb
import joblib
import matplotlib.pyplot as plt
import numpy as np

In [8]:
train_data = pd.read_csv('/kaggle/input/playground-series-s3e25/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e25/test.csv')

In [9]:
train_data = train_data.drop(columns=['id'])

In [10]:
y = train_data['Hardness']
X = train_data.drop(columns='Hardness')

In [11]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [12]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [15]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

In [16]:
model = lgb.LGBMRegressor()

In [17]:
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200]
}

In [18]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_median_absolute_error')
grid_search.fit(X_train_preprocessed, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 5550, number of used features: 11
[LightGBM] [Info] Start training from score 4.650306
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2514
[LightGBM] [Info] Number of data points in the train set: 5550, number of used features: 11
[LightGBM] [Info] Start training from score 4.634739
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2521
[LightGBM] [Info] Number of data points in the train set: 5550, number of used features: 11
[LightGBM] [Info] Start traini

In [19]:
best_params = grid_search.best_params_
print(f"Найкращі параметри: {best_params}")

Найкращі параметри: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 50}


In [20]:
best_model = grid_search.best_estimator_

In [21]:
y_val_pred = best_model.predict(X_val_preprocessed)

In [22]:
val_median_absolute_error = median_absolute_error(y_val, y_val_pred)
print(f"Median Absolute Error on validation set: {val_median_absolute_error}")

Median Absolute Error on validation set: 0.663929810442792


In [23]:
X_preprocessed = preprocessor.fit_transform(X)
best_model.fit(X_preprocessed, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 10407, number of used features: 11
[LightGBM] [Info] Start training from score 4.647126


In [25]:
id_data = test_data['id']
test_df = test_data.drop(columns=['id'])

In [26]:
X_test_preprocessed = preprocessor.transform(test_df)

In [27]:
y_test_pred = best_model.predict(X_test_preprocessed)

In [28]:
submission_df = pd.DataFrame({
    'id': id_data,
    'Hardness': y_test_pred
})

submission_file_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print("Submission file created at:", submission_file_path)

Submission file created at: /kaggle/working/submission.csv


In [29]:
final_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', best_model)])
joblib.dump(final_model, 'mineral_hardness_model.pkl')

['mineral_hardness_model.pkl']

In [30]:
loaded_model = joblib.load('mineral_hardness_model.pkl')

In [31]:
example_data = X_test.iloc[:5, :]
example_pred = loaded_model.predict(example_data)

print("Predictions:", example_pred)
print("Actual Hardness:", y_test.iloc[:5].values)

NameError: name 'X_test' is not defined