In [19]:
# Make the project's local source directory importable. The path is relative,
# so it is resolved against the working directory the notebook is run from.
import sys
sys.path.append('../src')
# NOTE(review): later cells use `pd` and `train_test_split` without importing
# them, so this wildcard import presumably supplies both — confirm against
# the `modules` package and prefer explicit imports.
from modules import *

In [20]:
# Load the feature table; the first CSV column is used as the row index.
features_csv = '../../data/solubility_features.csv'
data = pd.read_csv(features_csv, index_col=0)

In [21]:
# Separate the regression target from the feature columns.
# NOTE: `data` is rebound to the frame without the target column, so running
# this cell a second time fails on `data[target_name]` until `data` is reloaded.
target_name = "Solubility"
non_feature_columns = [target_name, 'smiles', 'mol']
target = data[target_name]
data = data.drop(columns=non_feature_columns)

# Hold out 20% of the rows for the final test; the split is seeded so it is
# reproducible.
splits = train_test_split(data, target, train_size=0.8, random_state=42)
data_train, data_test, target_train, target_test = splits

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor

# Ordinal-encode categorical columns; categories not seen during fit are
# mapped to -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Apply the encoder to object-dtype columns only and pass every other column
# through unchanged.
object_columns = selector(dtype_include=object)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, object_columns),
    ],
    remainder="passthrough",
)

from sklearn.pipeline import Pipeline

# Full pipeline: column preprocessing followed by a seeded gradient-boosted
# regressor. The step names are the prefixes used later with `set_params`
# (e.g. `regressor__max_depth`).
xgb_regressor = XGBRegressor(random_state=42)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", xgb_regressor),
    ]
)

In [17]:
from itertools import product

from sklearn.model_selection import cross_val_score

# Candidate hyperparameter values for the exhaustive grid search.
learning_rate = [0.01, 0.1,0.15, 1]
max_depth = [1, 3, 5, 7]
n_estimators = [200,300,400,500]

# The pipeline ends in a regressor, so the default cross-validation score is
# R², which can be negative. Starting from -inf guarantees that the first
# evaluated combination is recorded and `best_params` is never left empty.
best_score = float("-inf")
best_params = {}

# `product` walks the grid in the same order as three nested loops
# (n_estimators varies fastest).
for lr, md, ne in product(learning_rate, max_depth, n_estimators):
    print(
        f"Evaluating model with learning rate {lr:.3f}, "
        f"max_depth {md}, and n_estimators {ne}... ",
        end=""
    )
    # Parameters are set in place on the shared pipeline; after the loop the
    # pipeline holds the LAST combination, not the best one.
    model.set_params(
        regressor__learning_rate=lr,
        regressor__max_depth=md,
        regressor__n_estimators=ne
    )
    # 2-fold cross-validation on the training split only; the test split is
    # never touched during the search.
    scores = cross_val_score(model, data_train, target_train, cv=2)
    mean_score = scores.mean()
    print(f"score: {mean_score:.3f}")
    if mean_score > best_score:
        best_score = mean_score
        best_params = {
            "learning_rate": lr,
            "max_depth": md,
            "n_estimators": ne
        }
        print(f"Found new best model with score {best_score:.3f}!")

print(f"The best R² score obtained is {best_score:.3f}")
print(f"The best parameters found are:\n {best_params}")

Evaluating model with learning rate 0.010, max_depth 1, and n_estimators 200... score: 0.559
Found new best model with score 0.559!
Evaluating model with learning rate 0.010, max_depth 1, and n_estimators 300... score: 0.609
Found new best model with score 0.609!
Evaluating model with learning rate 0.010, max_depth 1, and n_estimators 400... score: 0.630
Found new best model with score 0.630!
Evaluating model with learning rate 0.010, max_depth 1, and n_estimators 500... score: 0.644
Found new best model with score 0.644!
Evaluating model with learning rate 0.010, max_depth 3, and n_estimators 200... score: 0.668
Found new best model with score 0.668!
Evaluating model with learning rate 0.010, max_depth 3, and n_estimators 300... score: 0.701
Found new best model with score 0.701!
Evaluating model with learning rate 0.010, max_depth 3, and n_estimators 400... score: 0.715
Found new best model with score 0.715!
Evaluating model with learning rate 0.010, max_depth 3, and n_estimators 500

In [23]:
# `best_params` is expected to have been produced by the grid-search cell and
# to contain all three keys read below.
best_lr, best_md, best_ne = (
    best_params[key]
    for key in ("learning_rate", "max_depth", "n_estimators")
)

# Configure the pipeline's regressor with the best hyperparameters found.
tuned_params = {
    "regressor__learning_rate": best_lr,
    "regressor__max_depth": best_md,
    "regressor__n_estimators": best_ne,
}
model.set_params(**tuned_params)

# Refit on the whole training split, then score on the held-out test split.
model.fit(data_train, target_train)
test_score = model.score(data_test, target_test)

print(f"Test score after the parameter tuning: {test_score:.3f}")

Test score after the parameter tuning: 0.764


In [9]:
# Print the name of every parameter the pipeline exposes, including nested
# `step__param` names that can be passed to `set_params`.
parameter_names = model.get_params(deep=True).keys()
for parameter in parameter_names:
    print(parameter)

memory
steps
verbose
preprocessor
regressor
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__verbose_feature_names_out
preprocessor__cat_preprocessor
preprocessor__cat_preprocessor__categories
preprocessor__cat_preprocessor__dtype
preprocessor__cat_preprocessor__encoded_missing_value
preprocessor__cat_preprocessor__handle_unknown
preprocessor__cat_preprocessor__max_categories
preprocessor__cat_preprocessor__min_frequency
preprocessor__cat_preprocessor__unknown_value
regressor__objective
regressor__base_score
regressor__booster
regressor__callbacks
regressor__colsample_bylevel
regressor__colsample_bynode
regressor__colsample_bytree
regressor__device
regressor__early_stopping_rounds
regressor__enable_categorical
regressor__eval_metric
regressor__feature_types
regressor__gamma
regressor__grow_policy
regressor__importance_type
regressor__interaction_constraints
regres

In [13]:
from itertools import product

from sklearn.model_selection import cross_val_score

# Candidate values for the two hyperparameters searched in this cell.
learning_rate = [0.01, 0.1, 1]
max_depth = [1, 3, 5, 7, 9]

# The pipeline ends in a regressor, so the default cross-validation score is
# R², which can be negative. Starting from -inf guarantees that the first
# evaluated combination is recorded and `best_params` is never left empty.
best_score = float("-inf")
best_params = {}

# `product` walks the grid in the same order as two nested loops
# (max_depth varies fastest).
for lr, md in product(learning_rate, max_depth):
    print(
        (
            f"Evaluating model with learning rate {lr:.3f}"
            f" and max_depth {md}... "
        ),
        end="",
    )
    # Parameters are set in place on the shared pipeline; any other regressor
    # parameter keeps whatever value it already has.
    model.set_params(
        regressor__learning_rate=lr, regressor__max_depth=md
    )
    # 2-fold cross-validation on the training split only.
    scores = cross_val_score(model, data_train, target_train, cv=2)
    mean_score = scores.mean()
    print(f"score: {mean_score:.3f}")
    if mean_score > best_score:
        best_score = mean_score
        best_params = {"learning_rate": lr, "max_depth": md}
        print(f"Found new best model with score {best_score:.3f}!")

print(f"The best R² score obtained is {best_score:.3f}")
print(f"The best parameters found are:\n {best_params}")

Evaluating model with learning rate 0.010 and max_depth 1... score: 0.431
Found new best model with score 0.431!
Evaluating model with learning rate 0.010 and max_depth 3... score: 0.567
Found new best model with score 0.567!
Evaluating model with learning rate 0.010 and max_depth 5... score: 0.598
Found new best model with score 0.598!
Evaluating model with learning rate 0.010 and max_depth 7... score: 0.612
Found new best model with score 0.612!
Evaluating model with learning rate 0.010 and max_depth 9... score: 0.613
Found new best model with score 0.613!
Evaluating model with learning rate 0.100 and max_depth 1... score: 0.680
Found new best model with score 0.680!
Evaluating model with learning rate 0.100 and max_depth 3... score: 0.747
Found new best model with score 0.747!
Evaluating model with learning rate 0.100 and max_depth 5... score: 0.751
Found new best model with score 0.751!
Evaluating model with learning rate 0.100 and max_depth 7... score: 0.747
Evaluating model with 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Baseline pipeline: standardise the features, then fit a single decision
# tree. This rebinds `model`, replacing the pipeline defined earlier.
# The tree is seeded like the train/test split and the boosted model so that
# repeated runs give the same result.
# NOTE(review): StandardScaler requires numeric input — confirm the feature
# table has no object-dtype columns (the earlier pipeline ordinal-encodes them).
model = Pipeline(
    steps=[
        ("preprocessor", StandardScaler()),
        ("regressor", DecisionTreeRegressor(random_state=42)),
    ]
)

In [24]:
from sklearn.model_selection import cross_validate

# Cross-validate the current `model` on the full feature table, using
# cross_validate's default splitting and the estimator's own `score` method.
cv_results = cross_validate(model, data, target)
scores = cv_results["test_score"]
# Both pipelines defined in this notebook end in a regressor, whose default
# score is R² rather than classification accuracy, so it is labelled as such.
print(
    "R² score via cross-validation:\n"
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)

Accuracy score via cross-validation:
0.662 ± 0.112
