In [None]:
import pandas as pd
import smogn
import optuna
from src.model import BaseLineModel
from sklearn.metrics import mean_squared_error, r2_score
import random
import numpy as np

# For silencing smogn
import os
from contextlib import redirect_stdout, redirect_stderr

  from .autonotebook import tqdm as notebook_tqdm


In [103]:
base_value = 0

df_train = pd.read_csv(f'./data/evaluation/train.csv')
X_train = df_train.drop(columns=['strength'])
y_train = df_train['strength']
df_test = pd.read_csv(f'./data/evaluation/test.csv')
X_test = df_test.drop(columns=['strength'])
y_test = df_test['strength']

model = BaseLineModel(
        n_estimators=100, 
        random_state=76344
)

model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = r2_score(y_test, model.predict(X_test))

base_value = r2
base_value

0.849304522983098

In [59]:
# # %pip install optuna-integration[botorch]
df_train = pd.read_csv(f'./data/evaluation/train.csv')
df_test = pd.read_csv(f'./data/evaluation/test.csv')
X_test = df_test.drop(columns=['strength'])
y_test = df_test['strength']

def objective(trial):

    model = BaseLineModel(
        n_estimators=100, 
        random_state=76344
    )

    # this just reduces random spread but does not eliminate it completely (+-~ 0.03)
    random.seed(76344)
    np.random.seed(76344)

    try:
        with open(os.devnull, 'w') as fnull, redirect_stdout(fnull), redirect_stderr(fnull):
            augmentation = smogn.smoter(
                data=df_train,
                y="strength",
                k=trial.suggest_int('k', 2, 20),
                samp_method=trial.suggest_categorical('sample_method', ['balance', 'extreme']),
                pert=trial.suggest_float('pertubation', 0.01, 0.99, step=0.05),
                drop_na_row=True,


                # phi
                rel_thres=trial.suggest_float('rel threshold', 0.01, 0.99, step=0.01),
                rel_xtrm_type=trial.suggest_categorical('rel_xtrm_type', ['low', 'both', 'high']),
                # rel_coef =trial.suggest_float('rel_coef', 0.5, 2.25)
            )
    except ValueError as e:
        raise optuna.exceptions.TrialPruned(repr(e))

    augmented_data = pd.concat([augmentation, df_train])
    X_train = augmented_data.drop(columns=['strength'])
    y_train = augmented_data['strength']
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    r2 = r2_score(y_test, model.predict(X_test))
    return r2

sampler = optuna.samplers.TPESampler(multivariate=True, group=True, n_startup_trials=10)
study = optuna.create_study(sampler=sampler, direction='maximize')
# study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value, base_value, f"improvement: {round(study.best_value - base_value, 4)}")

[I 2025-07-02 22:30:56,835] A new study created in memory with name: no-name-d4d9402e-da88-4268-98a1-b889fabaa048
[I 2025-07-02 22:30:57,760] Trial 0 finished with value: 0.8454549934290783 and parameters: {'k': 14, 'sample_method': 'balance', 'pertubation': 0.7100000000000001, 'rel threshold': 0.48000000000000004, 'rel_xtrm_type': 'both'}. Best is trial 0 with value: 0.8454549934290783.
[I 2025-07-02 22:30:57,764] Trial 1 pruned. ValueError('redefine phi relevance function: all points are 1')
[I 2025-07-02 22:30:58,735] Trial 2 finished with value: 0.8661216972062559 and parameters: {'k': 8, 'sample_method': 'balance', 'pertubation': 0.96, 'rel threshold': 0.4, 'rel_xtrm_type': 'high'}. Best is trial 2 with value: 0.8661216972062559.
[I 2025-07-02 22:30:58,740] Trial 3 pruned. ValueError('redefine phi relevance function: all points are 1')
[I 2025-07-02 22:30:59,485] Trial 4 finished with value: 0.8493663648885117 and parameters: {'k': 18, 'sample_method': 'balance', 'pertubation': 0.

{'k': 7, 'sample_method': 'balance', 'pertubation': 0.66, 'rel threshold': 0.17, 'rel_xtrm_type': 'both'}


NameError: name 'base_value' is not defined

In [None]:
"""
{'k': 9, 'sample_method': 'extreme', 'pertubation': 0.6100000000000001, 'rel threshold': 0.18000000000000002, 'rel_xtrm_type': 'high'}
0.8800102826290365 0.849304522983098 improvement: 0.0307

{'k': 9, 'sample_method': 'balance', 'pertubation': 0.26, 'rel threshold': 0.18000000000000002, 'rel_xtrm_type': 'high'}

{'k': 7, 'sample_method': 'balance', 'pertubation': 0.66, 'rel threshold': 0.17, 'rel_xtrm_type': 'both'}
"""

In [68]:
import pandas as pd
import smogn
from src.model import BaseLineModel

model = BaseLineModel(
        n_estimators=100, 
        random_state=76344
    )

df = pd.read_csv("./data/evaluation/train.csv")

import random
import numpy as np

random.seed(76344)
np.random.seed(76344)
augmentation = smogn.smoter(
    data=df,
    y="strength",
    k=9,
    samp_method="balance",
    pert=0.26,
    drop_na_row=True,
    # phi
    rel_thres=0.18,
    rel_xtrm_type="high"
)

augmented_data = pd.concat([augmentation, df_train])
augmented_data.to_csv("./data/evaluation/smogn.csv", index=False)

# augmented_data = pd.read_csv("")

X_train = augmented_data.drop(columns=['strength'])
y_train = augmented_data['strength']
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = r2_score(y_test, model.predict(X_test))
r2

r_index: 100%|##########| 5/5 [00:00<00:00, 1167.42it/s]


0.870323340429269