In [20]:
from sklearn.datasets import fetch_openml
import pandas as pd 

# Load the raw dataset from the CSV file one directory above the notebook.
data = pd.read_csv("../data.csv")
# Feature matrix: every column except the 'DECYZJA' target column.
X = data.drop(columns='DECYZJA')
# Target vector: the 'DECYZJA' column on its own.
y = data['DECYZJA']

In [21]:
# Number of rows per distinct target value (pandas sorts the result by
# descending count), kept to inspect how balanced the classes are.
classes_count = y.value_counts()

In [22]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector


# Numeric columns: replace missing values with the column mean and append
# binary indicator columns marking where values were missing.
num_pipe = SimpleImputer(strategy="mean", add_indicator=True)

# Categorical columns: fill missing values with the literal "missing" category,
# then map each category to an integer code. Categories not seen during fit
# are encoded as -1 instead of raising at predict time.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Route columns to the matching pipeline by dtype.
# NOTE: a frame loaded with a plain `pd.read_csv` stores text columns as
# `object`, never as `category`, so selecting only "category" would match
# nothing and those columns would be silently dropped (the column
# transformer's default is remainder="drop"). Both dtypes are selected so the
# categorical pipeline actually receives the string columns.
preprocessor_tree = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include=["category", "object"])),
    n_jobs=2,
)


In [27]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Base learner that each bagging member will be cloned from.
hist_gb_base = HistGradientBoostingClassifier(random_state=42)

# Bagging ensemble of 10 members built on the base learner; seeded for
# reproducibility and trained with two parallel jobs.
balanced_bagging = BalancedBaggingClassifier(
    base_estimator=hist_gb_base,
    n_estimators=10,
    random_state=42,
    n_jobs=2,
)

# Full model: dtype-based preprocessing followed by the bagging ensemble.
bag_clf = make_pipeline(preprocessor_tree, balanced_bagging)
# scoring = ["accuracy", "balanced_accuracy"]
# cv_result = cross_validate(bag_clf, X, y, scoring=scoring)

import pickle
# Destination file for the serialized, fitted pipeline.
filename = 'model.sav'
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# bag_clf.fit(X_train,y_train)
# Train on the complete dataset (the hold-out split above is disabled).
bag_clf.fit(X,y)

# Write the fitted pipeline with a context manager so the file handle is
# flushed and closed deterministically, even if pickling raises.
# Only load this file back with pickle if it comes from a trusted source.
with open(filename, 'wb') as model_file:
    pickle.dump(bag_clf, model_file)
# clf_predict = bag_clf.predict(X_test)
# from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
# print('ROCAUC score:',roc_auc_score(y_test, clf_predict))
# print('Accuracy score:',accuracy_score(y_test, clf_predict))
# print('F1 score:',f1_score(y_test, clf_predict))


