In [None]:
import os
import warnings
from datetime import datetime

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from matplotlib.lines import Line2D
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# NOTE(review): this silences every warning category for the rest of the session,
# so library deprecation and convergence warnings will not be shown.
warnings.filterwarnings('ignore')

#### Wczytanie przygotowanych danych

In [None]:
# Read the prepared train/test tables from the `prepared` directory.
prepared_dir = 'prepared'
train = pd.read_csv(os.path.join(prepared_dir, 'train.csv'))
test = pd.read_csv(os.path.join(prepared_dir, 'test.csv'))

# Encoder translating the string target labels to integer codes and back.
salary_category_encoder = LabelEncoder()
salary_category_encoder.fit(train['salary_category'])

# Keep the observation ids for the submission file; `pop` also removes the
# column from `test`, so it is never used as a feature.
test_obs = test.pop('obs')

In [None]:
# Replace `job_state` with one constant placeholder value in both frames.
# NOTE(review): this makes the column carry no information; the LightGBM and
# XGBoost frames drop it later, the CatBoost frames keep it.
for frame in (train, test):
    frame['job_state'] = 'xd'

#### Różne dane dla różnych modeli

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the competition test frame.
test.info()

### <center>Proces nauki, walidacji, testowania</center>

Główny podział na zbiór treningowy, walidacyjny i testowy.

In [None]:
# Separate the features from the integer-encoded target.
X = train.drop(columns=['salary_category'])
y = salary_category_encoder.transform(train['salary_category'])

# Sorted unique encoded class ids; reused by the ROC section.
class_labels = np.unique(y)

# Stratified hold-out split. `train_test_split` already returns DataFrames when
# given a DataFrame, so no re-wrapping of X_train / X_test is needed.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y,
)

#### Trochę inne podejście do podziału danych

Podział zbiorów dla każdego modelu:
- LightGBM
- XGBoost
- CatBoost

#### Zbiór dla LightGBM

In [None]:
# Columns handed to LightGBM as pandas categoricals.
lgb_categories = ['job_title', 'feature_1']

# LightGBM variants of the full, training and hold-out frames (without `job_state`).
# The split frames get a fresh 0..n-1 index.
X_lgb = X.drop(columns=['job_state'])
X_train_lgb = X_train.drop(columns=['job_state']).reset_index(drop=True)
X_test_lgb = X_test.drop(columns=['job_state']).reset_index(drop=True)

for frame in (X_lgb, X_train_lgb, X_test_lgb):
    frame[lgb_categories] = frame[lgb_categories].astype('category')

#### Zbiór dla XGBoost

In [None]:
# Columns handed to XGBoost as pandas categoricals.
xgb_categories = ['job_title', 'feature_1']

# Full training frame for XGBoost (without `job_state`).
X_xgb = X.drop(columns=['job_state'])
X_xgb[xgb_categories] = X_xgb[xgb_categories].astype('category')

# The split frames get a fresh 0..n-1 index.
X_train_xgb = X_train.drop(columns=['job_state']).reset_index(drop=True)
X_test_xgb = X_test.drop(columns=['job_state']).reset_index(drop=True)

# Reuse the categorical dtype of the full frame for both splits. Casting each
# split with a bare astype('category') derives the categories from the values
# present in that split only, so the same label could receive different integer
# codes in train and hold-out. XGBoost releases without automatic re-coding
# consume those codes directly.
for column in xgb_categories:
    shared_dtype = X_xgb[column].dtype
    X_train_xgb[column] = X_train_xgb[column].astype(shared_dtype)
    X_test_xgb[column] = X_test_xgb[column].astype(shared_dtype)

#### Zbiór dla CatBoost

In [None]:
# CatBoost variants of the full, training and hold-out frames (all columns kept).
# The split frames get a fresh 0..n-1 index.
X_cat = X.copy(deep=True)
X_train_cat = X_train.copy(deep=True).reset_index(drop=True)
X_test_cat = X_test.copy(deep=True).reset_index(drop=True)

# Every column that is not float64 is treated as categorical. The list is built
# before `feature_10` is cast to string, and `feature_10` is then appended explicitly.
cat_categories = X_cat.select_dtypes(exclude=np.float64).columns.tolist()
cat_categories += ['feature_10']

for frame in (X_cat, X_train_cat, X_test_cat):
    frame['feature_10'] = frame['feature_10'].astype(str)
    frame[cat_categories] = frame[cat_categories].astype('category')

In [None]:
# Competition test frame for LightGBM (without `job_state`).
test_lgb = test.drop(columns=['job_state'])
test_lgb[lgb_categories] = test_lgb[lgb_categories].astype('category')

# Competition test frame for XGBoost. The categorical dtype is taken from the
# training frame `X_xgb`, so that a label maps to the same integer code at
# prediction time as during training. A bare astype('category') would derive the
# codes from the test values alone. Labels never seen in training become NaN.
test_xgb = test.drop(columns=['job_state'])
for column in xgb_categories:
    test_xgb[column] = test_xgb[column].astype(X_xgb[column].dtype)

# Competition test frame for CatBoost (all columns kept, `feature_10` as string).
test_cat = test.copy(deep=True)
test_cat['feature_10'] = test_cat['feature_10'].astype(str)
test_cat[cat_categories] = test_cat[cat_categories].astype('category')

#### LightGBM

In [None]:
from sklearn.metrics import accuracy_score

def custom_accuracy(preds, eval_data):
    """Accuracy metric in the shape LightGBM expects from a custom `feval`.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            arg-max along axis 1.
        eval_data: object exposing `get_label()` with the true labels.

    Returns:
        Tuple `('accuracy', value, True)`; the trailing flag marks the metric
        as higher-is-better.
    """
    labels = eval_data.get_label()
    predicted_classes = preds.argmax(axis=1)
    score = accuracy_score(labels, predicted_classes)
    return 'accuracy', score, True

def objective_lgb(trial):
    """Optuna objective: 5-fold stratified CV accuracy of a LightGBM multiclass model.

    Args:
        trial: Optuna trial used to sample `max_bin` and the booster hyper-parameters.

    Returns:
        Mean validation accuracy across folds after the final boosting round,
        i.e. the accuracy of the model size that is trained afterwards.
    """
    dataset = lgb.Dataset(X_lgb, y, params={
        'max_bin': trial.suggest_int('max_bin', 128, 256),
        'feature_pre_filter': False
    })

    # NOTE(review): `subsample` only takes effect when a bagging frequency
    # (`subsample_freq` / `bagging_freq`) > 0 is set; none is set here or in the
    # final classifier, so tuning it is presumably a no-op. Confirm and add it
    # in both places if bagging is intended.
    params = {
        'objective': 'multiclass',
        'num_class': len(np.unique(y)),
        'boosting': 'gbdt',
        'n_jobs': -1,
        'metric': 'None',
        'random_state': 81,
        'verbose': -1,
        'reg_alpha': 1e-3,
        'reg_lambda': 1,
        'categorical_feature': 'name:job_title,feature_1',
        'learning_rate': trial.suggest_float('learning_rate', 1e-1, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'max_depth': trial.suggest_int('max_depth', 8, 15),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 1e-2, 2),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 25),
        'subsample': trial.suggest_float('subsample', 0.6, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.8),
    }

    cv_results = lgb.cv(
        params=params,
        train_set=dataset,
        folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        feval=custom_accuracy,
        metrics='None',
        return_cvbooster=False,
    )

    # `cv_results[...]` holds one fold-averaged accuracy per boosting round.
    # Score the trial by the last round. Averaging over every round would
    # measure the whole learning curve and reward fast early convergence
    # instead of the final model.
    final_round_accuracy = cv_results['valid accuracy-mean'][-1]
    return final_round_accuracy

# Seeded TPE search (100 trials) maximising the LightGBM objective.
lgb_sampler = optuna.samplers.TPESampler(seed=42)
study_lgb = optuna.create_study(
    study_name='EngineerSalaryLightGBM',
    direction='maximize',
    sampler=lgb_sampler,
)
study_lgb.optimize(objective_lgb, n_trials=100)

In [None]:
# Hyper-parameters of the best LightGBM trial.
study_lgb.best_params

In [None]:
# Fixed LightGBM settings; they mirror the non-tuned entries used inside
# `objective_lgb`, including the same categorical feature list.
standard_params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y)),
    'boosting': 'gbdt',
    'n_jobs': -1,
    'metric': 'None',
    'random_state': 81,
    'verbose': -1,
    'reg_alpha': 1e-3,
    'reg_lambda': 1,
    # Was 'name:job_title' only, which disagreed with the tuning objective.
    'categorical_feature': 'name:job_title,feature_1',
}

# Tuned values override the fixed ones where the keys overlap.
best_params = {**standard_params, **study_lgb.best_params}

lightgbm = lgb.LGBMClassifier(**best_params)
lightgbm.fit(X_train_lgb, y_train)

# Evaluate on the hold-out split.
lightgbm_pred = lightgbm.predict(X_test_lgb)

print(classification_report(y_test, lightgbm_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Wizualizacja z optuną

In [None]:
# Objective value per trial for the LightGBM study.
optuna.visualization.plot_optimization_history(study_lgb)

In [None]:
# Objective value against each sampled hyper-parameter for the LightGBM study.
optuna.visualization.plot_slice(study_lgb)

In [None]:
# Estimated importance of each hyper-parameter for the LightGBM study.
optuna.visualization.plot_param_importances(study_lgb)

In [None]:
# lightgbm = lgb.LGBMClassifier(**{
#     "boosting_type": 'gbdt',
#     "class_weight": None,
#     "colsample_bytree": 0.6891679757189934,
#     "importance_type": 'split',
#     "learning_rate": 0.22220757372057015,
#     "max_depth": 9,
#     "min_child_samples": 20,
#     "min_child_weight": 0.001,
#     "min_split_gain": 0.0,
#     "n_estimators": 188,
#     "n_jobs": -1,
#     "num_leaves": 34,
#     "objective": 'multiclass',
#     "random_state": 81,
#     "reg_alpha": 0.001,
#     "reg_lambda": 1,
#     "subsample": 0.7305466280090381,
#     "subsample_for_bin": 200000,
#     "subsample_freq": 0,
#     "num_class": 3,
#     "boosting": 'gbdt',
#     "metric": None,
#     "verbose": -1,
#     "categorical_feature": 'name:job_title,feature_1',
#     "max_bin": 244,
#     "min_gain_to_split": 0.08190294734785593,
#     "min_data_in_leaf": 3,
# }).fit(X_train_lgb, y_train)
#
# lightgbm_pred = lightgbm.predict(X_test_lgb)
#
# print(classification_report(y_test, lightgbm_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_test))))

#### Istotność cech dla LightGBM

In [None]:
# Ten features with the highest LightGBM importance values.
lgb_importance_table = pd.DataFrame({
    'feature': lightgbm.feature_name_,
    'importance': lightgbm.feature_importances_,
})
lightgbm_importance = lgb_importance_table.sort_values(by='importance', ascending=False).head(10)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bar chart, one bar per feature.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=lightgbm_importance,
    palette='viridis',
    legend=False,
)

# Print the importance value at the end of every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%g', label_type='edge', fontsize=10, padding=3)

plt.title('Top 10 najważniejszych cech dla LightGBM', fontsize=16)
plt.xlabel('Wartość istotności', fontsize=12)
plt.ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

#### XGBoost

In [None]:
def define_xgb(trial):
    """Build an unfitted XGBoost classifier from the hyper-parameters of `trial`.

    Args:
        trial: Optuna trial providing the `suggest_*` API. The notebook also
            passes `study_xgb.best_trial` to rebuild the best configuration.

    Returns:
        An `xgb.XGBClassifier` with histogram tree method and categorical support enabled.
    """
    params = {
        'tree_method': 'hist',
        'enable_categorical': True,
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_leaves': trial.suggest_int('max_leaves', 8, 64),
        'gamma': trial.suggest_float('gamma', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1e-1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1, log=True),
        'max_bin': 128,
        'n_jobs': -1,
        # 'multiclass' is a LightGBM objective name, not an XGBoost one.
        # 'multi:softprob' is the XGBoost multiclass objective that yields
        # per-class probabilities.
        'objective': 'multi:softprob',
        'num_class': len(np.unique(y)),
        'random_state': 42
    }
    return xgb.XGBClassifier(**params)

def objective_xgb(trial):
    """Optuna objective: mean accuracy of the trial's XGBoost model over 10 stratified folds.

    Args:
        trial: Optuna trial forwarded to `define_xgb`.

    Returns:
        Mean of the per-fold accuracy scores on the full XGBoost training frame.
    """
    candidate = define_xgb(trial)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        candidate,
        X_xgb,
        y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    return fold_scores.mean()

# Seeded TPE search (100 trials) maximising the XGBoost objective.
xgb_sampler = optuna.samplers.TPESampler(seed=42)
study_xgb = optuna.create_study(
    study_name='EngineerSalaryXGBoost',
    direction='maximize',
    sampler=xgb_sampler,
)
study_xgb.optimize(objective_xgb, n_trials=100)

# Rebuild the best configuration, fit it on the training split and score the hold-out split.
xgBoost = define_xgb(study_xgb.best_trial)
xgBoost = xgBoost.fit(X_train_xgb, y_train)
xgBoost_pred = xgBoost.predict(X_test_xgb)

xgb_target_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, xgBoost_pred, target_names=xgb_target_names))

In [None]:
# Hyper-parameters of the best XGBoost trial.
study_xgb.best_params

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# Objective value per trial for the XGBoost study.
optuna.visualization.plot_optimization_history(study_xgb)

In [None]:
# Objective value against each sampled hyper-parameter for the XGBoost study.
optuna.visualization.plot_slice(study_xgb)

In [None]:
# Estimated importance of each hyper-parameter for the XGBoost study.
optuna.visualization.plot_param_importances(study_xgb)

In [None]:
# xgBoost = xgb.XGBClassifier(**{
#     "objective": 'multi:softprob',
#     "use_label_encoder": None,
#     "base_score": None,
#     "booster": None,
#     "callbacks": None,
#     "colsample_bylevel": None,
#     "colsample_bynode": None,
#     "colsample_bytree": None,
#     "early_stopping_rounds": None,
#     "enable_categorical": True,
#     "eval_metric": None,
#     "feature_types": None,
#     "gamma": 0.665783147587586,
#     "gpu_id": None,
#     "grow_policy": None,
#     "importance_type": None,
#     "interaction_constraints": None,
#     "learning_rate": 0.025868358584852924,
#     "max_bin": 128,
#     "max_cat_threshold": None,
#     "max_cat_to_onehot": None,
#     "max_delta_step": None,
#     "max_depth": 8,
#     "max_leaves": 35,
#     "min_child_weight": 1.2458595157117232,
#     "monotone_constraints": None,
#     "n_estimators": 148,
#     "n_jobs": -1,
#     "num_parallel_tree": None,
#     "predictor": None,
#     "random_state": 42,
#     "reg_alpha": 0.007829247923930879,
#     "reg_lambda": 0.3228408290141024,
#     "sampling_method": None,
#     "scale_pos_weight": None,
#     "subsample": 0.8758723969660378,
#     "tree_method": 'hist',
#     "validate_parameters": None,
#     "verbosity": None,
#     "num_class": 3,
# }).fit(X_train_xgb, y_train)
#
# xgBoost_pred = xgBoost.predict(X_test_xgb)
#
# print(classification_report(y_test, xgBoost_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_test))))

#### Istotność cech dla XGBoost

In [None]:
# Importance scores (type 'weight') reported by the fitted booster, keyed by feature name.
importance_dict = xgBoost.get_booster().get_score(importance_type='weight')

# Ten features with the highest score.
xgb_importance_table = pd.DataFrame({
    'feature': list(importance_dict.keys()),
    'importance': list(importance_dict.values()),
})
xgBoost_importance = xgb_importance_table.sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bar chart, one bar per feature.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=xgBoost_importance,
    palette='viridis',
    legend=False,
)

# Print the importance value at the end of every bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%g', label_type='edge', fontsize=10, padding=3)

plt.title('Top 10 najważniejszych cech dla XGBoost', fontsize=16)
plt.xlabel('Wartość istotności', fontsize=12)
plt.ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

#### CatBoost

In [None]:
def define_catboost(trial):
    """Build an unfitted CatBoost classifier from the hyper-parameters of `trial`.

    Args:
        trial: Optuna trial providing the `suggest_*` API. The notebook also
            passes `study_cat.best_trial` to rebuild the best configuration.

    Returns:
        A `CatBoostClassifier` configured for the multiclass loss.
    """
    # Tuned part of the configuration. The suggest_* calls stay in this order
    # so that sampling is unchanged.
    tuned = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10, log=True),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20),
    }
    # Settings shared by every trial.
    fixed = {
        'bootstrap_type': 'Bayesian',
        'bagging_temperature': 0.5,
        'loss_function': 'MultiClass',
        'cat_features': cat_categories,
        'random_seed': 42,
        'logging_level': 'Silent',
    }
    return CatBoostClassifier(**tuned, **fixed)

def objective_catboost(trial):
    """Optuna objective: mean accuracy of the trial's CatBoost model over 10 stratified folds.

    Args:
        trial: Optuna trial forwarded to `define_catboost`.

    Returns:
        Mean of the per-fold accuracy scores on the full CatBoost training frame.
    """
    candidate = define_catboost(trial)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        candidate,
        X_cat,
        y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    return fold_scores.mean()

# TPE search (100 trials) maximising the CatBoost objective. The sampler is
# seeded like the LightGBM and XGBoost studies so the search is reproducible
# across kernel restarts.
study_cat = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryCatBoost',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_catboost, n_trials=100)

In [None]:
# Rebuild the best CatBoost configuration, fit it on the training split and
# score the hold-out split.
catBoost = define_catboost(study_cat.best_trial)
catBoost = catBoost.fit(X_train_cat, y_train)
catBoost_pred = catBoost.predict(X_test_cat)

cat_target_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, catBoost_pred, target_names=cat_target_names))

In [None]:
# Hyper-parameters of the best CatBoost trial.
study_cat.best_params

In [None]:
# Objective value per trial for the CatBoost study.
optuna.visualization.plot_optimization_history(study_cat)

In [None]:
# Objective value against each sampled hyper-parameter for the CatBoost study.
optuna.visualization.plot_slice(study_cat)

In [None]:
# Estimated importance of each hyper-parameter for the CatBoost study.
optuna.visualization.plot_param_importances(study_cat)

In [None]:
# catBoost = CatBoostClassifier(**{
#     "iterations": 145,
#     "learning_rate": 0.17585387437940722,
#     "depth": 6,
#     "l2_leaf_reg": 0.02675666277151487,
#     "loss_function": 'MultiClass',
#     "random_seed": 42,
#     "bagging_temperature": 0.5,
#     "bootstrap_type": 'Bayesian',
#     "cat_features": cat_categories,
# }).fit(X_train_cat, y_train)
#
# catBoost_pred = catBoost.predict(X_test_cat)
#
# print(classification_report(y_test, catBoost_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_test))))

#### Prawdopodobieństwa poszczególnych modeli

In [None]:
# Hold-out class probabilities of every base model.
# These are reused by the stacking and ROC sections.
lgb_pred_proba = lightgbm.predict_proba(X_test_lgb)
xgBoost_pred_proba = xgBoost.predict_proba(X_test_xgb)
catBoost_pred_proba = catBoost.predict_proba(X_test_cat)

#### StackingClassifier

In [None]:
def _oof_predict_proba(model, features, targets, splitter, n_classes):
    """Return out-of-fold class probabilities of `model` on `features`.

    For every fold produced by `splitter` the model is refit on the training
    part and its probabilities for the validation part are stored in the rows
    of that validation part.

    Args:
        model: estimator exposing `fit` and `predict_proba`; it is refit in
            place and is left fitted on the last fold's training part.
        features: DataFrame of training features.
        targets: array of encoded labels aligned row-for-row with `features`.
        splitter: cross-validation splitter exposing `split(X, y)`.
        n_classes: number of probability columns.

    Returns:
        Array of shape `(len(features), n_classes)`.
    """
    oof = np.zeros((len(features), n_classes))
    for train_idx, valid_idx in splitter.split(features, targets):
        # The splitter yields positional indices, so select rows by position.
        # This does not depend on the frame having a 0..n-1 index.
        model.fit(features.iloc[train_idx], targets[train_idx])
        oof[valid_idx] = model.predict_proba(features.iloc[valid_idx])
    return oof


n_classes = len(np.unique(y_train))
n_folds = 5
# A fixed integer random_state makes every `split` call return the same folds,
# so all three base models are evaluated on identical partitions.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_lgb = _oof_predict_proba(lightgbm, X_train_lgb, y_train, skf, n_classes)
oof_xgb = _oof_predict_proba(xgBoost, X_train_xgb, y_train, skf, n_classes)
oof_cat = _oof_predict_proba(catBoost, X_train_cat, y_train, skf, n_classes)

In [None]:
# Meta features: out-of-fold probabilities, column blocks ordered CatBoost, LightGBM, XGBoost.
X_meta_train = np.concatenate([oof_cat, oof_lgb, oof_xgb], axis=1)

# L2-regularised, class-balanced logistic regression as the stacking meta-learner.
meta_settings = {
    'penalty': 'l2',
    'C': 0.1,
    'class_weight': 'balanced',
    'solver': 'lbfgs',
    'max_iter': 100,
    'random_state': 42,
}
meta_estimator = LogisticRegression(**meta_settings)
meta_estimator.fit(X_meta_train, y_train)

### <center>Predykcje StackingClassifier</center>

In [None]:
# The out-of-fold loop left each base model fitted on a single fold's training
# part, so refit them on the complete training split first.
catBoost.fit(X_train_cat, y_train)
lightgbm.fit(X_train_lgb, y_train)
xgBoost.fit(X_train_xgb, y_train)

# Build the hold-out meta features from the models that were just refit, rather
# than from probability arrays computed in an earlier cell. The column blocks
# use the same order as the meta training matrix (CatBoost, LightGBM, XGBoost).
X_meta_test = np.hstack([
    catBoost.predict_proba(X_test_cat),
    lightgbm.predict_proba(X_test_lgb),
    xgBoost.predict_proba(X_test_xgb),
])

stacking_pred = meta_estimator.predict(X_meta_test)
stacking_pred_proba = meta_estimator.predict_proba(X_meta_test)

#### Współczynniki LogisticRegression

In [None]:
# One row per final class, one column per (base model, class) probability input.
coefficients = meta_estimator.coef_

# Column labels follow the meta feature layout: CatBoost block, then LightGBM,
# then XGBoost, each listing the classes in encoder order. They are derived from
# `classes_` instead of hard-coded slice positions for exactly three classes.
base_model_prefixes = ('cat', 'lgb', 'xgb')
proba_ticks = [
    f'{prefix}_{class_name}'
    for prefix in base_model_prefixes
    for class_name in salary_category_encoder.classes_
]

plt.figure(figsize=(12, 8))
sns.heatmap(
    coefficients,
    yticklabels=salary_category_encoder.classes_,
    xticklabels=proba_ticks,
    annot=True,
    fmt='.2f',
    cbar=False,
    cmap='viridis'
)

plt.title('Moc wpływu składowych klas modeli na ostateczne predykcje dla StackingClassifier', fontsize=16)
plt.xlabel('Klasa modeli bazowych', fontsize=12)
plt.ylabel('Ostateczna klasa', fontsize=12)
# Render the figure explicitly, like the other plotting cells, so the cell does
# not echo the repr of the last call.
plt.show()

### <center>Macierze pomyłek</center>

In [None]:
# 2x2 grid: one confusion matrix per classifier.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 12))

fig.suptitle('Macierze pomyłek dla wybranych klasyfikatorów', fontsize=16, fontweight='bold')

titles = ['CatBoost', 'LightGBM', 'XGBoost', 'StackingClassifier']
pred = [catBoost_pred, lightgbm_pred, xgBoost_pred, stacking_pred]

sns.set_style('white')

for position, (title, model_pred) in enumerate(zip(titles, pred)):
    row, col = divmod(position, 2)
    panel = ax[row, col]
    # The matrix is transposed so that predicted labels run along the y axis
    # and true labels along the x axis, matching the axis captions below.
    sns.heatmap(
        confusion_matrix(y_test, model_pred).T,
        annot=True,
        annot_kws={'size': 16},
        fmt='d',
        cmap='rocket',
        cbar=False,
        xticklabels=salary_category_encoder.classes_,
        yticklabels=salary_category_encoder.classes_,
        ax=panel,
    )
    panel.set_title(f'{title}', fontsize=16, pad=12)
    # Caption only the outer axes: x on the bottom row, y on the left column.
    panel.set_xlabel('Rzeczywiste etykiety' if row == 1 else '', fontdict={'fontsize': 14})
    panel.set_ylabel('Przewidziane etykiety' if col == 0 else '', fontdict={'fontsize': 14})
    panel.grid(False)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

### <center>Krzywe ROC</center>

In [None]:
# One-hot encode the hold-out labels so every class can be scored one-vs-rest.
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)

# Human-readable class names in encoded-class order.
class_of_interest = salary_category_encoder.inverse_transform(class_labels)

# Grid layout: one row per class, one column per classifier.
fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(16, 12))
fig.suptitle('Krzywe ROC One-vs-Rest dla wszystkich klasyfikatorów', fontsize=16, fontweight='bold')
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#17becf']

titles = ['CatBoost', 'LightGBM', 'XGBoost', 'StackingClassifier']
model_pred_proba = [catBoost_pred_proba, lgb_pred_proba, xgBoost_pred_proba, stacking_pred_proba]

for mi, (title, proba) in enumerate(zip(titles, model_pred_proba)):
    # Per-class one-vs-rest AUC, shown in the legend above the column.
    model_auc_scores = [
        roc_auc_score(y_onehot_test[:, k], proba[:, k])
        for k in range(len(class_labels))
    ]
    legend_elements = [
        Line2D([0], [0], color=colors[k], label=f'{class_of_interest[k]} (AUC = {model_auc_scores[k]:.3f})')
        for k in range(len(class_of_interest))
    ]
    legend_elements.append(
        Line2D([0], [0], linestyle='dashed', color='black', label='Losowy wybór (AUC = 0.5)')
    )

    for class_id in range(len(class_labels)):
        class_name = class_of_interest[class_id]
        panel = ax[class_id, mi]
        RocCurveDisplay.from_predictions(
            y_onehot_test[:, class_id],
            proba[:, class_id],
            name=f'{class_name} vs Rest',
            color=colors[class_id],
            plot_chance_level=True,
            ax=panel,
        )
        panel.grid(True)
        panel.set_label(class_name)
        panel.set_title(f'{class_name} vs Rest', fontsize=14)
        panel.set_xlabel('False Positive Rate')
        panel.set_ylabel('True Positive Rate')
        # The per-panel legend is replaced by one shared legend per column.
        panel.get_legend().remove()

    top_panel = ax[0, mi]
    top_panel.legend(handles=legend_elements, loc='lower center', bbox_to_anchor=(0.5, 1.3), fontsize=10, title='Klasa (One-vs-Rest)', title_fontsize=11, frameon=True)
    # Classifier name placed above the column's legend (data coordinates).
    top_panel.text(0.5, 2, f'{title}', ha='center', va='center', fontsize=17)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

### <center>Krzywe uczenia</center>

In [None]:
# fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18, 20), sharex=True, sharey=True)
#
# plt.suptitle('Krzywe uczenia dla wybranych klasyfikatorów', fontsize=16, fontweight='bold')
# models = [catBoost, lightgbm, xgBoost, stacking]
# titles = ['CatBoost', 'LightGBM', 'XGBoost', 'StackingClassifier']
# for i in range(len(titles)):
#     x1, y1 = divmod(i, 2)
#     LearningCurveDisplay.from_estimator(
#         estimator=models[i],
#         X=X,
#         y=y,
#         cv=5,
#         scoring='accuracy',
#         train_sizes=np.linspace(0.1, 1.0, 10),
#         line_kw= {'marker': 'o'},
#         ax=ax[x1, y1],
#         n_jobs=-1
#     )
#     ax[x1, y1].grid(True)
#     ax[x1, y1].set_title(titles[i], fontsize=16, pad=10)
#     ax[x1, y1].set_ylabel('Wartość dokładności')
#     ax[x1, y1].set_xlabel('Liczba danych zbioru treningowego' if i == len(titles) - 1 else '')
#
# plt.tight_layout(rect=[0, 0, 1, 0.97])
# plt.show()

#### Sztuczka z połączeniem test z train

Nie daje lepszych wyników.

In [None]:
# filtered = pd.DataFrame(data=final_stacking_pred_proba, columns=salary_category_encoder.inverse_transform([0, 1, 2]))
# filtered['obs'] = test_obs
# filtered = filtered.loc[
#     (filtered['High'] > 0.9) |
#     (filtered['Medium'] > 0.9) |
#     (filtered['Low'] > 0.9), :
# ]
# filtered['salary_category'] = salary_category_encoder.inverse_transform(np.argmax(filtered[['High', 'Low', 'Medium']], axis=1))
# filtered.drop(columns=['High', 'Low', 'Medium'], inplace=True)
# test['obs'] = test_obs
#
# sure_test = pd.merge(test, filtered, on='obs', how='right')
# sure_test.drop(columns=['obs'], inplace=True)
# test.drop(columns=['obs'], inplace=True)
#
# whole = pd.concat([train, sure_test], axis=0)
#
# sure_test.drop('salary_category', axis=1, inplace=True)
# refit_X = whole.drop('salary_category', axis=1)
# refit_y = salary_category_encoder.transform(whole['salary_category'])
#
# lightgbm.fit(refit_X, refit_y)
# catBoost.fit(refit_X, refit_y)
# xgBoost.fit(refit_X, refit_y)
# stacking.fit(refit_X, refit_y)
#
# train_test_stacking = stacking.predict(test)

### <center>Ostateczne predykcje</center>

In [None]:
# Refit every base model on the complete labelled data before scoring the
# competition test set.
for base_model, full_features in ((lightgbm, X_lgb), (catBoost, X_cat), (xgBoost, X_xgb)):
    base_model.fit(full_features, y)

# Class probabilities of each base model on the competition test set.
final_test_cat = catBoost.predict_proba(test_cat)
final_test_lgb = lightgbm.predict_proba(test_lgb)
final_test_xgb = xgBoost.predict_proba(test_xgb)

# Column blocks in the order the meta-learner was trained on: CatBoost, LightGBM, XGBoost.
final_X_meta_test = np.concatenate([final_test_cat, final_test_lgb, final_test_xgb], axis=1)

final_stacking_pred = meta_estimator.predict(final_X_meta_test)
final_stacking_pred_proba = meta_estimator.predict_proba(final_X_meta_test)

final_stacking_pred_proba

In [None]:
def prepare_submission(predictions, name):
    """Write a submission CSV and return it as a DataFrame.

    Args:
        predictions: encoded class ids, one per row of the competition test set;
            decoded with `salary_category_encoder` before writing.
        name: file name (without extension) inside the `submissions` directory.

    Returns:
        DataFrame with the columns `obs` and `salary_category` that was written to disk.
    """
    # Create the output directory if needed; `exist_ok` replaces the former
    # list-then-mkdir check.
    os.makedirs('submissions', exist_ok=True)
    submission = pd.DataFrame(
        data={
            'obs': test_obs,
            'salary_category': salary_category_encoder.inverse_transform(predictions)
        }
    )
    submission.to_csv(os.path.join('submissions', f'{name}.csv'), index=False)
    return submission

# Persist the stacking predictions, then load the reference submission for comparison.
submission_name = 'new_stacking_lgb_xgb_cat'
st = prepare_submission(predictions=final_stacking_pred, name=submission_name)
the_best = pd.read_csv('the_best.csv')

In [None]:
# Class distribution of the new submission.
st['salary_category'].value_counts()

In [None]:
# Class distribution of the reference submission loaded from `the_best.csv`.
the_best['salary_category'].value_counts()

#### Porównanie z najlepszym dotychczasowym zgłoszeniem (submission)

In [None]:
# Give the prediction columns distinct names, then align both submissions on `obs`.
st = st.rename(columns={'salary_category': 'new_sub'})
the_best = the_best.rename(columns={'salary_category': 'old_sub'})

compare = st.merge(the_best, on='obs', how='inner')

In [None]:
# Number of observations on which the new and the reference submission disagree.
int((compare['new_sub'] != compare['old_sub']).sum())

In [None]:
# Observation ids on which the new and the reference submission disagree.
compare.loc[compare['new_sub'] != compare['old_sub'], 'obs'].to_numpy()

#### Informacje o modelach

In [None]:
# Create the output directory if needed; `exist_ok` replaces the former
# list-then-mkdir check.
os.makedirs('models', exist_ok=True)

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
info_path = os.path.join('models', f'model_info_{timestamp}.txt')

# Models whose parameters are recorded, in the order they appear in the file.
fitted_models = (
    ('CatBoost', catBoost),
    ('LightGBM', lightgbm),
    ('XGBoost', xgBoost),
)

# An explicit encoding keeps the file readable regardless of the platform default.
with open(info_path, 'w', encoding='utf-8') as f:
    f.write(f'Test size: {test_size}\n\n')

    for position, (model_name, model) in enumerate(fitted_models):
        # A blank line separates the sections; the first one follows the header directly.
        separator = '' if position == 0 else '\n'
        f.write(f'{separator}{model_name} Parameters:\n')
        for key, value in model.get_params().items():
            f.write(f'"{key}": {value},\n')