In [None]:
import os
from datetime import datetime

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from matplotlib.lines import Line2D
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, LearningCurveDisplay
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import warnings

# Silence every warning category for the rest of the session (no category or
# module filter is given, so nothing is exempt).
# NOTE(review): this also hides warnings that may matter while tuning models;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

#### Wczytanie przygotowanych danych

In [None]:
# Load the prepared train/test frames from the 'prepared' directory.
train = pd.read_csv(os.path.join('prepared', 'train.csv'))
test = pd.read_csv(os.path.join('prepared', 'test.csv'))

# Fit the target encoder once on the training labels; every later
# transform / inverse_transform call in the notebook reuses this instance.
salary_category_encoder = LabelEncoder().fit(train['salary_category'])

# Keep the observation ids for the submission file and remove them from the
# feature frame (non in-place, so `test` is simply rebound).
test_obs = test['obs']
test = test.drop(columns=['obs'])

# Every column that is neither object nor float64 is treated as categorical.
my_categories = train.select_dtypes(exclude=['object', np.float64]).columns

# Give train and test ONE shared categorical dtype per column.
# Calling astype('category') on each frame separately derives the categories
# from that frame's own values, so a level missing from one frame yields a
# different category -> integer-code mapping in train and test. Consumers that
# work on the codes would then see inconsistent encodings, and pd.concat of the
# two frames silently drops the categorical dtype.
for column in my_categories:
    shared_dtype = (
        pd.concat([train[column], test[column]], ignore_index=True)
        .astype('category')
        .dtype
    )
    train[column] = train[column].astype(shared_dtype)
    test[column] = test[column].astype(shared_dtype)

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the training frame
# to check the categorical conversion done in the loading cell.
train.info()

In [None]:
# Same summary for the test frame; 'obs' has already been removed from it.
test.info()

### <center>Proces nauki, walidacji, testowania</center>

In [None]:
# Hold-out fraction; also written to the model-info file at the end of the notebook.
test_size = 0.2

# Features are all columns except the target; the target is label-encoded to integers.
X = train.drop(columns=['salary_category'])
y = salary_category_encoder.transform(train['salary_category'])

# Distinct encoded classes, reused by the ROC section.
class_labels = np.unique(y)

# Stratified, seeded hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    random_state=42,
)

# Re-wrap both feature parts as DataFrames carrying the original column labels.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

#### LightGBM

In [None]:
# from sklearn.metrics import accuracy_score
#
#
# dataset = lgb.Dataset(X_train, y_train, params={
#     'max_bin': 100,
#     'feature_pre_filter': False
# })
#
# def custom_accuracy(preds, eval_data):
#     y_true = eval_data.get_label()
#     y_pred = preds.argmax(axis=1)
#     return 'accuracy', accuracy_score(y_true, y_pred), True
#
# def objective_lgb(trial):
#     params = {
#         'objective': 'multiclass',
#         'num_class': len(np.unique(y)),
#         'boosting': 'gbdt',
#         'n_jobs': -1,
#         'metric': 'None',
#         'random_state': 42,
#         'verbose': -1,
#         'reg_lambda': 0.2,
#         'learning_rate': trial.suggest_float('learning_rate', 5e-2, 1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 25, 150),
#         'num_leaves': trial.suggest_int('num_leaves', 8, 64),
#         'max_depth': trial.suggest_int('max_depth', 8, 15),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 6, 50),
#         'subsample': trial.suggest_float('subsample', 0.7, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
#     }
#
#     cv_results = lgb.cv(
#         params=params,
#         train_set=dataset,
#         folds=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
#         feval=custom_accuracy,
#         metrics='None',
#         return_cvbooster=True,
#     )
#     mean_accuracy = np.mean(cv_results['valid accuracy-mean'])
#     return mean_accuracy
#
# study_lgb = optuna.create_study(direction='maximize', study_name='EngineerSalaryLightGBM', sampler=optuna.samplers.TPESampler())
# study_lgb.optimize(objective_lgb, n_trials=100)

In [None]:
# best_params = study_lgb.best_params
# standard_params = {
#     'objective': 'multiclass',
#     'num_class': len(np.unique(y)),
#     'boosting': 'gbdt',
#     'n_jobs': -1,
#     'metric': 'None',
#     'random_state': 42,
#     'verbose': -1,
#     'reg_lambda': 1,
# }
#
# best_params = {**standard_params, **best_params}
#
# lightgbm = lgb.LGBMClassifier(**best_params)
# lightgbm.fit(X_train, y_train)
#
# lightgbm_pred = lightgbm.predict(X_test)
#
# print(classification_report(y_test, lightgbm_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Wizualizacja z optuną

In [None]:
# optuna.visualization.plot_optimization_history(study_lgb)

In [None]:
# optuna.visualization.plot_slice(study_lgb)

In [None]:
# optuna.visualization.plot_param_importances(study_lgb)

In [None]:
# LightGBM with hand-fixed hyperparameters (the Optuna search cells above are
# commented out). The categorical columns are listed explicitly by name.
# NOTE(review): 'subsample' is set without 'subsample_freq'; per the LightGBM
# parameter docs bagging is only performed when the bagging frequency is > 0 --
# confirm whether row subsampling is actually in effect here.
lightgbm = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(np.unique(y)),
    boosting='gbdt',
    learning_rate=0.02,
    n_estimators=100,
    num_leaves=16,
    max_depth=10,
    reg_alpha=1e-3,
    subsample=0.7,
    colsample_bytree=0.7,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
    categorical_feature='name:feature_1,feature_3,feature_4,feature_6,feature_7,feature_8,feature_9,feature_11,feature_12,has_description,month,Central,East,is_others_job_title',
)
lightgbm.fit(X_train, y_train)

# Evaluate on the hold-out part of the training data.
lightgbm_pred = lightgbm.predict(X_test)

print(
    classification_report(
        y_test,
        lightgbm_pred,
        target_names=salary_category_encoder.inverse_transform(np.unique(y_train)),
    )
)

#### Istotność cech dla LightGBM

In [None]:
# Rank the features by the importance values exposed by the fitted LightGBM
# model and keep the ten highest.
lightgbm_importance = pd.DataFrame(
    data={
        'feature': lightgbm.feature_name_,
        'importance': lightgbm.feature_importances_,
    }
)
lightgbm_importance = lightgbm_importance.sort_values(by='importance', ascending=False)
lightgbm_importance = lightgbm_importance.head(10)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bar chart, one bar (and one hue level) per feature.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=lightgbm_importance,
    palette='viridis',
    legend=False,
)

# Write the numeric value at the end of every bar.
for bar_container in ax.containers:
    ax.bar_label(bar_container, fmt='%g', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 10 najważniejszych cech dla LightGBM', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

#### XGBoost

In [None]:
# def define_xgb(trial):
#     params = {
#         'tree_method': 'hist',
#         'enable_categorical': True,
#         'n_estimators': trial.suggest_int('n_estimators', 10, 100),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
#         'max_depth': trial.suggest_int('max_depth', 5, 15),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'max_leaves': trial.suggest_int('max_leaves', 8, 48),
#         'gamma': trial.suggest_float('gamma', 0.5, 1),
#         'max_bin': 100,
#         'n_jobs': -1,
#         'objective': 'multiclass',
#         'num_class': len(np.unique(y)),
#         'random_state': 42
#     }
#     return xgb.XGBClassifier(**params)
#
# def objective_xgb(trial):
#     model = define_xgb(trial)
#     skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#     scores = cross_val_score(model, X, y, cv=skf, n_jobs=-1, scoring='accuracy')
#     return scores.mean()
#
# study_xgb = optuna.create_study(direction='maximize', study_name='EngineerSalaryXGBoost', sampler=optuna.samplers.TPESampler())
# study_xgb.optimize(objective_xgb, n_trials=100)
#
# xgBoost = define_xgb(study_xgb.best_trial).fit(X_train, y_train)
# xgBoost_pred = xgBoost.predict(X_test)
#
# print(classification_report(y_test, xgBoost_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# optuna.visualization.plot_optimization_history(study_xgb)

In [None]:
# optuna.visualization.plot_slice(study_xgb)

In [None]:
# optuna.visualization.plot_param_importances(study_xgb)

In [None]:
# XGBoost with hand-fixed hyperparameters (the Optuna search cell above is
# commented out).
#
# Only the values that are actually set are listed. The previous cell body was
# a pasted get_params() dump in which every other entry was None, i.e. "use the
# library default" -- including names such as use_label_encoder, gpu_id and
# predictor. Dropping those entries leaves the trained model unchanged and
# stops the cell from depending on those parameter names.
#
# num_class is derived from the encoded target, as in the LightGBM cell,
# instead of being hard-coded to 3.
xgBoost = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y)),
    tree_method='hist',
    enable_categorical=True,  # consume the pandas 'category' columns directly
    max_bin=100,
    n_estimators=82,
    learning_rate=0.16148817938442583,
    max_depth=11,
    max_leaves=35,
    gamma=0.194932455867791,
    subsample=0.9735189885213125,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the hold-out part of the training data.
xgBoost_pred = xgBoost.predict(X_test)

print(classification_report(y_test, xgBoost_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))
xgBoost_pred = xgBoost.predict(X_test)

print(classification_report(y_test, xgBoost_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Istotność cech dla XGBoost

In [None]:
# 'weight' importance scores from the underlying booster, as a
# feature-name -> score mapping.
importance_dict = xgBoost.get_booster().get_score(importance_type='weight')

# Tabulate the mapping, rank it and keep the ten highest-scoring features.
xgBoost_importance = pd.DataFrame(
    list(importance_dict.items()),
    columns=['feature', 'importance'],
)
xgBoost_importance = xgBoost_importance.sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bar chart, one bar (and one hue level) per feature.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=xgBoost_importance,
    palette='viridis',
    legend=False,
)

# Write the numeric value at the end of every bar.
for bar_container in ax.containers:
    ax.bar_label(bar_container, fmt='%g', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 10 najważniejszych cech dla XGBoost', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

#### Las losowy

In [None]:
def define_random_forest(trial):
    """Build an unfitted RandomForestClassifier from an Optuna trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; it supplies the
        seven searched hyperparameters.

    Returns
    -------
    RandomForestClassifier
        Configured with the suggested values plus the fixed settings below.
    """
    # Searched hyperparameters. The suggest_* calls are kept in the same order
    # as before so the sampler is queried identically.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 8, 64),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 15),
        'max_samples': trial.suggest_float('max_samples', 0.6, 1.0),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-4, 1e-2, log=True),
    }
    # Settings that are the same for every trial.
    fixed = {
        'criterion': 'gini',
        'bootstrap': True,
        'random_state': 42,
        'n_jobs': -1,
        'class_weight': 'balanced',
    }
    return RandomForestClassifier(**searched, **fixed)

def objective_random_forest(trial):
    """Optuna objective: mean 10-fold stratified CV accuracy of the trial's forest.

    The cross-validation runs on ``X_train`` / ``y_train`` only. Tuning on the
    full ``X`` / ``y`` would include the ``X_test`` hold-out rows in the search,
    and the classification report printed for the tuned model on ``X_test``
    would then be optimistically biased.

    Parameters
    ----------
    trial
        Optuna trial forwarded to ``define_random_forest``.

    Returns
    -------
    float
        Mean accuracy over the folds (the study maximises it).
    """
    model = define_random_forest(trial)
    # Seeded, shuffled folds so every trial is scored on the same partitions.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=skf, n_jobs=-1, scoring='accuracy')
    return scores.mean()

# Hyperparameter search for the random forest.
# The sampler is seeded like every other random component in this notebook
# (random_state=42), so a restart-and-run-all explores the same trials.
study_rf = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryRandomForest',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_random_forest, n_trials=100)

# Rebuild the best configuration by passing the best trial back through
# define_random_forest, then fit it on the training part and score the hold-out.
# NOTE(review): this relies on the best-trial object answering suggest_* calls
# with its stored parameter values -- confirm for the installed Optuna version.
rf = define_random_forest(study_rf.best_trial).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print(classification_report(y_test, rf_pred, target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# optuna.visualization.plot_optimization_history(study_rf)

In [None]:
# optuna.visualization.plot_slice(study_rf)

In [None]:
# optuna.visualization.plot_param_importances(study_rf)

In [None]:
# Random forest with hand-fixed hyperparameters; this rebinds `rf`, replacing
# the model produced by the Optuna cell above.
# The inline notes are translated from the author's Polish "zmien!!!" markers.
rf = RandomForestClassifier(
    n_estimators=112,
    criterion='gini',
    max_depth=14,  # change!!! 14
    max_features='sqrt',  # change!!! 'sqrt'
    max_leaf_nodes=None,  # change!!! None
    min_samples_split=4,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # change!!! 0.0
    # NOTE(review): the note says 0.0 but the value in use is 0.001 -- confirm.
    min_impurity_decrease=0.001,
    ccp_alpha=0.0,
    bootstrap=True,
    max_samples=0.8203767555132567,
    oob_score=False,
    class_weight='balanced',
    monotonic_cst=None,
    warm_start=False,
    random_state=42,
    n_jobs=-1,
    verbose=0,
).fit(X_train, y_train)

# Evaluate on the hold-out part of the training data.
rf_pred = rf.predict(X_test)

print(
    classification_report(
        y_test,
        rf_pred,
        target_names=salary_category_encoder.inverse_transform(np.unique(y_train)),
    )
)

#### Istotność cech dla lasu losowego

In [None]:
# Pair the fitted forest's input feature names with its importances, rank them
# and keep the ten highest.
rf_importance = pd.DataFrame(
    {
        'feature': rf.feature_names_in_,
        'importance': rf.feature_importances_,
    }
)
rf_importance = rf_importance.sort_values(by='importance', ascending=False).head(10)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bar chart, one bar (and one hue level) per feature.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=rf_importance,
    palette='viridis',
    legend=False,
)

# Importances are fractions, so label the bars with three decimals.
for bar_container in ax.containers:
    ax.bar_label(bar_container, fmt='%.3f', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 10 najważniejszych cech dla lasu losowego', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
# Class-probability matrices of the three base models on the hold-out set.
lgb_pred_proba = lightgbm.predict_proba(X_test)
xgBoost_pred_proba = xgBoost.predict_proba(X_test)
rf_pred_proba = rf.predict_proba(X_test)

# Soft voting: unweighted mean of the three probability matrices.
avg = (lgb_pred_proba + xgBoost_pred_proba + rf_pred_proba) / 3

# The predicted class is the column with the highest averaged probability.
print(
    classification_report(
        y_test,
        np.argmax(avg, axis=1),
        target_names=salary_category_encoder.inverse_transform(np.unique(y_train)),
    )
)

#### StackingClassifier

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


# Meta-learner trained on the base models' class probabilities.
meta_learner = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# The three base models defined above, keyed by the names used inside the stack.
base_estimators = [
    ('rf', rf),
    ('lgb', lightgbm),
    ('xgBoost', xgBoost),
]

stacked = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    n_jobs=-1,
)
stacked = stacked.fit(X_train, y_train)

# Hard labels and probabilities on the hold-out set (the probabilities feed the ROC plot).
stacked_pred = stacked.predict(X_test)
stacked_pred_proba = stacked.predict_proba(X_test)

print(
    classification_report(
        y_test,
        stacked_pred,
        target_names=salary_category_encoder.inverse_transform(np.unique(y_train)),
    )
)

### <center>Macierze pomyłek</center>

In [None]:
# Hold-out predictions to compare, with the panel title of each.
pred = [rf_pred, lightgbm_pred, xgBoost_pred, np.argmax(avg, axis=1), stacked_pred]
titles = ['Las losowy', 'LightGBM', 'XGBoost', 'avg_lgb_xgb_rf', 'StackingClassifier']

# Grid shape follows the number of panels instead of being hard-coded.
n_cols = 2
n_rows = -(-len(titles) // n_cols)  # ceiling division

# squeeze=False keeps `ax` two-dimensional for any number of panels.
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 12), squeeze=False)

fig.suptitle('Macierze pomyłek dla wybranych klasyfikatorów', fontsize=16, fontweight='bold')

sns.set_style('white')

for i in range(len(titles)):
    xc, yc = divmod(i, n_cols)
    conf_matrix = confusion_matrix(y_test, pred[i])
    # Transposed so that rows are predicted labels and columns are true labels,
    # matching the axis captions set below.
    sns.heatmap(
        conf_matrix.T,
        annot=True,
        fmt='d',
        cbar=False,
        xticklabels=salary_category_encoder.classes_,
        yticklabels=salary_category_encoder.classes_,
        ax=ax[xc, yc],
        cmap='rocket',
        annot_kws={'size': 16}
    )
    ax[xc, yc].set_title(f'{titles[i]}', fontsize=16, pad=12)

    # Caption the x-axis on the lowest visible panel of each column. The former
    # `xc == 1` test labelled the middle row of the three-row grid and left the
    # bottom-left panel without an x-axis caption.
    is_lowest_in_column = i + n_cols >= len(titles)
    ax[xc, yc].set_xlabel('Rzeczywiste etykiety' if is_lowest_in_column else '', fontdict={'fontsize': 14})
    # Caption the y-axis on the left-hand column only.
    ax[xc, yc].set_ylabel('Przewidziane etykiety' if yc == 0 else '', fontdict={'fontsize': 14})
    ax[xc, yc].grid(False)

# Hide grid cells that have no classifier assigned.
for unused_axis in ax.flat[len(titles):]:
    unused_axis.set_axis_off()

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

### <center>Krzywe ROC</center>

In [None]:
# One-hot encode the hold-out labels so every class can be scored one-vs-rest.
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)

# Human-readable class names, in encoded-label order.
class_of_interest = salary_category_encoder.inverse_transform(class_labels)

# One column per model, one row per class.
models = [rf, lightgbm, xgBoost, stacked]
model_pred_proba = [rf_pred_proba, lgb_pred_proba, xgBoost_pred_proba, stacked_pred_proba]
titles = ['Las losowy' , 'LightGBM', 'XGBoost', 'StackingClassifier']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#17becf']

fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(16, 12))
fig.suptitle('Krzywe ROC One-vs-Rest dla wszystkich klasyfikatorów', fontsize=16, fontweight='bold')

for mi, (model_title, proba) in enumerate(zip(titles, model_pred_proba)):
    # Per-class AUC of this model, shown in the column legend.
    model_auc_scores = [
        roc_auc_score(y_onehot_test[:, class_id], proba[:, class_id])
        for class_id in range(len(class_labels))
    ]

    # Hand-built legend: one coloured entry per class plus the chance line.
    legend_elements = [
        Line2D([0], [0], color=colors[class_id], label=f'{class_name} (AUC = {model_auc_scores[class_id]:.3f})')
        for class_id, class_name in enumerate(class_of_interest)
    ] + [
        Line2D([0], [0], linestyle='dashed', color='black', label='Losowy wybór (AUC = 0.5)')
    ]

    for class_id in range(len(class_labels)):
        panel = ax[class_id, mi]
        class_name = class_of_interest[class_id]
        RocCurveDisplay.from_predictions(
            y_onehot_test[:, class_id],
            proba[:, class_id],
            name=f'{class_name} vs Rest',
            plot_chance_level=True,
            ax=panel,
            color=colors[class_id],
        )
        panel.grid(True)
        panel.set_label(class_name)
        panel.set_title(f'{class_name} vs Rest', fontsize=14)
        panel.set_xlabel('False Positive Rate')
        panel.set_ylabel('True Positive Rate')
        # Drop the per-panel legend; the shared one above the column replaces it.
        panel.get_legend().remove()

    # Column header: shared legend and the model name above the top panel.
    top_panel = ax[0, mi]
    top_panel.legend(handles=legend_elements, loc='lower center', bbox_to_anchor=(0.5, 1.3), fontsize=10, title='Klasa (One-vs-Rest)', title_fontsize=11, frameon=True)
    top_panel.text(0.5, 2, f'{model_title}', ha='center', va='center', fontsize=17)

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

### <center>Krzywe uczenia</center>

In [None]:
# 2x2 grid with shared axes so the four curves are directly comparable.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18, 20), sharex=True, sharey=True)
fig.suptitle('Krzywe uczenia dla wybranych klasyfikatorów', fontsize=16, fontweight='bold')

models = [rf, lightgbm, xgBoost, stacked]
titles = ['Las losowy', 'LightGBM', 'XGBoost', 'StackingClassifier']

# Ten training-set fractions from 10% to 100%.
train_fractions = np.linspace(0.1, 1.0, 10)

for i, (estimator, title) in enumerate(zip(models, titles)):
    x1, y1 = divmod(i, 2)
    panel = ax[x1, y1]
    # 10-fold CV accuracy of this estimator at each training-set fraction.
    LearningCurveDisplay.from_estimator(
        estimator=estimator,
        X=X,
        y=y,
        cv=10,
        scoring='accuracy',
        train_sizes=train_fractions,
        line_kw={'marker': 'o'},
        ax=panel,
        n_jobs=-1,
    )
    panel.grid(True)
    panel.set_title(title, fontsize=16, pad=10)
    panel.set_ylabel('Wartość dokładności')
    # Only the last panel carries the x-axis caption.
    is_last_panel = i == len(titles) - 1
    panel.set_xlabel('Liczba danych zbioru treningowego' if is_last_panel else '')

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

#### Fit na X, y

In [None]:
# Refit every model on the complete labelled data (X, y) rather than only the
# X_train part, before predicting the competition test set below.
lightgbm.fit(X, y)
rf.fit(X, y)
xgBoost.fit(X, y)
# Last expression of the cell: the fitted stacking model is also the cell output.
stacked.fit(X, y)

### <center>Ostateczne predykcje</center>

In [None]:
# Predictions of the three base models for the competition test set:
# hard labels first, then class probabilities, one model at a time.
final_lgb_pred = lightgbm.predict(test)
final_lgb_pred_proba = lightgbm.predict_proba(test)

final_xgb_pred = xgBoost.predict(test)
final_xgb_pred_proba = xgBoost.predict_proba(test)

final_rf_pred = rf.predict(test)
final_rf_pred_proba = rf.predict_proba(test)

# Soft voting: unweighted mean of the three probability matrices.
final_avg = (final_lgb_pred_proba + final_xgb_pred_proba + final_rf_pred_proba) / 3

# Display the averaged probabilities as the cell output.
final_avg

In [None]:
# Hard labels and class probabilities of the stacked model for the competition
# test set; the labels are what the submission cell writes out.
final_stacked_pred = stacked.predict(test)
final_stacked_pred_proba = stacked.predict_proba(test)

# Display the probability matrix as the cell output.
final_stacked_pred_proba

#### Sztuczka z połączeniem test z train

Nie daje lepszych wyników. Wynik jest taki sam: 76.3466

In [None]:
# Pseudo-labelling experiment: label the test rows with the stacked model's own
# predictions, append them to the training data and refit the stack.

# Build the labelled copy with assign() so `test` itself is never modified.
# Adding and later dropping a temporary column on `test` would leave the target
# column behind if anything in between failed.
pseudo_labels = salary_category_encoder.inverse_transform(np.argmax(final_stacked_pred_proba, axis=1))
pseudo_labelled_test = test.assign(salary_category=pseudo_labels)

whole = pd.concat([train, pseudo_labelled_test], axis=0)

# Concatenating categorical columns whose category sets differ yields a
# non-categorical column. Restore the dtype for every categorical feature
# rather than a single hard-coded one; this is a no-op for columns that are
# still categorical. astype() with a mapping returns a new frame, so the
# duplicated index labels produced by the concat are not involved.
whole = whole.astype({column: 'category' for column in my_categories})

refit_X = whole.drop('salary_category', axis=1)
refit_y = salary_category_encoder.transform(whole['salary_category'])

# NOTE: this refits `stacked` in place on train + pseudo-labelled test rows.
stacked.fit(refit_X, refit_y)

train_test_stacked = stacked.predict(test)

In [None]:
def prepare_submission(predictions, name):
    """Write a submission CSV to ``submissions/<name>.csv``.

    Parameters
    ----------
    predictions
        Encoded class labels, one per entry of the notebook-level ``test_obs``;
        they are decoded with ``salary_category_encoder.inverse_transform``.
    name : str
        File name without the ``.csv`` extension.

    Notes
    -----
    Reads the notebook-level ``test_obs`` and ``salary_category_encoder``.
    The file has the columns ``obs`` and ``salary_category`` and no index column.
    """
    # exist_ok=True makes the directory creation idempotent and avoids the
    # check-then-create sequence of os.listdir() followed by os.mkdir().
    os.makedirs('submissions', exist_ok=True)
    submission = pd.DataFrame(
        data={
            'obs': test_obs,
            'salary_category': salary_category_encoder.inverse_transform(predictions)
        }
    )
    submission.to_csv(os.path.join('submissions', f'{name}.csv'), index=False)

# Only the stacked-model submission is written; the soft-voting average and the
# pseudo-labelled refit are kept below as disabled alternatives.
# prepare_submission(np.argmax(final_avg, axis=1), 'average_lgb_xgboost_rf')
prepare_submission(final_stacked_pred, 'stacked_lgb_xgb_rf')
# prepare_submission(train_test_stacked, 'train_test_stacked')

#### Informacje o modelach

In [None]:
# Record the hold-out fraction and the parameters of the three base models in a
# timestamped text file under models/.

# Idempotent directory creation, replacing the os.listdir() check + os.mkdir().
os.makedirs('models', exist_ok=True)

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Section heading -> model, in the order the sections are written.
model_sections = [
    ('Random Forest', rf),
    ('LightGBM', lightgbm),
    ('XGBoost', xgBoost),
]

with open(os.path.join('models', f'model_info_{timestamp}.txt'), 'w') as f:
    f.write(f'Test size: {test_size}\n\n')

    for section_index, (section_title, section_model) in enumerate(model_sections):
        # Every section after the first is preceded by a blank line, which
        # reproduces the previous file layout byte for byte.
        separator = '' if section_index == 0 else '\n'
        f.write(f'{separator}{section_title} Parameters:\n')
        for key, value in section_model.get_params().items():
            f.write(f'"{key}": {value},\n')