In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import LearningCurveDisplay, StratifiedKFold, train_test_split

#### Wczytanie danych

In [None]:
# Load the prepared train/test splits from the `prepared` directory.
train = pd.read_csv(os.path.join('prepared', 'train.csv'))
test = pd.read_csv(os.path.join('prepared', 'test.csv'))

# Columns that exist in only one of the two files and are removed below.
test_to_drop = ['obs']
train_to_drop = ['salary_category']

my_categories = ['job_title', 'feature_1']
# Give both frames the SAME categorical dtype per column. Casting each frame on
# its own produces different category sets whenever a level is missing from one
# file, and pd.concat then silently falls back to `object` dtype for that
# column, which LightGBM does not accept as a feature dtype.
for col in my_categories:
    shared_dtype = (
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .dtype
    )
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

# Reassign instead of mutating in place; the resulting frames are the same.
test = test.drop(columns=test_to_drop)
train = train.drop(columns=train_to_drop)

#### Wstępne połączenie zbiorów

Zbiory są łączone już na tym etapie, ponieważ w dalszej części notatnika zakres wartości w zbiorze train jest ucinany.

In [None]:
# Tag every row with its origin before stacking the frames:
# 1 = row from the train file, 0 = row from the test file.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

# ignore_index=True already yields a fresh 0..n-1 index, so no extra reset is needed.
whole = pd.concat((train, test), ignore_index=True)

# Row count per origin.
whole['is_train'].value_counts()

#### Połączenie zbiorów

In [None]:
# Stack train on top of test; ignore_index=True renumbers the rows 0..n-1.
whole = pd.concat((train, test), ignore_index=True)

#### Proces nauki, walidacji

In [None]:
# Target is the origin flag; every other column is a feature.
y = whole['is_train']
X = whole.drop(columns=['is_train'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-wrap both feature splits as DataFrames carrying X's column order.
X_train, X_test = (
    pd.DataFrame(split, columns=X.columns) for split in (X_train, X_test)
)

In [None]:
# Disabled Optuna hyperparameter search, kept for reference only.
# `define_model` builds an LGBMClassifier from trial-suggested parameters and
# `objective` scores it with 10-fold stratified CV (weighted F1) on
# X_train / y_train; the study runs 50 trials and maximizes that score.
# NOTE(review): `optuna` and `cross_val_score` are not imported in the import
# cell of this notebook - add both imports before uncommenting this cell.
#
# def define_model(trial):
#     params = {
#         'objective': 'binary',
#         'boosting': 'gbdt',
#         'device': 'cpu',
#         'n_jobs': -1,
#         'random_state': 42,
#         'verbosity': -1,
#         'max_bin': trial.suggest_int('max_bin', 70, 250),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 10, 150),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 128),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1, log=True),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#     }
#     return lgb.LGBMClassifier(**params)
#
# def objective(trial):
#     model = define_model(trial)
#     skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#     scores = cross_val_score(model, X_train, y_train, cv=skf, n_jobs=-1, scoring='f1_weighted')
#     return scores.mean()
#
# study_lightgbm = optuna.create_study(direction='maximize', study_name='EngineerSalaryLightGBM', sampler=optuna.samplers.TPESampler())
# study_lightgbm.optimize(objective, n_trials=50)

In [None]:
# Binary classifier that tries to tell train rows from test rows
# (target: `is_train`), fitted on the 80% split and scored on the hold-out.
lightgbm = lgb.LGBMClassifier(**{
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.02,
    'n_estimators': 100,
    'num_leaves': 16,
    'max_depth': 10,
    'reg_alpha': 1e-3,
    'subsample': 0.7,
    # Row subsampling (bagging) is only active when the frequency is non-zero;
    # with the default subsample_freq=0 the `subsample` value above is ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': -1,
    # Re-weight classes inversely to their frequency.
    'class_weight': 'balanced'
}).fit(X_train, y_train)
lightgbm_pred = lightgbm.predict(X_test)

# Per-class precision / recall / F1 on the hold-out split.
print(classification_report(y_test, lightgbm_pred))

In [None]:
# Rank the model's features by importance and keep the ten highest.
lightgbm_importance = (
    pd.DataFrame({
        'feature': lightgbm.feature_name_,
        'importance': lightgbm.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .head(10)
)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bars: importance on x, feature name on y, one colour per feature.
barplot_options = dict(
    x='importance',
    y='feature',
    hue='feature',
    legend=False,
    palette='viridis',
)
ax = sns.barplot(data=lightgbm_importance, **barplot_options)

# Write the numeric importance at the end of every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%g', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 10 najważniejszych cech dla LightGBM', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
# Draw one tree of the fitted model, laid out top-down; no tree_index is
# passed, so the library default decides which tree is shown. Each node is
# annotated with its data percentage.
lgb.plot_tree(
    lightgbm,
    orientation='vertical',
    show_info=['data_percentage'],
    precision=2,
    figsize=(20, 12),
    dpi=200,
)

#### Macierz pomyłek

In [None]:
conf_matrix = confusion_matrix(y_test, lightgbm_pred)

# The matrix is transposed before plotting so that columns hold the actual
# labels and rows the predicted ones, matching the axis captions below.
heatmap_ax = sns.heatmap(
    conf_matrix.T,
    cmap='rocket',
    cbar=False,
    annot=True,
    annot_kws={'size': 16},
    fmt='d',
)
axis_label_font = {'fontsize': 14}
heatmap_ax.set_xlabel('Rzeczywiste etykiety', fontdict=axis_label_font)
heatmap_ax.set_ylabel('Przewidziane etykiety', fontdict=axis_label_font)

#### Krzywa uczenia

In [None]:
# Learning curve of the classifier over 10%..100% of the training folds
# (5-fold CV, accuracy).
LearningCurveDisplay.from_estimator(
    estimator=lightgbm,
    X=X,
    y=y,
    cv=5,
    n_jobs=-1,
    # X / y are ordered train-rows-first, and each training subset is a prefix
    # of the fold's training data. Without shuffling, the small prefixes can
    # contain a single class, and `random_state` is ignored altogether.
    shuffle=True,
    random_state=42,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
    line_kw={'marker': 'o'}
)