In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Columns removed from each split right after loading. Presumably these are the
# columns the two splits do not have in common -- confirm against the CSVs.
test_to_drop = ['obs']
train_to_drop = ['salary_category']

# Read each prepared split and drop its extra column(s) in one expression,
# so the cell can be re-run without mutating an already-trimmed frame.
train = pd.read_csv(os.path.join('prepared', 'train.csv')).drop(columns=train_to_drop)
test = pd.read_csv(os.path.join('prepared', 'test.csv')).drop(columns=test_to_drop)

In [None]:
# Tag every row with its origin (1 = train, 0 = test) so the two splits can
# still be told apart after they are stacked into a single frame.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

# ignore_index=True already gives the stacked frame a fresh 0..n-1 index.
whole = pd.concat([train, test], axis=0, ignore_index=True)

whole.info()

In [None]:
# Number of rows contributed by each split (1 = train, 0 = test).
whole.loc[:, 'is_train'].value_counts()

In [None]:
# Overlay the feature_2 histograms (with KDE curves) of the train and test rows.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

# No `color` argument here: seaborn ignores a single colour whenever `hue` is
# assigned, so passing one only suggested a styling that never took effect.
# NOTE(review): bars show raw counts; if the two splits differ a lot in size,
# stat='density' with common_norm=False may make the shapes easier to compare.
sns.histplot(
    data=whole,
    x='feature_2',
    hue='is_train',
    kde=True,
    bins=20,
    alpha=0.6,
    ax=ax,
)

ax.set_title('Porównanie rozkładu cechy feature_2 w zbiorach Train i Test', fontsize=14)
ax.set_xlabel('feature_2', fontsize=12)
ax.set_ylabel('Liczba przypadków', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split


# The origin flag is the target; every other column is a feature.
y = whole['is_train']
X = whole.drop(columns=['is_train'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-wrap both feature splits as DataFrames carrying X's column order.
feature_columns = X.columns
X_train = pd.DataFrame(X_train, columns=feature_columns)
X_test = pd.DataFrame(X_test, columns=feature_columns)

In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold


def define_model(trial):
    """Build an LGBMClassifier whose tunable hyperparameters come from `trial`.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``; each tunable value is requested from it
            by name.

    Returns:
        An unfitted ``lgb.LGBMClassifier`` configured for binary classification.
    """
    # Values requested from the trial (call order kept stable on purpose).
    # NOTE(review): LightGBM documents that bagging is only enabled when
    # bagging_freq / subsample_freq is non-zero; it is not set here, so
    # `subsample` may have no effect -- confirm.
    searched = {
        'max_bin': trial.suggest_int('max_bin', 70, 250),
        'boosting': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 150),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    # Settings that are identical for every trial.
    fixed = {
        'objective': 'binary',
        'device': 'cpu',
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': -1,
    }
    return lgb.LGBMClassifier(**fixed, **searched)

def objective(trial):
    """Score one hyperparameter configuration for the Optuna study.

    The candidate model is evaluated with 5-fold stratified cross-validation
    on the training portion only (``X_train``, ``y_train``). The hold-out rows
    (``X_test``, ``y_test``) are deliberately excluded so that the later
    hold-out report is not biased by hyperparameters tuned on those same rows.

    Args:
        trial: Optuna trial forwarded to ``define_model``.

    Returns:
        Mean cross-validated accuracy across the five folds.
    """
    model = define_model(trial)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=skf,
        n_jobs=-1,
        scoring='accuracy',
    )
    return scores.mean()

In [None]:
# Seed the TPE sampler so that re-running the notebook proposes the same
# sequence of hyperparameter candidates.
study_lightgbm = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryLightGBM',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lightgbm.optimize(objective, n_trials=50)

In [None]:
# Rebuild the classifier from the best trial of the study, then fit it on the
# training portion of the hold-out split.
best_trial = study_lightgbm.best_trial
lightgbm = define_model(best_trial)

lightgbm.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report


# Per-class precision / recall / F1 of the fitted model on the hold-out rows.
holdout_report = classification_report(y_test, lightgbm.predict(X_test))
print(holdout_report)

In [None]:
# Pair each feature name with its importance and keep the 20 largest.
importance_table = pd.DataFrame(
    {
        'feature': lightgbm.feature_name_,
        'importance': lightgbm.feature_importances_,
    }
)
lightgbm_importance = importance_table.sort_values(by='importance', ascending=False).head(20)

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# hue mirrors y so that every bar gets its own palette colour; the legend
# would only repeat the y tick labels, hence legend=False.
ax = sns.barplot(
    data=lightgbm_importance,
    x='importance',
    y='feature',
    hue='feature',
    legend=False,
    palette='viridis',
    ax=ax,
)

# Write the numeric importance at the end of each bar.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%g', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 20 najważniejszych cech dla LightGBM', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
fig.tight_layout()
plt.show()

In [None]:
# Draw a tree of the fitted booster (no tree_index is passed, so LightGBM's
# default tree is used), annotating nodes with their share of the data.
lgb.plot_tree(
    lightgbm,
    figsize=(20, 12),
    dpi=200,
    precision=2,
    orientation='vertical',
    show_info=['data_percentage'],
)

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of the fitted model on the hold-out rows.
conf_matrix = confusion_matrix(y_test, lightgbm.predict(X_test))

# The matrix is transposed before plotting so that the x-axis carries the
# true labels and the y-axis the predicted ones, matching the axis captions.
heatmap_ax = sns.heatmap(
    conf_matrix.T,
    cmap='rocket',
    cbar=False,
    annot=True,
    fmt='d',
    annot_kws={'size': 16},
)
heatmap_ax.set_xlabel('Rzeczywiste etykiety', fontdict={'fontsize': 14})
heatmap_ax.set_ylabel('Przewidziane etykiety', fontdict={'fontsize': 14})

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve (accuracy vs. number of training rows) for the tuned model.
#
# `whole` stacks all train rows before all test rows, so `y` is sorted by
# class. The reduced train sizes are taken as a prefix of each fold's training
# indices; without shuffling, those prefixes can contain a single class, and
# `random_state` would be ignored. shuffle=True draws the subsets at random
# and makes random_state=4 take effect.
LearningCurveDisplay.from_estimator(
    lightgbm,
    X,
    y,
    cv=StratifiedKFold(n_splits=5),
    scoring='accuracy',
    n_jobs=-1,
    shuffle=True,
    random_state=4,
)