In [None]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LearningCurveDisplay
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
# Columns removed right after loading: one list per frame.
test_to_drop = ['obs']
train_to_drop = ['salary_category']

# Read both prepared CSV files and drop the listed columns in one chained step.
prepared_dir = 'prepared'
train = pd.read_csv(os.path.join(prepared_dir, 'train.csv')).drop(columns=train_to_drop)
test = pd.read_csv(os.path.join(prepared_dir, 'test.csv')).drop(columns=test_to_drop)

#### Wstępne połączenie zbiorów

Zbiory łączymy wstępnie już tutaj, ponieważ niżej zakres wartości w zbiorze train jest przycinany.

In [None]:
# Mark the origin of every row (1 = train, 0 = test) so both frames can be
# told apart after they are stacked into a single frame.
for frame, flag in ((train, 1), (test, 0)):
    frame['is_train'] = flag

# ignore_index=True already yields a fresh 0..n-1 index for the stacked frame.
whole = pd.concat([train, test], axis=0, ignore_index=True)

# Cell output: number of rows per origin flag.
whole['is_train'].value_counts()

#### Porównanie rozkładu cechy feature_2 w zbiorach Train i Test

In [None]:
# Compare the distribution of feature_2 between the train and test rows of `whole`.
plt.figure(figsize=(10, 6))
sns.set_style('whitegrid')

# No `color` argument here: seaborn ignores it once `hue` is assigned,
# the two groups are coloured from the palette instead.
sns.histplot(
    data=whole,
    x='feature_2',
    hue='is_train',
    kde=True,
    bins=20,
    alpha=0.6
)

plt.title('Porównanie rozkładu cechy feature_2 w zbiorach Train i Test', fontsize=14)
plt.xlabel('feature_2', fontsize=12)
plt.ylabel('Liczba przypadków', fontsize=12)
plt.tight_layout()
plt.show()

# Remove feature_2 from both frames only after the figure is finished.
# errors='ignore' means no KeyError is raised when the column has already
# been removed (e.g. the cell is executed a second time), and re-assignment
# replaces the previous in-place mutation.
train = train.drop(columns=['feature_2'], errors='ignore')
test = test.drop(columns=['feature_2'], errors='ignore')

### Wykorzystanie testów U Manna-Whitneya i Kołmogorowa-Smirnowa

Co wiem:
- Train i test są niezależnymi grupami.

Cel: wskazanie, które cechy różnią się najbardziej między zbiorami Train i Test.

In [None]:
# Disabled cell, kept for reference only (nothing below is executed).
# It would run a two-sided Mann-Whitney U test per column between train and
# test and keep the columns with p-value <= 0.05, sorted by p-value, in
# `check_for_diff`.
# NOTE(review): 'columns' uses train.columns[:-1] while the p-value list skips
# 'is_train' by name - both agree only if 'is_train' is the last column; confirm
# before re-enabling.
# statistical_tests = pd.DataFrame(
#     data={
#         'columns': train.columns[:-1],
#         'umann_pvalue': [mannwhitneyu(train[column], test[column], nan_policy='omit', use_continuity=False, method='asymptotic', alternative='two-sided').pvalue for column in train.columns if column != 'is_train']
#     }
# )
#
# check_for_diff = statistical_tests.loc[statistical_tests['umann_pvalue'] <= 0.05, :].sort_values(by='umann_pvalue')
#
# check_for_diff

#### Porównanie wybranych kolumn

In [None]:
# train[check_for_diff['columns']].describe()

In [None]:
# test[check_for_diff['columns']].describe()

#### Czy są jakieś widoczne różnice między zbiorami test i train?

In [None]:
# Disabled cell, kept for reference only (nothing below is executed).
# It would draw a 6x2 grid of train-vs-test histograms, one panel per column
# listed in `check_for_diff['columns']`; it needs `check_for_diff`, which is
# only defined in commented-out code.
# fig, ax = plt.subplots(nrows=6, ncols=2, figsize=(10, 20))
#
# plt.suptitle('Porównanie rozkładu wybranych cech przez testy U Manna Whitneya')
#
# for i, column in enumerate(check_for_diff['columns']):
#     x, y = divmod(i, 2)
#     sns.set_style('whitegrid')
#     sns.histplot(
#         data=whole,
#         x=column,
#         hue='is_train',
#         kde=True,
#         color='steelblue',
#         bins=20,
#         alpha=0.6,
#         ax=ax[x, y],
#     )
#     ax[x, y].set_ylabel('Liczba przypadków' if y % 2 == 0 else '', fontsize=12)
#
# plt.tight_layout()
# plt.show()

#### Zmień przedziały w train, aby się zgadzały z test

In [None]:
# Disabled cell, kept for reference only (nothing below is executed).
# `cut_ranges` would keep only the train rows whose value in `column` lies
# within [min, max] of the same column in test; the loop would apply it for
# every column listed in `check_for_diff['columns']`, overwriting `train`.
# def cut_ranges(column):
#     test_min, test_max = test[column].min(), test[column].max()
#     return train[(train[column] >= test_min) & (train[column] <= test_max)].reset_index(drop=True)
#
# for column in check_for_diff['columns']:
#     train = cut_ranges(column)
#
# train.info()

#### Połączenie zbiorów

In [None]:
# Disabled: random down-sampling of train by the row-count difference to test.
# train.drop(index=train.sample(n=np.abs(len(train) - len(test))).index, inplace=True)

# Stack the current train and test frames again; ignore_index=True gives the
# result a fresh 0..n-1 index.
frames_to_stack = [train, test]
whole = pd.concat(frames_to_stack, axis=0, ignore_index=True)

#### Proces nauki, walidacji

In [None]:
# Target: the origin flag; features: every other column of `whole`.
y = whole['is_train']
X = whole.drop(columns=['is_train'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-wrap both feature parts as DataFrames with the column order of X.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

In [None]:
# Disabled cell, kept for reference only (nothing below is executed).
# `define_model` would build an LGBMClassifier from Optuna-suggested
# hyper-parameters; `objective` would score it with the mean weighted F1 of a
# 10-fold stratified CV on X_train / y_train; the study would maximise that
# score over 50 trials.
# def define_model(trial):
#     params = {
#         'objective': 'binary',
#         'boosting': 'gbdt',
#         'device': 'cpu',
#         'n_jobs': -1,
#         'random_state': 42,
#         'verbosity': -1,
#         'max_bin': trial.suggest_int('max_bin', 70, 250),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 10, 150),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 128),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1, log=True),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#     }
#     return lgb.LGBMClassifier(**params)
#
# def objective(trial):
#     model = define_model(trial)
#     skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#     scores = cross_val_score(model, X_train, y_train, cv=skf, n_jobs=-1, scoring='f1_weighted')
#     return scores.mean()
#
# study_lightgbm = optuna.create_study(direction='maximize', study_name='EngineerSalaryLightGBM', sampler=optuna.samplers.TPESampler())
# study_lightgbm.optimize(objective, n_trials=50)

In [None]:
# Classifier that tries to tell train rows (is_train=1) from test rows
# (is_train=0) using the remaining features.
# NOTE(review): learning_rate=0.9 is far above the range explored in the
# disabled Optuna search (1e-3 .. 0.1) - confirm it is intentional.
lightgbm_params = {
    'objective': 'binary',
    # Documented constructor argument of LGBMClassifier (was the 'boosting' alias).
    'boosting_type': 'gbdt',
    'learning_rate': 0.9,
    'n_estimators': 120,
    'num_leaves': 160,
    'max_depth': 30,
    'subsample': 0.7,
    # Row subsampling is only performed when subsample_freq > 0; with the
    # default of 0 the 'subsample' value above would be silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.2,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': -1,
    'class_weight': 'balanced'
}
lightgbm = lgb.LGBMClassifier(**lightgbm_params).fit(X_train, y_train)

# Per-class precision / recall / F1 on the hold-out part.
print(classification_report(y_test, lightgbm.predict(X_test)))

In [None]:
# Pair every feature name with its importance from the fitted model and keep
# the 20 rows with the highest importance.
importance_table = pd.DataFrame({
    'feature': lightgbm.feature_name_,
    'importance': lightgbm.feature_importances_,
})
lightgbm_importance = importance_table.sort_values(by='importance', ascending=False).head(20)

plt.figure(figsize=(12, 8))
sns.set_style('whitegrid')

# Horizontal bars, one colour per feature, no legend.
ax = sns.barplot(
    data=lightgbm_importance,
    x='importance',
    y='feature',
    hue='feature',
    legend=False,
    palette='viridis',
)

# Print the importance value at the end of every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%g', label_type='edge', fontsize=10, padding=3)

ax.set_title('Top 20 najważniejszych cech dla LightGBM', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=12)
ax.set_ylabel('Cecha', fontsize=12)

plt.xticks(fontsize=10)
plt.yticks(fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
# Draw one tree of the fitted booster (no tree_index is passed, so the library
# default is used), top-down, annotating nodes with their share of the data.
lgb.plot_tree(
    lightgbm,
    figsize=(20, 12),
    dpi=200,
    precision=2,
    orientation='vertical',
    show_info=['data_percentage'],
)

#### Macierz pomyłek

In [None]:
# Confusion matrix of the hold-out predictions (rows: true, columns: predicted).
holdout_pred = lightgbm.predict(X_test)
conf_matrix = confusion_matrix(y_test, holdout_pred)

# The matrix is transposed for display, so true labels run along the x axis
# and predicted labels along the y axis - matching the axis captions below.
heatmap_ax = sns.heatmap(
    conf_matrix.T,
    annot=True,
    annot_kws={'size': 16},
    fmt='d',
    cmap='rocket',
    cbar=False,
)
heatmap_ax.set_xlabel('Rzeczywiste etykiety', fontdict={'fontsize': 14})
heatmap_ax.set_ylabel('Przewidziane etykiety', fontdict={'fontsize': 14})

#### Krzywa uczenia

In [None]:
# Learning curve of the classifier on the full stacked data set.
# X / y come from concat([train, test]), i.e. the rows are ordered by label.
# Training subsets are taken as prefixes of each fold's training indices, so
# without shuffling the small train sizes would contain mostly or only one
# class. shuffle=True randomises the prefixes (and is what makes random_state
# take effect); the splitter is shuffled with the same seed as well.
LearningCurveDisplay.from_estimator(
    lightgbm,
    X,
    y,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    shuffle=True,
    random_state=42,
    scoring='f1_weighted',
    train_sizes=np.linspace(0.1, 1.0, 10),
    line_kw={'marker': 'o'}
)