In [None]:
import os

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Read the prepared train/test tables from the 'prepared' directory.
prepared_dir = 'prepared'
train = pd.read_csv(os.path.join(prepared_dir, 'train.csv'))
test = pd.read_csv(os.path.join(prepared_dir, 'test.csv'))

# Keep the 'obs' column aside (it is written back into the submission file)
# and remove it from the test feature frame.
test_obs = test.pop('obs')

# Columns handled as pandas categoricals in both frames.
my_categories = ['job_title', 'feature_1']
category_dtypes = {column: 'category' for column in my_categories}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

#### Podział na 3 zbiory do klasyfikacji binarnej

In [None]:
# Features and the original multi-class target.
X = train.drop(columns=['salary_category'])
y = train['salary_category']

# Sequential (unshuffled) hold-out split: first 80% of rows for training,
# the remaining rows for evaluation.
max_size = len(X)
train_size = 0.8
split_at = int(max_size * train_size)
X_train, X_test = X.iloc[:split_at, :], X.iloc[split_at:, :]

# One-vs-rest targets: keep the class of interest, relabel everything else as 'Other'.
y_high = y.where(y == 'High', 'Other')
y_low = y.where(y == 'Low', 'Other')
y_medium = y.where(y == 'Medium', 'Other')

# Encode each binary target to integers and split it at the same row index.
# NOTE(review): LabelEncoder orders labels alphabetically, so 'Other' should
# end up as 1 and the named class as 0 in every pair - confirm this is the
# intended positive class for the F1 metric used during tuning.
high_encoder = LabelEncoder()
y_high_encoded = high_encoder.fit_transform(y_high)
y_high_train, y_high_test = y_high_encoded[:split_at], y_high_encoded[split_at:]

low_encoder = LabelEncoder()
y_low_encoded = low_encoder.fit_transform(y_low)
y_low_train, y_low_test = y_low_encoded[:split_at], y_low_encoded[split_at:]

medium_encoder = LabelEncoder()
y_medium_encoded = medium_encoder.fit_transform(y_medium)
y_medium_train, y_medium_test = y_medium_encoded[:split_at], y_medium_encoded[split_at:]

In [None]:
# Class balance of the High-vs-Other target: np.unique returns the sorted
# encoded labels and how many samples carry each of them.
unique_high, counts_high = np.unique(y_high_encoded, return_counts=True)

counts_high

In [None]:
# Class balance of the Low-vs-Other target: np.unique returns the sorted
# encoded labels and how many samples carry each of them.
unique_low, counts_low = np.unique(y_low_encoded, return_counts=True)

counts_low

In [None]:
# Class balance of the Medium-vs-Other target: np.unique returns the sorted
# encoded labels and how many samples carry each of them.
unique_medium, counts_medium = np.unique(y_medium_encoded, return_counts=True)

counts_medium

#### Klasa High vs Rest

In [None]:
def custom_f1_score(preds, eval_data):
    """Custom LightGBM evaluation function reporting F1 at a 0.5 cut-off.

    Parameters
    ----------
    preds : array-like
        Raw model outputs; values strictly greater than 0.5 become label 1,
        everything else label 0.
    eval_data : object exposing ``get_label()``
        Evaluation dataset holding the true labels.

    Returns
    -------
    tuple
        ``('f1', score, True)`` - metric name, metric value and a flag
        saying that higher values are better.

    Notes
    -----
    ``f1_score`` is called with its default arguments.
    NOTE(review): with the label encoding used in this notebook label 1 appears
    to be 'Other' - confirm this is the class the F1 should be measured on.
    """
    labels = eval_data.get_label()
    hard_predictions = (preds > 0.5).astype(int)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

def objective_lgb(trial):
    """Optuna objective for the High-vs-Other LightGBM classifier.

    Samples ``max_bin`` and the booster hyper-parameters from ``trial``, runs a
    10-fold stratified cross-validation on ``X_train`` / ``y_high_train`` and
    scores it with ``custom_f1_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Fold-averaged validation F1 after the last boosting round, i.e. the
        score of a model trained with the sampled ``n_estimators``.
    """
    # max_bin is a Dataset-level setting, so it is sampled while building the Dataset.
    dataset = lgb.Dataset(
        X_train,
        y_high_train,
        params={
            'max_bin': trial.suggest_int('max_bin', 32, 256),
            'feature_pre_filter': False,
        },
    )

    params = {
        # Fixed settings.
        'objective': 'binary',
        'boosting': 'gbdt',
        'n_jobs': -1,
        'metric': 'None',  # built-in metrics off; only the custom F1 is evaluated
        'random_state': 81,
        'verbose': -1,
        'reg_alpha': 1e-3,
        'is_unbalance': True,
        # Tuned settings.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'max_depth': trial.suggest_int('max_depth', 8, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 50),
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq`
        # (bagging_freq) is > 0, which is not set here or in the final model -
        # confirm whether row subsampling was meant to be active.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    cv_results = lgb.cv(
        params=params,
        train_set=dataset,
        folds=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        feval=custom_f1_score,
        metrics='None',
        return_cvbooster=False,
    )

    # 'valid f1-mean' holds one fold-averaged score per boosting round. The
    # final model is trained for all sampled rounds, so the last entry is the
    # relevant score; averaging over rounds would mix in the early, weak models.
    return float(cv_results['valid f1-mean'][-1])

# Maximise the cross-validated F1 over 100 trials. The TPE sampler is seeded
# so that the sequence of proposed hyper-parameters can be reproduced after a
# kernel restart (an unseeded sampler gives a different search on every run).
study_lgb_high = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryLightGBM_High',
    sampler=optuna.samplers.TPESampler(seed=81),
)
study_lgb_high.optimize(objective_lgb, n_trials=100)

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# Objective value of every trial of the High-vs-Other study, in trial order.
optuna.visualization.plot_optimization_history(study_lgb_high)

In [None]:
# Slice plot: objective value against each sampled hyper-parameter (High-vs-Other study).
optuna.visualization.plot_slice(study_lgb_high)

In [None]:
# Estimated importance of each hyper-parameter for the objective (High-vs-Other study).
optuna.visualization.plot_param_importances(study_lgb_high)

#### Najlepszy lgb dla klasy High

In [None]:
# Hyper-parameter values sampled by the best trial of the High-vs-Other study.
best_high_params = study_lgb_high.best_params

best_high_params

In [None]:
# Fixed LightGBM settings; they mirror the non-tuned entries used inside
# objective_lgb and are combined with the tuned values from the study.
standard_params = dict(
    objective='binary',
    boosting='gbdt',
    n_jobs=-1,
    metric='None',
    random_state=81,
    verbose=-1,
    reg_alpha=1e-3,
    is_unbalance=True,
)
# Tuned values take precedence over the fixed ones on key clashes.
best_high_params = dict(standard_params, **best_high_params)

# Fit the High-vs-Other model on the training part and score the hold-out part.
lightgbm_high = lgb.LGBMClassifier(**best_high_params)
lightgbm_high.fit(X_train, y_high_train)
lightgbm_high_pred = lightgbm_high.predict(X_test)

# Report rows are named after the original labels present in the hold-out target.
high_report_names = high_encoder.inverse_transform(np.unique(y_high_test))
print(classification_report(y_high_test, lightgbm_high_pred, target_names=high_report_names))

#### Klasa Low vs Rest

In [None]:
def objective_lgb(trial):
    """Optuna objective for the Low-vs-Other LightGBM classifier.

    Samples ``max_bin`` and the booster hyper-parameters from ``trial``, runs a
    10-fold stratified cross-validation on ``X_train`` / ``y_low_train`` and
    scores it with ``custom_f1_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Fold-averaged validation F1 after the last boosting round, i.e. the
        score of a model trained with the sampled ``n_estimators``.
    """
    # max_bin is a Dataset-level setting, so it is sampled while building the Dataset.
    dataset = lgb.Dataset(
        X_train,
        y_low_train,
        params={
            'max_bin': trial.suggest_int('max_bin', 32, 256),
            'feature_pre_filter': False,
        },
    )

    params = {
        # Fixed settings.
        'objective': 'binary',
        'boosting': 'gbdt',
        'n_jobs': -1,
        'metric': 'None',  # built-in metrics off; only the custom F1 is evaluated
        'random_state': 81,
        'verbose': -1,
        'reg_alpha': 1e-3,
        'is_unbalance': True,
        # Tuned settings.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'max_depth': trial.suggest_int('max_depth', 8, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 50),
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq`
        # (bagging_freq) is > 0, which is not set here or in the final model -
        # confirm whether row subsampling was meant to be active.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    cv_results = lgb.cv(
        params=params,
        train_set=dataset,
        folds=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        feval=custom_f1_score,
        metrics='None',
        return_cvbooster=False,
    )

    # 'valid f1-mean' holds one fold-averaged score per boosting round. The
    # final model is trained for all sampled rounds, so the last entry is the
    # relevant score; averaging over rounds would mix in the early, weak models.
    return float(cv_results['valid f1-mean'][-1])

# Maximise the cross-validated F1 over 100 trials. The TPE sampler is seeded
# so that the sequence of proposed hyper-parameters can be reproduced after a
# kernel restart (an unseeded sampler gives a different search on every run).
study_lgb_low = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryLightGBM_Low',
    sampler=optuna.samplers.TPESampler(seed=81),
)
study_lgb_low.optimize(objective_lgb, n_trials=100)

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# Objective value of every trial of the Low-vs-Other study, in trial order.
optuna.visualization.plot_optimization_history(study_lgb_low)

In [None]:
# Slice plot: objective value against each sampled hyper-parameter (Low-vs-Other study).
optuna.visualization.plot_slice(study_lgb_low)

In [None]:
# Estimated importance of each hyper-parameter for the objective (Low-vs-Other study).
optuna.visualization.plot_param_importances(study_lgb_low)

#### Najlepszy lgb dla klasy Low

In [None]:
# Hyper-parameter values sampled by the best trial of the Low-vs-Other study.
best_low_params = study_lgb_low.best_params

best_low_params

In [None]:
# Fixed LightGBM settings; they mirror the non-tuned entries used inside
# objective_lgb and are combined with the tuned values from the study.
standard_params = dict(
    objective='binary',
    boosting='gbdt',
    n_jobs=-1,
    metric='None',
    random_state=81,
    verbose=-1,
    reg_alpha=1e-3,
    is_unbalance=True,
)
# Tuned values take precedence over the fixed ones on key clashes.
best_low_params = dict(standard_params, **best_low_params)

# Fit the Low-vs-Other model on the training part and score the hold-out part.
lightgbm_low = lgb.LGBMClassifier(**best_low_params)
lightgbm_low.fit(X_train, y_low_train)
lightgbm_low_pred = lightgbm_low.predict(X_test)

# Report rows are named after the original labels present in the hold-out target.
low_report_names = low_encoder.inverse_transform(np.unique(y_low_test))
print(classification_report(y_low_test, lightgbm_low_pred, target_names=low_report_names))

#### Klasa Medium vs Rest

In [None]:
def objective_lgb(trial):
    """Optuna objective for the Medium-vs-Other LightGBM classifier.

    Samples ``max_bin`` and the booster hyper-parameters from ``trial``, runs a
    10-fold stratified cross-validation on ``X_train`` / ``y_medium_train`` and
    scores it with ``custom_f1_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Fold-averaged validation F1 after the last boosting round, i.e. the
        score of a model trained with the sampled ``n_estimators``.
    """
    # max_bin is a Dataset-level setting, so it is sampled while building the Dataset.
    dataset = lgb.Dataset(
        X_train,
        y_medium_train,
        params={
            'max_bin': trial.suggest_int('max_bin', 32, 256),
            'feature_pre_filter': False,
        },
    )

    params = {
        # Fixed settings.
        'objective': 'binary',
        'boosting': 'gbdt',
        'n_jobs': -1,
        'metric': 'None',  # built-in metrics off; only the custom F1 is evaluated
        'random_state': 81,
        'verbose': -1,
        'reg_alpha': 1e-3,
        'is_unbalance': True,
        # Tuned settings.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 250),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'max_depth': trial.suggest_int('max_depth', 8, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 50),
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq`
        # (bagging_freq) is > 0, which is not set here or in the final model -
        # confirm whether row subsampling was meant to be active.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    cv_results = lgb.cv(
        params=params,
        train_set=dataset,
        folds=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        feval=custom_f1_score,
        metrics='None',
        return_cvbooster=False,
    )

    # 'valid f1-mean' holds one fold-averaged score per boosting round. The
    # final model is trained for all sampled rounds, so the last entry is the
    # relevant score; averaging over rounds would mix in the early, weak models.
    return float(cv_results['valid f1-mean'][-1])

# Maximise the cross-validated F1 over 100 trials. The TPE sampler is seeded
# so that the sequence of proposed hyper-parameters can be reproduced after a
# kernel restart (an unseeded sampler gives a different search on every run).
study_lgb_medium = optuna.create_study(
    direction='maximize',
    study_name='EngineerSalaryLightGBM_Medium',
    sampler=optuna.samplers.TPESampler(seed=81),
)
study_lgb_medium.optimize(objective_lgb, n_trials=100)

#### Wizualizacja procesu optymalizacji hiperparametrów

In [None]:
# Objective value of every trial of the Medium-vs-Other study, in trial order.
optuna.visualization.plot_optimization_history(study_lgb_medium)

In [None]:
# Slice plot: objective value against each sampled hyper-parameter (Medium-vs-Other study).
optuna.visualization.plot_slice(study_lgb_medium)

In [None]:
# Estimated importance of each hyper-parameter for the objective (Medium-vs-Other study).
optuna.visualization.plot_param_importances(study_lgb_medium)

#### Najlepszy lgb dla klasy Medium

In [None]:
# Hyper-parameter values sampled by the best trial of the Medium-vs-Other study.
best_medium_params = study_lgb_medium.best_params

best_medium_params

In [None]:
# Fixed LightGBM settings; they mirror the non-tuned entries used inside
# objective_lgb and are combined with the tuned values from the study.
standard_params = dict(
    objective='binary',
    boosting='gbdt',
    n_jobs=-1,
    metric='None',
    random_state=81,
    verbose=-1,
    reg_alpha=1e-3,
    is_unbalance=True,
)
# Tuned values take precedence over the fixed ones on key clashes.
best_medium_params = dict(standard_params, **best_medium_params)

# Fit the Medium-vs-Other model on the training part and score the hold-out part.
lightgbm_medium = lgb.LGBMClassifier(**best_medium_params)
lightgbm_medium.fit(X_train, y_medium_train)
lightgbm_medium_pred = lightgbm_medium.predict(X_test)

# Report rows are named after the original labels present in the hold-out target.
medium_report_names = medium_encoder.inverse_transform(np.unique(y_medium_test))
print(classification_report(y_medium_test, lightgbm_medium_pred, target_names=medium_report_names))

#### Pomoc w ostatecznej predykcji

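- wczytanie wyników poszczególnych klasyfikacji binarnych
- wczytanie prawdopodobieństw klas ze stackingu
- wykorzystanie margin sampling do oceny niepewnych przypadków klasyfikacji
- wskazanie progu marginesu niepewności t
- korekta niepewnych predykcji na podstawie wyników klasyfikatorów binarnych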

In [None]:
# Predict the competition test set with each one-vs-rest model and map the
# integer outputs back to their string labels (class name or 'Other').
high_labels = high_encoder.inverse_transform(lightgbm_high.predict(test))
low_labels = low_encoder.inverse_transform(lightgbm_low.predict(test))
medium_labels = medium_encoder.inverse_transform(lightgbm_medium.predict(test))

test_high = pd.DataFrame({'lgb_high': high_labels})
test_low = pd.DataFrame({'lgb_low': low_labels})
test_medium = pd.DataFrame({'lgb_medium': medium_labels})

# One column per binary classifier, rows in the same order as `test`.
binary_class = pd.concat([test_high, test_low, test_medium], axis=1)

binary_class

In [None]:
# Load the stacking model output from be_better.csv. Later cells read its
# 'High', 'Low' and 'Medium' columns.
# NOTE(review): presumably these are per-class probabilities in the same row
# order as the test set - confirm against the notebook that writes the file.
stacking_proba = pd.read_csv('be_better.csv')

stacking_proba

In [None]:
# Margin threshold: rows whose top-two score gap is below t are treated as
# uncertain by the later cells.
t = 0.06

# Use only the class-score columns, so that any extra column in the CSV (or
# columns added by a previous run of this cell) cannot leak into the margin.
proba_columns = ['High', 'Low', 'Medium']
proba_array = stacking_proba[proba_columns].to_numpy()

# Sort each row in descending order and take best minus second best.
sorted_proba = -np.sort(-proba_array, axis=1)
margin = sorted_proba[:, 0] - sorted_proba[:, 1]

stacking_proba['margin_sampling'] = margin

# Attach the one-vs-rest predictions; drop stale copies first so re-running
# the cell does not duplicate the lgb_* columns.
stacking_proba = pd.concat(
    [stacking_proba.drop(columns=binary_class.columns, errors='ignore'), binary_class],
    axis=1,
)

# Number of binary classifiers (0-3) that answered 'Other' for each row.
stacking_proba['count_other'] = (stacking_proba[binary_class.columns] == 'Other').sum(axis=1)

In [None]:
# Encoder fitted on the original salary_category labels; later used to turn
# argmax/argmin column positions back into label strings.
# NOTE(review): this relies on the encoder's sorted label order matching the
# ['High', 'Low', 'Medium'] column order used in the final-prediction cell.
salary_category_encoder = LabelEncoder().fit(train['salary_category'])

def prepare_submission(predictions, name):
    """Build a submission table and save it as ``submissions/<name>.csv``.

    Parameters
    ----------
    predictions : array-like or pandas.Series
        Predicted ``salary_category`` value for every test observation.
    name : str
        File name (without extension) of the CSV written to ``submissions``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``obs`` (taken from the notebook-level
        ``test_obs``) and ``salary_category``.
    """
    # Create the output directory if needed; exist_ok avoids the separate
    # listdir check and is safe when the directory already exists.
    os.makedirs('submissions', exist_ok=True)

    submission = pd.DataFrame(
        data={
            'obs': test_obs,
            'salary_category': predictions,
        }
    )
    submission.to_csv(os.path.join('submissions', f'{name}.csv'), index=False)
    return submission

In [None]:
# Show the rows whose top-two margin is below the threshold t.
stacking_proba.loc[(stacking_proba['margin_sampling'] < t), :]

In [None]:
# Class scores as a plain ndarray so argmax/argmin return positional indices
# (0, 1, 2 for High, Low, Medium), which the encoder maps back to labels.
proba_columns = ['High', 'Low', 'Medium']
class_proba = stacking_proba[proba_columns].to_numpy()

# Row masks, computed once instead of being repeated in every assignment.
uncertain = stacking_proba['margin_sampling'] < t
all_rejected = uncertain & (stacking_proba['count_other'] == 3)  # every binary model said 'Other'
single_vote = uncertain & (stacking_proba['count_other'] == 2)   # exactly one binary model named its class

# Default: the class with the highest stacking score.
stacking_proba['final_pred'] = salary_category_encoder.inverse_transform(class_proba.argmax(axis=1))

# Uncertain rows rejected by all three binary models get the LOWEST-scored class.
# NOTE(review): argmin picks the least likely class - confirm this is the
# intended rule and not a typo for a different tie-break.
stacking_proba.loc[all_rejected, 'final_pred'] = salary_category_encoder.inverse_transform(
    class_proba[all_rejected.to_numpy()].argmin(axis=1)
)

# Uncertain rows where a single binary model claims the row take that model's class.
for column, label in (('lgb_high', 'High'), ('lgb_low', 'Low'), ('lgb_medium', 'Medium')):
    stacking_proba.loc[single_vote & (stacking_proba[column] == label), 'final_pred'] = label

In [None]:
# Number of test rows assigned to each final class.
stacking_proba['final_pred'].value_counts()

In [None]:
# Save the final predictions to submissions/ovr_stacking_proba.csv; the
# returned frame is shown as the cell output.
prepare_submission(stacking_proba['final_pred'], 'ovr_stacking_proba')