In [None]:
import os
from datetime import datetime

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#### Wczytanie przygotowanych danych

In [None]:
# Read the prepared train and test splits from the `prepared` directory.
train = pd.read_csv(os.path.join('prepared', 'train.csv'))
test = pd.read_csv(os.path.join('prepared', 'test.csv'))

# Label encoder for the target column, fitted on the training targets.
salary_category_encoder = LabelEncoder()
salary_category_encoder.fit(train['salary_category'])

# Detach the `obs` column from the test frame (it is removed in place) and keep
# it aside; the submission files are built from it later.
test_obs = test.pop('obs')

### <center>Proces nauki, walidacji, testowania</center>

In [None]:
# Feature matrix: every training column except the target.
X = train.drop(columns=['salary_category'])
# Target encoded to integer class codes with the encoder fitted on the same column.
y = salary_category_encoder.transform(train['salary_category'])

# Fraction of the training data held out for validation (also written to the
# model-info file at the end of the notebook).
test_size = 0.2

# Stratified, seeded split. `X` is a DataFrame, so the returned feature splits
# are already DataFrames carrying `X.columns`; re-wrapping them in
# `pd.DataFrame(...)` only made a redundant copy and has been dropped.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y
)

#### LightGBM

In [None]:
# Baseline LightGBM classifier with the library's default hyperparameters.
lightgbm = lgb.LGBMClassifier()
lightgbm.fit(X_train, y_train)

# Hold-out report; class names are decoded from the codes present in y_train.
lgb_holdout_pred = lightgbm.predict(X_test)
lgb_class_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, lgb_holdout_pred, target_names=lgb_class_names))

#### Szukanie optymalnych hiperparametrów

In [None]:
# NOTE: this hyperparameter search is disabled. Before re-enabling it, add the
# imports it needs (they are not in the imports cell):
#     import optuna
#     from sklearn.model_selection import StratifiedKFold, cross_val_score
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` > 0 - consider setting it as well.
#
# def define_lightgbm(trial):
#     params = {
#         'objective': 'multiclass',
#         'num_class': len(np.unique(y_train)),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 10, 150),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#         'subsample': trial.suggest_float('subsample', 0.1, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
#     }
#     return lgb.LGBMClassifier(**params)
#
# def objective_lightgbm(trial):
#     lgb_model = define_lightgbm(trial)
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     scores = cross_val_score(lgb_model, X_train, y_train, cv=skf, n_jobs=-1, scoring='accuracy')
#     return scores.mean()
#
# study_lgb = optuna.create_study(direction='maximize', study_name='EngineerSalaryLightGBM', sampler=optuna.samplers.TPESampler())
# study_lgb.optimize(objective_lightgbm, n_trials=100)

In [None]:
# lightgbm = define_lightgbm(study_lgb.best_trial).fit(X_train, y_train)
#
# print(classification_report(y_test, lightgbm.predict(X_test), target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Istotność cech dla LightGBM

In [None]:
# Pair every LightGBM feature name with its importance score, highest score first.
lgb_importance_columns = {
    'names': lightgbm.feature_name_,
    'importance': lightgbm.feature_importances_,
}
importance = pd.DataFrame(lgb_importance_columns).sort_values('importance', ascending=False)

importance

#### Las losowy

In [None]:
# Baseline random forest with default hyperparameters. The seed is fixed (same
# value as the train/validation split) because the forest's bootstrap and
# feature sampling are random: without it every run produces a different model,
# report, feature ranking and submission.
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hold-out report; class names are decoded from the codes present in y_train.
print(classification_report(y_test, random_forest.predict(X_test), target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Istotność cech dla lasu losowego

In [None]:
# Pair every random-forest input feature with its importance score, highest first.
rf_importance_columns = {
    'feature': random_forest.feature_names_in_,
    'importance': random_forest.feature_importances_,
}
random_forest_importance = pd.DataFrame(rf_importance_columns).sort_values(
    'importance', ascending=False
)

random_forest_importance

#### Przygotowanie submission: walidacja uśrednionych predykcji (las losowy + LightGBM)

In [None]:
# Class probabilities of both models on the hold-out split.
rf_pred_proba = random_forest.predict_proba(X_test)
lgb_pred_proba = lightgbm.predict_proba(X_test)

# Equal-weight average of the two probability matrices.
avg = 0.5 * (rf_pred_proba + lgb_pred_proba)

# The most probable class per row is the ensemble's prediction.
ensemble_holdout_pred = avg.argmax(axis=1)
ensemble_class_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, ensemble_holdout_pred, target_names=ensemble_class_names))

#### Fit na X, y

In [None]:
# Refit both models on the whole training set (X, y), i.e. including the rows
# that were held out for validation, before predicting on `test`.
# NOTE: the same model objects are refitted, so re-running the validation cells
# above after this cell would score the models on rows they were trained on.
lightgbm.fit(X, y)
random_forest.fit(X, y)

### <center>Ostateczne predykcje</center>

In [None]:
# Class probabilities of each model on the test features.
final_rf_pred_proba = random_forest.predict_proba(test)
final_lgb_pred_proba = lightgbm.predict_proba(test)

# Hard class predictions of each model on the same features.
final_rf_pred = random_forest.predict(test)
final_lgb_pred = lightgbm.predict(test)

# Equal-weight average of the two probability matrices.
final_avg = 0.5 * (final_rf_pred_proba + final_lgb_pred_proba)

final_avg

In [None]:
def prepare_submission(predictions, name):
    """Write a submission CSV for one set of predicted class codes.

    The encoded predictions are mapped back to their original labels with the
    notebook-level ``salary_category_encoder`` and paired with the ``obs``
    values kept in the notebook-level ``test_obs``. The result is saved as
    ``submissions/<name>.csv`` (relative to the current working directory),
    without the DataFrame index.

    Args:
        predictions: Encoded class labels, one per entry of ``test_obs``, in a
            form accepted by ``salary_category_encoder.inverse_transform``.
        name: File name of the submission, without the ``.csv`` extension.
    """
    # Create the output directory only if it is missing. `exist_ok=True`
    # replaces the former "list the cwd, then mkdir" check, so an already
    # existing directory is never an error.
    os.makedirs('submissions', exist_ok=True)
    submission = pd.DataFrame(
        data={
            'obs': test_obs,
            'salary_category': salary_category_encoder.inverse_transform(predictions)
        }
    )
    submission.to_csv(os.path.join('submissions', f'{name}.csv'), index=False)

# One submission file per model, plus one for the averaged ensemble (the class
# with the highest averaged probability in each row).
for submission_name, label_codes in (
    ('random_forest', final_rf_pred),
    ('lgb', final_lgb_pred),
    ('average_lgb_random_forest', np.argmax(final_avg, axis=1)),
):
    prepare_submission(label_codes, submission_name)

#### Informacje o modelach

In [None]:
# Create the output directory only if it is missing. `exist_ok=True` replaces
# the former "list the cwd, then mkdir" check, so an already existing directory
# is never an error.
os.makedirs('models', exist_ok=True)

# The timestamp in the file name gives every run its own info file.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
model_info_path = os.path.join('models', f'model_info_{timestamp}.txt')

# An explicit encoding keeps the file independent of the platform's default.
with open(model_info_path, 'w', encoding='utf-8') as f:
    f.write(f'Test size: {test_size}\n\n')

    # Disabled: note about PCA applied to the job_desc columns.
    # if job_desc_pca is not None:
    #     f.write(f'PCA na kolumnach job_desc.\n\n')

    # One "key: value" line per hyperparameter reported by get_params().
    f.write('Random Forest Parameters:\n')
    for key, value in random_forest.get_params().items():
        f.write(f'{key}: {value}\n')

    f.write('\nLightGBM Parameters:\n')
    for key, value in lightgbm.get_params().items():
        f.write(f'{key}: {value}\n')