### Engineers' Salary Prediction Challenge

In [None]:
import os.path
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

#### Wczytanie danych

In [None]:
# Read the train/test CSV files from the local ``data`` directory.
train_path = os.path.join('data', 'train.csv')
test_path = os.path.join('data', 'test.csv')
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Cell output: first rows of the training frame.
train.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the training frame.
train.info()

In [None]:
# Print the column summary (dtypes, non-null counts) of the test frame.
test.info()

In [None]:
def compare_train_test(column, train_df=None, test_df=None):
    """Tabulate how often each value of ``column`` occurs in test vs. train.

    Args:
        column: Name of the column whose values are counted.
        train_df: Frame used for the ``train_count`` side. Defaults to the
            notebook-level ``train`` frame.
        test_df: Frame used for the ``test_count`` side. Defaults to the
            notebook-level ``test`` frame.

    Returns:
        A DataFrame with the columns ``[column, 'test_count', 'train_count']``
        produced by an outer merge, so a value present in only one of the
        frames has NaN in the other frame's count column.
    """
    train_df = train if train_df is None else train_df
    test_df = test if test_df is None else test_df

    def _counts(frame, count_name):
        # ``rename_axis`` pins the name of the key column explicitly; without
        # it the name produced by ``reset_index`` depends on the pandas
        # version and the merge on ``column`` can fail.
        counts = frame[column].value_counts().rename_axis(column)
        return counts.reset_index(name=count_name)

    return pd.merge(
        _counts(test_df, 'test_count'),
        _counts(train_df, 'train_count'),
        how='outer',
        on=column,
    )

# Train/test value-count tables for the two categorical columns shown later.
c_job_title, c_job_state = (
    compare_train_test(column_name) for column_name in ('job_title', 'job_state')
)

### <center>Braki w kolumnach</center>

In [None]:
# Missing values per training column, largest count first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Missing values per test column, largest count first.
test.isna().sum().sort_values(ascending=False)

### <center>Analiza kolumn job_desc</center>

In [None]:
# Columns whose name starts with ``job_desc_`` (taken from the training frame).
job_desc_cols = [name for name in train.columns if name.startswith('job_desc_')]

# Flag (1/0) rows whose ``job_desc_*`` values sum to something other than zero.
for frame in (train, test):
    row_total = frame[job_desc_cols].sum(axis=1)
    frame['is_description'] = row_total.ne(0).astype(int)

### <center>Miesiąc i rok</center>

In [None]:
# Split the ``YYYY/MM`` posting date into numeric month and year columns and
# remove the original column; applied identically to both frames.
for frame in (train, test):
    posted = pd.to_datetime(frame['job_posted_date'], format='%Y/%m')
    frame['month'] = posted.dt.month
    frame['year'] = posted.dt.year
    frame.drop(columns=['job_posted_date'], inplace=True)

### <center>Porównanie wybranych kolumn kategorycznych w train/test</center>

#### <center>Stan, w którym jest oferta pracy ("job_state")</center>

In [None]:
# Cell output: train/test value counts for ``job_state``.
c_job_state

#### Zmiana stanów na regiony USA

In [None]:
# State codes grouped into the three coarse regions used as a feature.
west_states = [
    'AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT',
    'NV', 'NM', 'OR', 'UT', 'WA', 'WY',
]

central_states = [
    'AR', 'IA', 'IL', 'IN', 'KS', 'KY', 'LA', 'MI', 'MN',
    'MO', 'ND', 'NE', 'OH', 'OK', 'SD', 'TX', 'WI',
]

east_states = [
    'AL', 'CT', 'DE', 'FL', 'GA', 'MA', 'MD', 'ME', 'MS', 'NC', 'NH',
    'NJ', 'NY', 'PA', 'RI', 'SC', 'TN', 'VA', 'VT', 'WV', 'DC',
]

# (region label, member states) pairs, checked in this order.
_REGION_TABLE = (
    ('West', west_states),
    ('Central', central_states),
    ('East', east_states),
)


def map_state_to_region(state):
    """Return the region label for a state code.

    Gives 'West', 'Central' or 'East' when ``state`` is listed in the
    corresponding list above, and 'Other' for any other value.
    """
    for region, members in _REGION_TABLE:
        if state in members:
            return region
    return 'Other'

# Derive the coarse ``job_region`` column from ``job_state`` in both frames.
for frame in (train, test):
    frame['job_region'] = frame['job_state'].apply(map_state_to_region)

#### <center>WYKRES: Region USA a wysokość pensji</center>

In [None]:
# Grouped bar chart: number of offers per region, one bar per salary category.
salary_types = train['salary_category'].unique()

# Long table with one row per (region, salary category); combinations that do
# not occur are filled with 0 so every region has a value for every category.
show_every_region = (
    train.groupby(['job_region', 'salary_category'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=salary_types, fill_value=0)
    .stack()
    .reset_index(name='Count')
)

region_labels = show_every_region['job_region'].unique()
width = 0.2
x = np.arange(len(region_labels))
# Bar offsets are symmetric around each group position for any number of
# categories (for three categories: x - width, x, x + width).
my_xticks = [
    x + (i - (len(salary_types) - 1) / 2) * width
    for i in range(len(salary_types))
]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(16, 12))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_region.loc[show_every_region['salary_category'] == salary, 'Count'],
        width=width,
        label=salary,
        # Cycle through the palette so more than three categories do not fail.
        color=colors[i % len(colors)],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
# Ticks go at the group centre; the previous ``x + width`` placed every label
# under the last bar of its group.
ax.set_xticks(x)
ax.set_xticklabels(region_labels, rotation=90)
ax.set_xlabel('Region')
ax.set_ylabel('Liczba ofert')
# The chart is grouped by region, so the title now says regions, not states.
ax.set_title('Kategoria zarobkowa w regionach')
ax.legend(title='Kategoria wynagrodzenia')

#### Kodowanie one-hot "job_region"

In [None]:
# One-hot encode ``job_region``. The encoder is fitted on train only;
# ``handle_unknown='ignore'`` makes a region that appears only in test encode
# as an all-zero row instead of raising during ``transform``.
job_region_one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
job_region_one_hot.fit(train[['job_region']])
region_columns = job_region_one_hot.categories_[0]

# Build the encoded frames on the source frame's index so the column-wise
# concat stays row-aligned even if the index is not the default RangeIndex.
train = pd.concat(
    [
        train,
        pd.DataFrame(
            job_region_one_hot.transform(train[['job_region']]),
            columns=region_columns,
            index=train.index,
        ),
    ],
    axis=1,
)
test = pd.concat(
    [
        test,
        pd.DataFrame(
            job_region_one_hot.transform(test[['job_region']]),
            columns=region_columns,
            index=test.index,
        ),
    ],
    axis=1,
)

# The raw state and region columns are replaced by the one-hot columns.
train.drop(columns=['job_state', 'job_region'], inplace=True)
test.drop(columns=['job_state', 'job_region'], inplace=True)

#### <center>Stanowisko pracy "job_title"</center>

In [None]:
# Cell output: train/test value counts for ``job_title``.
c_job_title

#### <center>WYKRES: Stanowisko pracy a wysokość pensji</center>

In [None]:
# Grouped bar chart: number of offers per job title, one bar per salary category.
salary_types = train['salary_category'].unique()

# Long table with one row per (job title, salary category); combinations that
# do not occur are filled with 0 so every title has a value for every category.
show_every_state = (
    train.groupby(['job_title', 'salary_category'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=salary_types, fill_value=0)
    .stack()
    .reset_index(name='Count')
)

job_title_labels = show_every_state['job_title'].unique()
width = 0.2
x = np.arange(len(job_title_labels))
# Bar offsets are symmetric around each group position for any number of
# categories (for three categories: x - width, x, x + width).
my_xticks = [
    x + (i - (len(salary_types) - 1) / 2) * width
    for i in range(len(salary_types))
]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(16, 12))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_state.loc[show_every_state['salary_category'] == salary, 'Count'],
        width=width,
        label=salary,
        # Cycle through the palette so more than three categories do not fail.
        color=colors[i % len(colors)],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
# Ticks go at the group centre; the previous ``x + width`` placed every label
# under the last bar of its group.
ax.set_xticks(x)
ax.set_xticklabels(job_title_labels, rotation=90)
# The x axis shows job titles; the previous label 'Stan' (state) was a
# leftover from the region chart.
ax.set_xlabel('Stanowisko')
ax.set_ylabel('Liczba ofert')
ax.set_title('Kategorie zarobkowe dla każdej oferty pracy')
ax.legend(title='Kategoria wynagrodzenia')

#### Kodowanie one-hot "job_title"

In [None]:
# One-hot encoding of ``job_title`` is currently disabled (commented out), so
# the column is dropped below without contributing any features.
# job_title_one_hot = OneHotEncoder(sparse_output=False)
# job_title_one_hot.fit(train[['job_title']])
#
# train = pd.concat([train, pd.DataFrame(job_title_one_hot.transform(train[['job_title']]), columns=job_title_one_hot.categories_[0])], axis=1)
# test = pd.concat([test, pd.DataFrame(job_title_one_hot.transform(test[['job_title']]), columns=job_title_one_hot.categories_[0])], axis=1)

# Remove the raw ``job_title`` column from both frames.
train.drop(columns=['job_title'], inplace=True)
test.drop(columns=['job_title'], inplace=True)

#### <center>Kolumny "feature_"</center>

In [None]:
# Names of all training columns that start with ``feature_``.
is_feature_column = train.columns.str.startswith('feature_')
feature_columns = train.columns[is_feature_column].tolist()

# Print dtypes and non-null counts for just those columns.
train[feature_columns].info()

In [None]:
# Count rows per (salary category, feature_1 value) while feature_1 still
# holds its original labels.
grouped_feature_1 = (
    train.groupby(['salary_category', 'feature_1'])
    .size()
    .unstack(fill_value=0)
)

# Replace feature_1 labels with integer codes; the encoder is fitted on train.
# NOTE(review): per the scikit-learn docs, LabelEncoder.transform raises on
# labels not seen during fit — confirm test has no extra feature_1 values.
feature_1_encoder = LabelEncoder()
feature_1_encoder.fit(train['feature_1'])
for frame in (train, test):
    frame['feature_1'] = feature_1_encoder.transform(frame['feature_1'])

# Cell output: the count table computed above.
grouped_feature_1

W kolumnie feature_10 najwięcej brakujących wartości w zbiorze treningowym przypada na okres 06–08/2024.

Tak samo jest w zbiorze testowym.

In [None]:
# Month/year of the training rows where feature_10 is missing.
train_feature_10_missing = train['feature_10'].isna()
train_grouped_feature_10 = train.loc[train_feature_10_missing, ['month', 'year']]

# Cell output: the three (year, month) pairs with the most missing values.
train_missing_per_period = train_grouped_feature_10.groupby(['year', 'month']).size()
train_missing_per_period.sort_values(ascending=False).head(n=3)

In [None]:
# Month/year of the test rows where feature_10 is missing.
test_feature_10_missing = test['feature_10'].isna()
test_grouped_feature_10 = test.loc[test_feature_10_missing, ['month', 'year']]

# Cell output: the three (year, month) pairs with the most missing values.
test_missing_per_period = test_grouped_feature_10.groupby(['year', 'month']).size()
test_missing_per_period.sort_values(ascending=False).head(n=3)

#### Kodowanie LabelEncoder na salary_category

In [None]:
# Keep the original labels in ``salary_category``; the training column itself
# is replaced by the integer codes produced by the fitted LabelEncoder.
salary_category = train['salary_category']
salary_category_encoder = LabelEncoder()
train['salary_category'] = salary_category_encoder.fit_transform(salary_category)

#### Wartości korelacji dla kolumny salary_category

In [None]:
# ``obs`` is removed from both frames; the test values are kept in
# ``test_obs`` for building the submission files.
train.drop(columns=['obs'], inplace=True)
test_obs = test.pop('obs')

# Correlation of every training column with the encoded target.
corr = train.corr()[['salary_category']]

# Columns whose correlation with the target lies strictly inside (-0.02, 0.02).
# NOTE(review): the target is label-encoded, so these are correlations with
# arbitrary integer codes — confirm this filter is intended.
is_weak = corr['salary_category'].abs() < 0.02
columns_to_drop = corr.index[is_weak.to_numpy()].values

for frame in (train, test):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Print the column summary of the training frame after the column drops above.
train.info()

### <center>Proces nauki, walidacji, testowania</center>

#### LightGBM

In [None]:
# Feature matrix (all columns but the target) and integer-encoded target.
X = train.drop(columns=['salary_category'])
y = salary_category_encoder.transform(salary_category)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Re-wrap both parts as DataFrames carrying the feature column names.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

#### Szukanie optymalnych hiperparametrów

In [None]:
# def define_lightgbm(trial):
#     params = {
#         'objective': 'multiclass',
#         'num_class': len(np.unique(y_train)),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 10, 150),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#         'subsample': trial.suggest_float('subsample', 0.1, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0)
#     }
#     return lgb.LGBMClassifier(**params)
#
# def objective_random_forest(trial):
#     lgb_model = define_lightgbm(trial)
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     scores = cross_val_score(lgb_model, X_train, y_train, cv=skf, n_jobs=-1, scoring='accuracy')
#     return scores.mean()
#
# study_lgb = optuna.create_study(direction='maximize', study_name='EngineerSalaryLightGBM', sampler=optuna.samplers.TPESampler())
# study_lgb.optimize(objective_random_forest, n_trials=100)

In [None]:
# LightGBM classifier with default parameters, fitted on the training part.
lightgbm = lgb.LGBMClassifier()
lightgbm.fit(X_train, y_train)

# Per-class report on the hold-out part, labelled with the original class names.
lgb_holdout_pred = lightgbm.predict(X_test)
lgb_class_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, lgb_holdout_pred, target_names=lgb_class_names))

In [None]:
# lightgbm = define_lightgbm(study_lgb.best_trial).fit(X_train, y_train)
# 
# print(classification_report(y_test, lightgbm.predict(X_test), target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Istotność cech dla LightGBM

In [None]:
# LightGBM feature importances, most important first.
importance = pd.DataFrame(
    {
        'names': lightgbm.feature_name_,
        'importance': lightgbm.feature_importances_,
    }
)
importance = importance.sort_values(by='importance', ascending=False)

# Cell output: the sorted table.
importance

#### Las losowy

In [None]:
# Random forest with default hyper-parameters. The seed is fixed (same value
# as the train/validation split) so the report below, the saved parameters and
# the submission can be reproduced between notebook runs.
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Per-class report on the hold-out part, labelled with the original class names.
print(classification_report(y_test, random_forest.predict(X_test), target_names=salary_category_encoder.inverse_transform(np.unique(y_train))))

#### Istotność cech dla lasu losowego

In [None]:
# Random forest feature importances, most important first.
random_forest_importance = pd.DataFrame(
    {
        'feature': random_forest.feature_names_in_,
        'importance': random_forest.feature_importances_,
    }
)
random_forest_importance = random_forest_importance.sort_values(
    by='importance', ascending=False
)

# Cell output: the sorted table.
random_forest_importance

#### Przygotowanie submission

In [None]:
# Class probabilities of both models on the hold-out part.
rf_pred_proba = random_forest.predict_proba(X_test)
lgb_pred_proba = lightgbm.predict_proba(X_test)

# Ensemble = plain average of the two probability matrices.
avg = (rf_pred_proba + lgb_pred_proba) / 2

# Report for the ensemble: predicted class is the column with the highest
# averaged probability.
ensemble_holdout_pred = np.argmax(avg, axis=1)
ensemble_class_names = salary_category_encoder.inverse_transform(np.unique(y_train))
print(classification_report(y_test, ensemble_holdout_pred, target_names=ensemble_class_names))

#### Ponowne dopasowanie modeli na pełnym zbiorze (X, y)

In [None]:
# Refit both models on the full labelled data (X, y) rather than only on the
# training part of the hold-out split.
lightgbm.fit(X, y)
random_forest.fit(X, y)

### <center>Ostateczne predykcje</center>

In [None]:
# Select the test columns in exactly the order the models were fitted on, so a
# different column order in ``test`` cannot shift features; a column missing
# from ``test`` fails here with a KeyError that names it.
test_features = test[X.columns]

# Hard class predictions of each model.
final_rf_pred = random_forest.predict(test_features)
final_lgb_pred = lightgbm.predict(test_features)

# Class probabilities of each model and their plain average (the ensemble).
final_rf_pred_proba = random_forest.predict_proba(test_features)
final_lgb_pred_proba = lightgbm.predict_proba(test_features)

final_avg = (final_rf_pred_proba + final_lgb_pred_proba) / 2

# Cell output: the averaged probability matrix.
final_avg

In [None]:
def prepare_submission(predictions, name, obs=None, encoder=None, output_dir='submissions'):
    """Write a submission CSV with decoded salary categories.

    Args:
        predictions: Encoded class predictions, one per row of ``obs``.
        name: File name without extension; the file is ``<name>.csv``.
        obs: Observation identifiers for the ``obs`` column. Defaults to the
            notebook-level ``test_obs``.
        encoder: Object whose ``inverse_transform`` maps encoded predictions
            back to labels. Defaults to the notebook-level
            ``salary_category_encoder``.
        output_dir: Directory the file is written to; created if missing.

    Returns:
        None. The CSV has the columns ``obs`` and ``salary_category`` and no
        index column.
    """
    obs = test_obs if obs is None else obs
    encoder = salary_category_encoder if encoder is None else encoder

    # Without this, ``to_csv`` fails when the output directory does not exist.
    os.makedirs(output_dir, exist_ok=True)

    submission = pd.DataFrame(
        data={
            'obs': obs,
            'salary_category': encoder.inverse_transform(predictions),
        }
    )
    submission.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)

# One submission file per model plus one for the averaged-probability ensemble
# (its class is the column with the highest averaged probability).
for submission_preds, submission_name in (
    (final_rf_pred, 'random_forest'),
    (final_lgb_pred, 'lgb'),
    (np.argmax(final_avg, axis=1), 'average_lgb_random_forest'),
):
    prepare_submission(submission_preds, submission_name)

### Informacje o modelach

In [None]:
# Save the hyper-parameters of both models to a timestamped text file.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Without this, ``open`` fails when the ``models`` directory does not exist.
os.makedirs('models', exist_ok=True)

# An explicit encoding keeps the file identical across platforms.
with open(os.path.join('models', f'model_info_{timestamp}.txt'), 'w', encoding='utf-8') as f:
    # NOTE: 0.2 is written as a literal here; keep it in sync with the
    # ``test_size`` passed to ``train_test_split``.
    f.write(f"Test size:\n{0.2}\n\n")

    f.write("Random Forest Parameters:\n")
    f.writelines(f"{key}: {value}\n" for key, value in random_forest.get_params().items())

    f.write("\nLightGBM Parameters:\n")
    f.writelines(f"{key}: {value}\n" for key, value in lightgbm.get_params().items())