### Engineers' Salary Prediction Challenge

In [None]:
import os.path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score

#### Wczytanie danych

In [None]:
# Read the train and test splits from the local 'data' directory.
train, test = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Preview the first rows of the training split.
train.head()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the training split.
train.info()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the test split.
test.info()

In [None]:
def compare_train_test(column, train_df=None, test_df=None):
    """Compare how often each value of ``column`` occurs in train and in test.

    Args:
        column: Name of the column to compare; must exist in both frames.
        train_df: Frame counted into ``train_count``. Defaults to the
            notebook-level ``train`` frame.
        test_df: Frame counted into ``test_count``. Defaults to the
            notebook-level ``test`` frame.

    Returns:
        A DataFrame with the columns ``column``, ``test_count`` and
        ``train_count``. The merge is an outer join, so a value present in
        only one frame gets NaN as its count for the other frame.
    """
    train_df = train if train_df is None else train_df
    test_df = test if test_df is None else test_df

    # rename_axis pins the name of the key column: pandas < 2.0 leaves the
    # value_counts index unnamed, so reset_index would call it 'index' and
    # the merge on `column` below would fail.
    train_values = (
        train_df[column].value_counts().rename_axis(column).reset_index(name='train_count')
    )
    test_values = (
        test_df[column].value_counts().rename_axis(column).reset_index(name='test_count')
    )
    return pd.merge(test_values, train_values, how='outer', on=column)

# Build the train/test value-count comparison for both categorical columns.
c_job_title, c_job_state = (
    compare_train_test(name) for name in ('job_title', 'job_state')
)

### <center>Braki w kolumnach</center>

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum()

In [None]:
# Number of missing values in each column of the test split.
test.isnull().sum()

### <center>Analiza kolumn job_desc</center>

In [None]:
# Every description column carries the 'job_desc_' prefix.
job_desc_cols = [name for name in train.columns if name.startswith('job_desc_')]

# Collapse them into one 0/1 flag: 1 when the row's job_desc values do not
# sum to zero, 0 otherwise.
train['is_description'] = train[job_desc_cols].sum(axis=1).ne(0).astype(int)
test['is_description'] = test[job_desc_cols].sum(axis=1).ne(0).astype(int)

# Only the flag is kept; the individual description columns are removed.
train.drop(job_desc_cols, axis=1, inplace=True)
test.drop(job_desc_cols, axis=1, inplace=True)

### <center>Miesiąc i rok</center>

In [None]:
# Parse the 'YYYY/MM' posting date, store its month and year as separate
# columns, then remove the date column itself -- first for train ...
train['job_posted_date'] = pd.to_datetime(train['job_posted_date'], format='%Y/%m')
train['month'], train['year'] = (
    train['job_posted_date'].dt.month,
    train['job_posted_date'].dt.year,
)
del train['job_posted_date']

# ... and then the same steps for test.
test['job_posted_date'] = pd.to_datetime(test['job_posted_date'], format='%Y/%m')
test['month'], test['year'] = (
    test['job_posted_date'].dt.month,
    test['job_posted_date'].dt.year,
)
del test['job_posted_date']

### <center>Porównanie wybranych kolumn kategorycznych w train/test</center>

In [None]:
# Display the train/test value-count comparison for job_state.
c_job_state

In [None]:
# Display the train/test value-count comparison for job_title.
c_job_title

### <center>Liczebność każdej kategorii zarobkowej w każdym stanie</center>

In [None]:
# Grouped bar chart: number of training rows per salary category in each state.
salary_types = train['salary_category'].unique()

# Long-format counts, one row per (state, salary category) pair. Pairs that
# never occur are kept with a count of 0, so every state has one bar per
# category and the per-category slices below all have the same length.
show_every_state = (
    train.groupby(['job_state', 'salary_category'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=salary_types, fill_value=0)
    .stack()
    .reset_index(name='Count')
)

width = 0.2
x = np.arange(len(show_every_state['job_state'].unique()))
# Bar centres for the categories: left of, on, and right of each group
# position. The offsets and the colour list are sized for three categories.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(16, 12))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_state.loc[(show_every_state['salary_category'] == salary), 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
# The middle bar is centred on x, so placing the ticks at x centres each
# label under its group (x + width put it under the last bar).
ax.set_xticks(x)
ax.set_xticklabels(show_every_state['job_state'].unique(), rotation=90)
ax.set_xlabel('Stan')
ax.set_ylabel('Liczba ofert')
ax.set_title('Kategoria zarobkowa w stanach')
ax.legend(title='Kategoria wynagrodzenia')

### <center>Liczebność każdej kategorii zarobkowej dla każdej pracy</center>

In [None]:
# Grouped bar chart: number of training rows per salary category for each job title.
salary_types = train['salary_category'].unique()

# Long-format counts, one row per (job title, salary category) pair, with
# missing pairs filled with 0. NOTE: the variable keeps the name used by the
# per-state chart, but here it holds per-job-title counts.
show_every_state = (
    train.groupby(['job_title', 'salary_category'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=salary_types, fill_value=0)
    .stack()
    .reset_index(name='Count')
)

width = 0.2
x = np.arange(len(show_every_state['job_title'].unique()))
# Bar centres for the categories: left of, on, and right of each group
# position. The offsets and the colour list are sized for three categories.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(16, 12))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_state.loc[(show_every_state['salary_category'] == salary), 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
# The middle bar is centred on x, so placing the ticks at x centres each
# label under its group (x + width put it under the last bar).
ax.set_xticks(x)
ax.set_xticklabels(show_every_state['job_title'].unique(), rotation=90)
# The x axis lists job titles here, not states.
ax.set_xlabel('Stanowisko')
ax.set_ylabel('Liczba ofert')
ax.set_title('Kategorie zarobkowe dla każdej oferty pracy')
ax.legend(title='Kategoria wynagrodzenia')

#### Podział kategorii zarobkowych względem kolumn feature

In [None]:
# Names of all columns whose label starts with 'feature_'.
feature_columns = list(filter(lambda name: name.startswith('feature_'), train.columns))

# Summary (dtypes, non-null counts) restricted to those columns.
train[feature_columns].info()

In [None]:
# Cross-tabulate salary category against the feature_1 labels before they are
# replaced by integer codes.
grouped = train.groupby(['salary_category', 'feature_1']).size().unstack(fill_value=0)

# Encode feature_1 as integers in BOTH splits with one shared encoder.
# Once train['feature_1'] is numeric it no longer appears among train's object
# columns, so the generic object-column encoding further down never reaches
# test['feature_1']; it has to be encoded here. Fitting on the labels of both
# splits also means a label that occurs only in test can be transformed.
feature_1_encoder = LabelEncoder().fit(
    pd.concat([train['feature_1'], test['feature_1']], ignore_index=True)
)
train['feature_1'] = feature_1_encoder.transform(train['feature_1'])
test['feature_1'] = feature_1_encoder.transform(test['feature_1'])

grouped

#### Macierz korelacji feature z salary_category

In [None]:
# Keep the original salary labels: they are transformed again later to build
# the target vector.
salary_category = train['salary_category']
salary_category_encoder = LabelEncoder().fit(salary_category)
train['salary_category'] = salary_category_encoder.transform(train['salary_category'])

# Correlate the feature columns together with the encoded target. The column
# list is built as a new list instead of appending to and then removing from
# feature_columns, so feature_columns stays intact even if plotting fails.
plt.figure(figsize=(16, 12))
sns.heatmap(
    data=train[feature_columns + ['salary_category']].corr(),
    annot=True,
    cmap='YlGnBu',
    fmt='.2f',
)

In [None]:
# Integer-encode every column that is still of object dtype in train.
object_columns = train.select_dtypes(include=['object']).columns

for column in object_columns:
    # One encoder per column, fitted on the values of both splits, so a given
    # category maps to the same integer in train and in test. Separate
    # encoders fitted per split would assign codes independently, making the
    # test codes inconsistent with the ones the model is trained on.
    encoder = LabelEncoder().fit(
        pd.concat([train[column], test[column]], ignore_index=True)
    )
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])

### <center>Proces nauki, walidacji, testowania</center>

In [None]:
# 'obs' is removed from both splits before modelling.
train.drop(columns=['obs'], inplace=True)
test.drop(columns=['obs'], inplace=True)

# Features are everything except the target; the target is re-encoded from the
# original salary labels saved before the column was overwritten.
X = train.drop(columns=['salary_category'])
y = salary_category_encoder.transform(salary_category)

# Hold out 30% of the training data for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Both parts take their column layout from X, the frame the model is trained
# on. Using test.columns for X_test would tie the validation features to a
# different frame and add or reorder columns whenever the two differ.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [None]:
# Fit a LightGBM classifier with its default parameters on the training part.
lgb_model = lgb.LGBMClassifier().fit(X_train, y_train)

# Report per-class metrics on the held-out part. Labels and names both come
# from the fitted encoder (codes 0..n-1 and their original strings), so their
# counts always match even if a class is absent from y_train or y_test.
print(
    classification_report(
        y_test,
        lgb_model.predict(X_test),
        labels=np.arange(len(salary_category_encoder.classes_)),
        target_names=salary_category_encoder.classes_,
    )
)

In [None]:
# Pair each feature name with the importance reported by the fitted model,
# then order the table from most to least important.
importance = pd.DataFrame(
    {'names': lgb_model.feature_name_, 'importance': lgb_model.feature_importances_}
)
importance = importance.sort_values('importance', ascending=False)

importance