### Engineers' Salary Prediction Challenge

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA

#### Wczytanie danych

In [None]:
# Read the train and test splits from the local data directory.
data_dir = 'data'
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Preview the first rows of the training frame.
train.head()

In [None]:
# Print dtypes and non-null counts for every column of the training frame.
train.info()

In [None]:
# Print dtypes and non-null counts for every column of the test frame.
test.info()

In [None]:
def compare_train_test(column, train_df=None, test_df=None):
    """Compare how often each value of ``column`` occurs in test and in train.

    Parameters
    ----------
    column : str
        Name of a column present in both frames.
    train_df, test_df : pandas.DataFrame, optional
        Frames to compare. When omitted, the notebook-level ``train`` and
        ``test`` frames are used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``column``, ``test_count`` and ``train_count``. The join is
        an outer one, so a value that occurs in only one frame has NaN in
        the other frame's count column.
    """
    train_frame = train if train_df is None else train_df
    test_frame = test if test_df is None else test_df

    # rename_axis pins the name of the value column before reset_index.
    # Without it the column is only called `column` on pandas >= 2.0
    # (older releases produce 'index'), and the merge below fails.
    train_values = (
        train_frame[column].value_counts().rename_axis(column).reset_index(name='train_count')
    )
    test_values = (
        test_frame[column].value_counts().rename_axis(column).reset_index(name='test_count')
    )
    return pd.merge(test_values, train_values, how='outer', on=column)

# Value-count comparisons for the two categorical columns; both tables are
# displayed in later cells, before the columns themselves are dropped.
c_job_state = compare_train_test('job_state')
c_job_title = compare_train_test('job_title')

### <center>Braki w kolumnach</center>

In [None]:
# Missing values per training column, most incomplete columns first.
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Missing values per test column, most incomplete columns first.
test.isnull().sum().sort_values(ascending=False)

### <center>Analiza kolumn job_desc</center>

In [None]:
# All columns whose name starts with 'job_desc_' (taken from train and
# reused for test).
job_desc_columns = [name for name in train.columns if name.startswith('job_desc_')]

# Flag rows whose job_desc_* values do not sum to zero: 1 = has a
# description, 0 = the row sums to exactly zero.
for frame in (train, test):
    row_totals = frame[job_desc_columns].sum(axis=1)
    frame['has_description'] = row_totals.ne(0).astype(int)

#### PCA dla kolumn job_desc

In [None]:
# Compress the job_desc_* columns into a handful of principal components.
n_components = 6
pca_columns = [f'pca_job_desc_{i}' for i in range(n_components)]

# Fit on train only; the same projection is then applied to test.
job_desc_pca = PCA(n_components=n_components, random_state=42).fit(train[job_desc_columns])

train_decomposed = job_desc_pca.transform(train[job_desc_columns])
test_decomposed = job_desc_pca.transform(test[job_desc_columns])

# The raw job_desc_* columns are replaced by their PCA projection.
train.drop(columns=job_desc_columns, inplace=True)
test.drop(columns=job_desc_columns, inplace=True)

# pd.concat(axis=1) aligns on the index. Reusing the source frame's index
# keeps every component row attached to its own record even when the frame
# does not carry a plain 0..n-1 RangeIndex.
train = pd.concat(
    [train, pd.DataFrame(data=train_decomposed, columns=pca_columns, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test, pd.DataFrame(data=test_decomposed, columns=pca_columns, index=test.index)],
    axis=1,
)

### <center>Miesiąc i rok</center>

In [None]:
# Split job_posted_date ('%Y/%m') into numeric month and year columns, then
# drop the original column. The same steps run on train and test.
for frame in (train, test):
    frame['job_posted_date'] = pd.to_datetime(frame['job_posted_date'], format='%Y/%m')
    posted = frame['job_posted_date'].dt
    frame['month'] = posted.month
    frame['year'] = posted.year
    frame.drop(columns=['job_posted_date'], inplace=True)

### <center>Porównanie wybranych kolumn kategorycznych w train/test</center>

#### <center>Stan, w którym jest oferta pracy "job_state"</center>

In [None]:
# Show the train/test value counts of job_state computed earlier.
c_job_state

#### <center>WYKRES: Stan USA a wysokość pensji</center>

In [None]:
# Number of offers for every (state, salary category) pair, in long format.
salary_types = train['salary_category'].unique()

state_salary_sizes = train.groupby(['job_state', 'salary_category']).size()
state_salary_table = state_salary_sizes.unstack(fill_value=0).reindex(columns=salary_types, fill_value=0)
show_every_state = state_salary_table.stack().reset_index(name='Count')

state_labels = show_every_state['job_state'].unique()

width = 0.2
x = np.arange(len(state_labels))
# One bar offset per salary category (left of, on, and right of each tick).
# Only three offsets and three colors are defined.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(10, 8))

for i, salary in enumerate(salary_types):
    rows_for_salary = show_every_state['salary_category'] == salary
    ax.bar(
        x=my_xticks[i],
        height=show_every_state.loc[rows_for_salary, 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.set_title('Kategoria zarobkowa w stanach', fontsize=16)
ax.set_xlabel('Stan', fontsize=16)
ax.set_ylabel('Liczba ofert', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(state_labels, rotation=90)
ax.grid(True)
ax.legend(title='Kategoria wynagrodzenia')

#### Zmiana stanów na regiony USA

In [None]:
# State abbreviations grouped into the three regions used by the notebook.
west_states = [
    'AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT',
    'NV', 'NM', 'OR', 'UT', 'WA', 'WY',
]

central_states = [
    'AR', 'IA', 'IL', 'IN', 'KS', 'KY',
    'LA', 'MI', 'MN', 'MO', 'ND', 'NE',
    'OH', 'OK', 'SD', 'TX', 'WI',
]

east_states = [
    'AL', 'CT', 'DE', 'FL', 'GA', 'MA', 'MD',
    'ME', 'MS', 'NC', 'NH', 'NJ', 'NY', 'PA',
    'RI', 'SC', 'TN', 'VA', 'VT', 'WV', 'DC',
]

# (region label, member states) pairs, checked in this order.
_REGION_MEMBERS = (
    ('West', west_states),
    ('Central', central_states),
    ('East', east_states),
)


def map_state_to_region(state):
    """Return 'West', 'Central' or 'East' for a state abbreviation.

    Any value found in none of the three lists (including a missing value)
    maps to the string 'None'.
    """
    for region, members in _REGION_MEMBERS:
        if state in members:
            return region
    return 'None'

# Derive the coarse job_region column from job_state in both frames.
for frame in (train, test):
    frame['job_region'] = frame['job_state'].apply(map_state_to_region)

#### <center>WYKRES: Region USA, a wysokość pensji</center>

In [None]:
# Number of offers for every (region, salary category) pair, in long format.
salary_types = train['salary_category'].unique()

region_salary_sizes = train.groupby(['job_region', 'salary_category']).size()
region_salary_table = region_salary_sizes.unstack(fill_value=0).reindex(columns=salary_types, fill_value=0)
show_every_region = region_salary_table.stack().reset_index(name='Count')

region_labels = show_every_region['job_region'].unique()

width = 0.2
x = np.arange(len(region_labels))
# One bar offset per salary category (left of, on, and right of each tick).
# Only three offsets and three colors are defined.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(10, 8))

for i, salary in enumerate(salary_types):
    rows_for_salary = show_every_region['salary_category'] == salary
    ax.bar(
        x=my_xticks[i],
        height=show_every_region.loc[rows_for_salary, 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.set_title('Kategoria zarobkowa w regionach', fontsize=16)
ax.set_xlabel('Region', fontsize=16)
ax.set_ylabel('Liczba ofert', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(region_labels, rotation=90)
ax.grid(True)
ax.legend(title='Kategoria wynagrodzenia')

#### Co zrobić z job_state?

Testowano:
- usunięcie -> tak sobie
- zmiana na "job_region" i kodowanie one-hot -> na razie najlepiej

In [None]:
# One-hot encode job_region. The encoder is fitted on train only, so
# handle_unknown='ignore' makes a region value that appears only in test
# encode as an all-zero row instead of raising in transform().
job_region_one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
job_region_one_hot.fit(train[['job_region']])

# The new columns are named after the fitted region labels.
region_columns = job_region_one_hot.categories_[0]

# pd.concat(axis=1) aligns on the index, so the encoded frame reuses the
# source frame's index to keep each row attached to its own record.
train = pd.concat(
    [
        train,
        pd.DataFrame(
            job_region_one_hot.transform(train[['job_region']]),
            columns=region_columns,
            index=train.index,
        ),
    ],
    axis=1,
)
test = pd.concat(
    [
        test,
        pd.DataFrame(
            job_region_one_hot.transform(test[['job_region']]),
            columns=region_columns,
            index=test.index,
        ),
    ],
    axis=1,
)

# The raw state and the intermediate region label are no longer needed.
train.drop(columns=['job_state', 'job_region'], inplace=True)
test.drop(columns=['job_state', 'job_region'], inplace=True)

#### <center>Stanowisko pracy "job_title"</center>

In [None]:
# Show the train/test value counts of job_title computed earlier.
c_job_title

In [None]:
# Number of missing job_title values as a (train, test) pair.
(train['job_title'].isna().sum(), test['job_title'].isna().sum())

#### <center>WYKRES: Stanowisko pracy a wysokość pensji</center>

In [None]:
# Number of offers for every (job title, salary category) pair, long format.
salary_types = train['salary_category'].unique()

show_every_title = train.groupby(['job_title', 'salary_category']).size().unstack(fill_value=0).reindex(columns=salary_types, fill_value=0).stack().reset_index(name='Count')

width = 0.2
x = np.arange(len(show_every_title['job_title'].unique()))
# One bar offset per salary category (left of, on, and right of each tick).
# Only three offsets and three colors are defined.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(16, 12))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_title.loc[(show_every_title['salary_category'] == salary), 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
ax.set_xticks(x)
ax.set_xticklabels(show_every_title['job_title'].unique(), rotation=45)
# The x axis shows job titles here. The previous label 'Stan' (state) was
# left over from the per-state chart.
ax.set_xlabel('Stanowisko', fontsize=12)
ax.set_ylabel('Liczba ofert', fontsize=12)
ax.set_title('Kategorie zarobkowe dla każdej oferty pracy')
ax.legend(title='Kategoria wynagrodzenia')

#### Co zrobić z job_title?

Testowane:
- kodowanie one-hot -> nie poprawia
- usunięcie -> na razie najlepiej
- pogrupować tak, żeby była przewaga jakichś klas z 'salary_category'. Stworzenie 'job_title_earning' i kodowanie one-hot -> nie poprawia
- zmiana na binary 'is_others_job_title' -> nie testowano

In [None]:
# Replace job_title with a binary flag: 1 when the title is 'Others', else 0
# (a missing title also yields 0).
#
# Each frame derives the flag from its OWN job_title column. The previous
# code indexed `test` with a mask built from `train['job_title']`, so test
# rows were flagged according to the train row with the same index.
# The flag is stored as int, matching `has_description`.
for frame in (train, test):
    frame['is_others_job_title'] = (frame['job_title'] == 'Others').astype(int)
    frame.drop(columns=['job_title'], inplace=True)

#### <center>WYKRES: Rodzaj stanowiska pracy a wysokość pensji</center>

In [None]:
# Number of offers for every (is_others_job_title, salary category) pair,
# in long format.
salary_types = train['salary_category'].unique()

show_every_title = train.groupby(['is_others_job_title', 'salary_category']).size().unstack(fill_value=0).reindex(columns=salary_types, fill_value=0).stack().reset_index(name='Count')

width = 0.2
x = np.arange(len(show_every_title['is_others_job_title'].unique()))
# One bar offset per salary category (left of, on, and right of each tick).
# Only three offsets and three colors are defined.
my_xticks = [x - width, x, x + width]
colors = ['gold', 'silver', 'brown']

fig, ax = plt.subplots(figsize=(10, 8))

for i, salary in enumerate(salary_types):
    ax.bar(
        x=my_xticks[i],
        height=show_every_title.loc[(show_every_title['salary_category'] == salary), 'Count'],
        width=width,
        label=salary,
        color=colors[i],
        edgecolor='black',
        alpha=1,
    )

ax.grid(True)
ax.set_xticks(x)
ax.set_xticklabels(show_every_title['is_others_job_title'].unique(), rotation=45)
ax.set_xlabel('is_others_job_title', fontsize=16)
ax.set_ylabel('Liczba ofert', fontsize=16)
ax.set_title('Kategorie zarobkowe dla typu oferty', fontsize=16)
# The legend entries are the salary categories (label=salary above), so the
# legend is titled like the other charts, not after the x-axis variable.
ax.legend(title='Kategoria wynagrodzenia')

#### <center>Kolumny "feature_"</center>

In [None]:
# All training columns whose name starts with 'feature_'.
feature_columns = [name for name in train.columns if name.startswith('feature_')]

# Print dtypes and non-null counts for just those columns.
train.loc[:, feature_columns].info()

#### feature_1

In [None]:
# Cross-tabulate salary_category against feature_1 BEFORE encoding, so the
# table keeps the original feature_1 labels.
feature_1_sizes = train.groupby(['salary_category', 'feature_1']).size()
grouped_feature_1 = feature_1_sizes.unstack(fill_value=0)

# Fit the label encoder on train and apply it to both frames.
feature_1_encoder = LabelEncoder()
feature_1_encoder.fit(train['feature_1'])
for frame in (train, test):
    frame['feature_1'] = feature_1_encoder.transform(frame['feature_1'])

grouped_feature_1

#### feature_2 (bo jest najwyżej w importance dla xgb, lgb i rf)

In [None]:
import seaborn as sns

# Stacked histogram (100 bins, with KDE) of feature_2, colored by salary
# category.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=train,
    x='feature_2',
    hue='salary_category',
    multiple='stack',
    bins=100,
    kde=True,
    palette='Set2',
    ax=hist_ax,
)

hist_ax.set_title('Rozkład feature_2 według kategorii wynagrodzenia', fontsize=14)
hist_ax.set_xlabel('feature_2', fontsize=12)
hist_ax.set_ylabel('Liczba przypadków', fontsize=12)
hist_ax.grid(True, linestyle='--', alpha=0.5)

hist_fig.tight_layout()
plt.show()

#### feature_10 (bo jest najwięcej NaN)

W feature_10 okres 06-08/2024 ma najwięcej brakujących wartości dla zbioru treningowego.

Tak samo jest w zbiorze testowym.

In [None]:
# Posting month/year of the training rows where feature_10 is missing.
train_missing_feature_10 = train['feature_10'].isna()
train_grouped_feature_10 = train.loc[train_missing_feature_10, ['month', 'year']]

# The three (year, month) periods with the most missing feature_10 values.
train_grouped_feature_10.groupby(['year', 'month']).size().sort_values(ascending=False).iloc[:3]

In [None]:
# Posting month/year of the test rows where feature_10 is missing.
test_missing_feature_10 = test['feature_10'].isna()
test_grouped_feature_10 = test.loc[test_missing_feature_10, ['month', 'year']]

# The three (year, month) periods with the most missing feature_10 values.
test_grouped_feature_10.groupby(['year', 'month']).size().sort_values(ascending=False).iloc[:3]

#### Kodowanie LabelEncoder na salary_category

In [None]:
# Encode the target labels as integers. The fitted encoder is kept so the
# labels can be restored with inverse_transform later.
salary_category = train['salary_category']
salary_category_encoder = LabelEncoder()
salary_category_encoder.fit(salary_category)
train['salary_category'] = salary_category_encoder.transform(train['salary_category'])

#### Wartości korelacji dla kolumny salary_category

Usunięcie kolumn mało informatywnych.

Usuń kolumny wysoko skorelowane.

In [None]:
# Set the row identifier aside so it takes no part in feature selection.
# It is re-attached to test at the end of this cell.
test_obs = test['obs']
train.drop(columns=['obs'], inplace=True)
test.drop(columns=['obs'], inplace=True)

# Correlation of every numeric column with the (label-encoded) target.
# numeric_only=True matches drop_high_correlated_columns and keeps corr()
# from raising on pandas >= 2.0 should a non-numeric column remain.
# NOTE(review): LabelEncoder numbers the classes in sorted order, so this
# Pearson correlation treats salary_category as ordinal in that order.
# Confirm that is intended.
corr = train.corr(numeric_only=True)[['salary_category']]

# Columns whose absolute correlation with the target is below 0.02.
# A NaN correlation compares False and is therefore kept.
weakly_correlated = corr['salary_category'].abs() < 0.02
columns_to_drop = corr.index[weakly_correlated].values

# Restore the original string labels before the columns are dropped.
train['salary_category'] = salary_category_encoder.inverse_transform(train['salary_category'])
train.drop(columns=columns_to_drop, inplace=True)
test.drop(columns=columns_to_drop, inplace=True)

# Put the identifier back as the first column of test.
test = pd.concat([test_obs, test], axis=1)

In [None]:
def drop_high_correlated_columns(data=None, threshold=0.9):
    """Return the names of numeric columns that are highly correlated.

    Despite its name the function drops nothing; it only returns the column
    names, and the caller performs the drop.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``train`` frame
        (the original behaviour).
    threshold : float, default 0.9
        A column is reported when its absolute correlation with any column
        placed before it exceeds this value.

    Returns
    -------
    list of str
        Column names, in the order of the correlation matrix.
    """
    frame = train if data is None else data
    matrix = frame.corr(numeric_only=True).abs()
    # Keep only the strict upper triangle, so each pair is examined once and
    # the later column of a correlated pair is the one reported.
    upper_t = matrix.where(np.triu(np.ones(matrix.shape, dtype=np.bool_), k=1))
    return [col for col in upper_t.columns if (upper_t[col] > threshold).any()]


# Remove the redundant (highly inter-correlated) columns from both frames.
high_correlated_columns = drop_high_correlated_columns()
for frame in (train, test):
    frame.drop(columns=high_correlated_columns, inplace=True)

In [None]:
# Preview the training frame after feature selection.
train.head()

In [None]:
# Preview the test frame after feature selection.
test.head()

## Przybliżenie train do test

- ucięcie przedziałów na train, aby zgadzały się z test, na podstawie wyniku testu U Manna-Whitneya -> nie poprawia
- usunięcie tych kolumn -> nie testowano

In [None]:
from scipy.stats import mannwhitneyu


# Every train column except the target is compared against the same column
# in test. The list is built once and used for BOTH the labels and the
# p-values. Previously the labels came from `train.columns[1:]`, which only
# lines up with the p-values when salary_category is the first column.
compared_columns = [column for column in train.columns if column != 'salary_category']

# Two-sided Mann-Whitney U test per column. Missing values are omitted.
statistical_tests = pd.DataFrame(
    data={
        'columns': compared_columns,
        'umann_pvalue': [
            mannwhitneyu(
                train[column],
                test[column],
                nan_policy='omit',
                use_continuity=False,
                method='asymptotic',
                alternative='two-sided',
            ).pvalue
            for column in compared_columns
        ],
    }
)

# Columns whose train and test distributions differ at the 5% level,
# strongest evidence first.
check_for_diff = statistical_tests.loc[statistical_tests['umann_pvalue'] <= 0.05, :].sort_values(by='umann_pvalue')

In [None]:
# Disabled experiment, kept for reference: trim train to the value range
# seen in test for every column flagged by the Mann-Whitney check.
# def cut_ranges(column):
#     test_min, test_max = test[column].min(), test[column].max()
#     return train[(train[column] >= test_min) & (train[column] <= test_max)].reset_index(drop=True)
#
# for column in check_for_diff['columns']:
#     train = cut_ranges(column)
#
# train.info()

# Current approach: drop the flagged columns from both frames.
shifted_columns = check_for_diff['columns']
for frame in (train, test):
    frame.drop(columns=shifted_columns, inplace=True)

#### Zapisz przygotowane dane

In [None]:
# Create the output directory if it does not exist yet. exist_ok=True
# replaces the separate "list the directory, then mkdir" check.
os.makedirs('prepared', exist_ok=True)

# Write the prepared frames without the pandas index column.
train.to_csv(os.path.join('prepared', 'train.csv'), index=False)
test.to_csv(os.path.join('prepared', 'test.csv'), index=False)