# Loading libraries and data

In [9]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold

# Fold count for cross-validation.
# NOTE(review): not referenced anywhere in the visible notebook -- confirm it is still needed.
n_splits = 5
# Seed passed as `random_state` to the LightGBM regressors so fits are repeatable.
SEED = 42

In [10]:
# Folder that holds the competition CSVs. The default is the original author's
# local directory; set the SALARY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'SALARY_DATA_DIR',
    'C:/Users/jeeva/jika/salary-prediction-for-job-postings',
)

# Paths stay plain strings joined with '/', so with the default DATA_DIR they
# are byte-identical to the previously hard-coded values.
TRAIN_PATH = f'{DATA_DIR}/usjobs_train.csv'
TEST_PATH = f'{DATA_DIR}/usjobs_test.csv'
SOLUTION_PATH = f'{DATA_DIR}/submission.csv'

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)
# NOTE(review): df_solution is not referenced anywhere else in the visible
# notebook, and this read fails if submission.csv is absent from DATA_DIR.
# Kept for compatibility -- confirm whether it can be removed.
df_solution = pd.read_csv(SOLUTION_PATH)

# Feature Engineering

In [11]:
import warnings

# Silence pandas PerformanceWarning for the rest of the session.
# NOTE(review): presumably aimed at the fragmentation warning pandas can emit
# when many columns are inserted into a frame one at a time -- confirm it is
# still required before keeping a session-wide filter.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [12]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; its `encode` method is what featureEngineering uses
# to turn the free-text columns into numeric feature vectors.
language_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [13]:
# Columns cast to pandas "category" dtype and declared to LightGBM as
# categorical features. Names are kept verbatim (including the spelling of
# 'Frecuency_Salary'), presumably to match the CSV headers.
category_columns = [
    'Jobs_Group',
    'Profile',
    'Remote',
    'City',
    'State',
    'Frecuency_Salary',
    'Sector_Group',
    'Revenue',
    'Employee',
    'Director',
]

In [14]:
def featureEngineering(df, mode, encoder=None, categorical=None):
    """Build the model-ready feature frame for one data split.

    The free-text columns 'Skills', 'Job', 'Company' and 'Sector' are each
    embedded with ``encoder.encode`` and expanded into numbered columns
    (``Embedding_Skill_1`` ..., ``Embedding_Job_1`` ..., and so on). The
    identifier/text columns are dropped and the categorical columns are cast
    to pandas ``category`` dtype.

    Unlike the previous implementation, the input frame is left untouched and
    the embedding columns are attached with a single ``pd.concat`` instead of
    thousands of one-by-one column inserts.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least 'ID', 'Skills', 'Location', 'Job',
        'Company', 'URL', 'Sector' and the categorical columns (plus
        'Mean_Salary' when ``mode == 'train'``).
    mode : str
        ``'train'`` additionally drops the 'Mean_Salary' target; any other
        value keeps whatever target column may be present.
    encoder : object, optional
        Anything exposing ``encode(list_of_str) -> 2-D numpy array``.
        Defaults to the notebook-level ``language_model``.
    categorical : list of str, optional
        Columns to cast to ``category``. Defaults to the notebook-level
        ``category_columns``.

    Returns
    -------
    pandas.DataFrame
        New frame: the retained original columns (in their original order)
        followed by the Skill, Job, Company and Sector embedding columns.
    """
    # Globals are only looked up when the caller did not inject a replacement.
    encoder = language_model if encoder is None else encoder
    categorical = category_columns if categorical is None else categorical

    # (source column, output prefix, cast to str before encoding).
    # Only 'Sector' is stringified, exactly as before.
    text_sources = (
        ('Skills', 'Embedding_Skill', False),
        ('Job', 'Embedding_Job', False),
        ('Company', 'Embedding_Company', False),
        ('Sector', 'Embedding_Sector', True),
    )

    embedding_frames = []
    for column, prefix, stringify in text_sources:
        texts = df[column].astype(str) if stringify else df[column]
        # Hand the encoder a plain list: positional access inside the encoder
        # then never depends on the frame's index labels.
        vectors = encoder.encode(texts.tolist())
        names = [f'{prefix}_{i + 1}' for i in range(vectors.shape[1])]
        embedding_frames.append(pd.DataFrame(vectors, index=df.index, columns=names))

    dropped = ['ID', 'Skills', 'Location', 'Job', 'Company', 'URL', 'Sector']
    if mode == 'train':
        dropped.append('Mean_Salary')

    # `drop` returns a new frame, so the cast below never touches the caller's df.
    base = df.drop(columns=dropped)
    base[categorical] = base[categorical].astype("category")

    # Attach all embedding columns in one step.
    return pd.concat([base, *embedding_frames], axis=1)

In [15]:
# Targets come from the raw training frame, because the engineered frame has
# 'Mean_Salary' dropped in train mode.
y_train = df_train['Mean_Salary'].to_numpy()

# Engineered feature matrices for the labelled and unlabelled splits.
X_train = featureEngineering(df_train, 'train')
X_test = featureEngineering(df_test, 'test')

# Regressor

In [16]:
# Pseudo-labelling: fit on the labelled rows, predict the test rows, then refit
# on train + test with those predictions standing in as test-set targets.

params = {
    'n_estimators': 1048,
    'num_leaves': 70,
    'subsample_for_bin': 302861,
    'max_depth': 54,
    'learning_rate': 0.0953117574398012,
    'reg_alpha': 0.7128389224131779,
    'reg_lambda': 0.18000309104506118,
    'min_split_gain': 7.950478049619436,
    'min_child_samples': 26,
    'colsample_bytree': 0.5503658153831644,
}


def _fit_regressor(features, target):
    """Fit a fresh LGBMRegressor using the shared hyper-parameters and seed."""
    regressor = lgb.LGBMRegressor(device='cpu', random_state=SEED, **params)
    regressor.fit(features, target, categorical_feature=category_columns)
    return regressor


# Stage 1: labelled data only; its test predictions become the pseudo-labels.
model_lgb = _fit_regressor(X_train, y_train)
y_pred_test = model_lgb.predict(X_test)

# Stage 2: stack train and test rows, with true targets followed by pseudo-labels.
X_comb = pd.concat([X_train, X_test], ignore_index=True)
# Re-apply the category dtype after stacking (presumably because concatenating
# frames whose categoricals differ does not keep the dtype -- confirm).
X_comb[category_columns] = X_comb[category_columns].astype("category")
y_comb = np.concatenate((y_train, y_pred_test))

model_lgb = _fit_regressor(X_comb, y_comb)

# Final test-set predictions from the refitted model.
y_pred = model_lgb.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.896836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 731209
[LightGBM] [Info] Number of data points in the train set: 33248, number of used features: 3085
[LightGBM] [Info] Start training from score 104938.651998
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.370421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 749414
[LightGBM] [Info] Number of data points in the train set: 55414, number of used features: 3085
[LightGBM] [Info] Start training from score 105091.405287


# Submission

In [17]:
def submission(y_pred, file_name = 'submission.csv'):
    """Write predictions to a two-column CSV ('ID', 'Mean_Salary').

    IDs are taken from the notebook-level ``df_test`` frame; ``y_pred``
    supplies the matching salary predictions. The file is written to
    ``file_name`` without the index column.
    """
    output_columns = ['ID', 'Mean_Salary']
    frame = pd.DataFrame(
        {'ID': df_test['ID'], 'Mean_Salary': y_pred},
        columns=output_columns,
    )
    frame.to_csv(file_name, index=False)

In [18]:
# Persist the final predictions to the working directory.
submission(y_pred, file_name="submission.csv")