# Loading libraries and data

In [2]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error

# Fold count. NOTE(review): not referenced in any visible cell (presumably meant for KFold) — confirm or remove.
n_splits = 5
# Seed passed as `random_state` to the LightGBM regressors so runs are repeatable.
SEED = 42



In [3]:
# Locations of the competition CSV files inside the Kaggle input directory.
TRAIN_PATH = '/kaggle/input/salary-prediction-for-job-postings/usjobs_train.csv'
TEST_PATH = '/kaggle/input/salary-prediction-for-job-postings/usjobs_test.csv'
SOLUTION_PATH = '/kaggle/input/salary-prediction-for-job-postings/usjobs_sample_submission.csv'

# Read the three files in one pass: train, test and the sample submission.
df_train, df_test, df_solution = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SOLUTION_PATH)
)

# Feature Engineering

In [9]:
import warnings

# Silence pandas PerformanceWarning (e.g. the DataFrame-fragmentation warning
# emitted when many columns are inserted one at a time) so it does not flood
# the cell output. Other warning categories are left untouched.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [4]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.0/86.0 kB 3.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... done
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=20e4e4a2ab6b1ed097457345ae2131589791244068935a2976d7a5e54ff6487c
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [5]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding checkpoint used to vectorise the text columns.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-distilroberta-v1'

# Instantiating the model fetches the checkpoint files on first use.
language_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
# Columns cast to pandas `category` dtype and passed to LightGBM as
# categorical features. Names are kept exactly as the notebook uses them
# ('Frecuency_Salary' included — presumably the dataset's own header spelling).
category_columns = [
    'Jobs_Group',
    'Profile',
    'Remote',
    'City',
    'State',
    'Frecuency_Salary',
    'Sector_Group',
    'Revenue',
    'Employee',
    'Director',
]

In [7]:
def featureEngineering(df, mode):
    """Build the model-ready feature frame from a raw job-postings frame.

    The text columns ``Skills``, ``Job``, ``Company`` and ``Sector`` are
    encoded with the module-level ``language_model``; each embedding dimension
    becomes one ``Embedding_<Field>_<k>`` column (``k`` starts at 1). The raw
    text / identifier columns are then dropped and every column listed in the
    module-level ``category_columns`` is cast to pandas ``category`` dtype.

    The input ``df`` is NOT modified: embeddings are collected in separate
    frames and attached with a single ``pd.concat`` instead of being inserted
    into ``df`` one column at a time. This keeps the caller's frame clean
    (so the cell can be re-run safely) and avoids DataFrame fragmentation.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the four text columns, the columns that get
        dropped (``ID``, ``Location``, ``URL`` and, in train mode,
        ``Mean_Salary``) and every name in ``category_columns``.
    mode : str
        ``'train'`` additionally drops the target column ``Mean_Salary``;
        any other value is handled as test data.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``: the remaining original
        columns followed by the Skill, Job, Company and Sector embedding
        columns, in that order.
    """
    text_sources = (
        ('Skill', df['Skills']),
        ('Job', df['Job']),
        ('Company', df['Company']),
        # Sector is stringified first so non-string entries can be encoded.
        ('Sector', df['Sector'].astype(str)),
    )

    embedding_frames = []
    for label, texts in text_sources:
        # A plain list keeps the encoding independent of the frame's index labels.
        vectors = language_model.encode(texts.tolist())
        column_names = [f'Embedding_{label}_{k + 1}' for k in range(vectors.shape[1])]
        embedding_frames.append(
            pd.DataFrame(vectors, index=df.index, columns=column_names)
        )

    dropped = ['ID', 'Skills', 'Location', 'Job', 'Company', 'URL', 'Sector']
    if mode == 'train':
        # The target must never leak into the feature matrix.
        dropped.append('Mean_Salary')

    # One concat instead of thousands of single-column inserts.
    features = pd.concat([df.drop(columns=dropped), *embedding_frames], axis=1)
    features[category_columns] = features[category_columns].astype("category")

    return features

In [10]:
# Pull the target out first; the train-mode feature frame no longer contains it.
y_train = df_train['Mean_Salary'].to_numpy()

# Feature matrices for the labelled rows and for the rows to be predicted.
X_train = featureEngineering(df_train, 'train')
X_test = featureEngineering(df_test, 'test')

Batches:   0%|          | 0/1039 [00:00<?, ?it/s]

Batches:   0%|          | 0/1039 [00:00<?, ?it/s]

Batches:   0%|          | 0/1039 [00:00<?, ?it/s]

Batches:   0%|          | 0/1039 [00:00<?, ?it/s]

Batches:   0%|          | 0/693 [00:00<?, ?it/s]

Batches:   0%|          | 0/693 [00:00<?, ?it/s]

Batches:   0%|          | 0/693 [00:00<?, ?it/s]

Batches:   0%|          | 0/693 [00:00<?, ?it/s]

# Regressor

In [11]:
# Pseudo-labelling: fit on the labelled rows, predict the test rows, then
# refit on train + test with those predictions standing in as test targets.

params = {
    'n_estimators': 1048,
    'num_leaves': 70,
    'subsample_for_bin': 302861,
    'max_depth': 54,
    'learning_rate': 0.0953117574398012,
    'reg_alpha': 0.7128389224131779,
    'reg_lambda': 0.18000309104506118,
    'min_split_gain': 7.950478049619436,
    'min_child_samples': 26,
    'colsample_bytree': 0.5503658153831644,
}


def _fit_lgbm(features, target):
    """Fit a fresh LGBMRegressor with the shared params/seed and return it."""
    regressor = lgb.LGBMRegressor(device='cpu', random_state = SEED, **params)
    regressor.fit(features, target, categorical_feature=category_columns)
    return regressor


# Stage 1: model trained on the labelled data only, used to pseudo-label the test rows.
model_lgb = _fit_lgbm(X_train, y_train)
y_pred_test = model_lgb.predict(X_test)

# Stage 2: stack train and test rows, with stage-1 predictions as test targets.
X_comb = pd.concat([X_train, X_test], ignore_index=True)
# Re-cast after stacking: the concat may not keep the category dtype when the
# two frames carry different category sets.
X_comb[category_columns] = X_comb[category_columns].astype("category")
y_comb = np.concatenate((y_train, y_pred_test))

model_lgb = _fit_lgbm(X_comb, y_comb)

# Final predictions for the submission file.
y_pred = model_lgb.predict(X_test)



# Submission

In [12]:
def submission(y_pred, file_name = 'submission.csv'):
    """Write a two-column submission CSV (``ID``, ``Mean_Salary``).

    ``ID`` values are taken from the module-level ``df_test`` frame and paired
    with ``y_pred``; the file is written to ``file_name`` without the index.
    """
    submission_frame = pd.DataFrame(
        {'ID': df_test['ID'], 'Mean_Salary': y_pred},
        columns=['ID', 'Mean_Salary'],
    )
    submission_frame.to_csv(file_name, index=False)

In [13]:
# Persist the final predictions in the competition's submission format.
submission(y_pred, file_name="submission.csv")