### Import libraries

In [36]:
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
import os
import random
from sklearn.model_selection import GroupKFold
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
def SeedEverything(seed=808):
    """Seed the random number sources this notebook touches directly.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    it here presumably only affects child processes - confirm if hash
    determinism of the running kernel is required.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both seeders take the same integer; apply them in a single pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

In [61]:
def data_preprocessing(path="case_2_results.csv"):
    """Read the CSV at ``path`` and blank out missing text fields.

    Missing values in the keyword and description columns are replaced with
    empty strings; every other column is returned as read by pandas.

    Parameters
    ----------
    path : str
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the four text columns free of NaNs.
    """
    # Text columns whose NaNs are replaced with "" (an empty string, not a space).
    text_columns = (
        'resume_main_keywords',
        'vacancy_main_keywords',
        'vacancy_description',
        'resume_description',
    )

    frame = pd.read_csv(path)
    for column in text_columns:
        frame[column] = frame[column].fillna("")

    return frame

### Main part

In [62]:
# Seed Python's `random` and NumPy's global RNG with the default seed (808) before any other work.
SeedEverything()

In [63]:
# Load the data used for cross-validated training; NaNs in the four text columns become "".
df = data_preprocessing(path="case_2_results_new.csv")

In [64]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,vacancy_id,resume_id,requested_experience,vacancy_description,resume_description,is_english,edu,target,resume_experience,vacancy_main_keywords,resume_main_keywords
0,779f3a59-206a-3241-adc4-d7db504f960b,74392e00-ecfb-335b-9fc1-c2652dca06e5,3.0,описание мы расширяем команды и ищем разработ...,интеграционные адаптеры для передачи заявок п...,False,relevant_high,False,10,java разработчик команда инвестиции,java spring boot java ee sql hibernate git doc...
1,779f3a59-206a-3241-adc4-d7db504f960b,2b5ad5e1-1f31-3f3f-8a66-43cd89233672,3.0,описание мы расширяем команды и ищем разработ...,разработка программного комплекса кредитного ...,False,relevant_high,False,9,java разработчик команда инвестиции,ооп java java spring framework kotlin mysql do...
2,779f3a59-206a-3241-adc4-d7db504f960b,ea1ac51a-e16b-367a-9216-52fb64809db1,3.0,описание мы расширяем команды и ищем разработ...,"• разрабатывал backend часть по открытию, зак...",True,relevant_high,False,4,java разработчик команда инвестиции,java spring framework hibernate orm sql java j...
3,779f3a59-206a-3241-adc4-d7db504f960b,ecfc02a1-592c-3ed0-a801-1ad9ab3d30b8,3.0,описание мы расширяем команды и ищем разработ...,full architecture & design & release & support...,True,relevant_high,False,13,java разработчик команда инвестиции,java git sql html javascript css mysql ооп lin...
4,779f3a59-206a-3241-adc4-d7db504f960b,aff6b6bd-89c2-3b2c-ab2e-0b9f76ac367c,3.0,описание мы расширяем команды и ищем разработ...,разработка сервиса с нуля. spring boot + post...,True,relevant_high,False,5,java разработчик команда инвестиции,java spring git postgresql hibernate orm bpmn ...


### Create kfold object

In [65]:
# Define groups based on 'vacancy_id': GroupKFold keeps all rows that share a
# vacancy in the same fold, so a vacancy never appears in both train and validation.
groups = df['vacancy_id']

# Initialize GroupKFold with 5 folds (one model is trained and saved per fold below)
gkf = GroupKFold(n_splits=5)

### Create catboost model

In [101]:
# Columns CatBoost should treat as categorical
cat_features = ['is_english', 'edu']

# Columns CatBoost should process as free text
text_features = [
    'vacancy_main_keywords',
    'resume_main_keywords',
    'vacancy_description',
]

# CatBoost classifier: 200 trees of depth 3, trained on Logloss with F1 as the reported metric
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.15,
    depth=3,
    loss_function='Logloss',
    eval_metric='F1',
)

### Train catboost and check F1 score

In [102]:
# Columns that must not reach the model: the label, the two identifiers and the
# raw resume text (which is not listed in text_features).
# NOTE(review): the 'Unnamed: 0' column visible in df.head() is not dropped, so it
# is passed to the model as a feature - confirm this is intended.
non_feature_columns = ['target', 'vacancy_id', 'resume_id', 'resume_description']

# Per-fold F1 scores, in fold order
f1_scores = []

# Iterate over the grouped splits; `fold` numbers the saved model files from 0
for fold, (train_idx, val_idx) in enumerate(gkf.split(df, df['target'], groups=groups)):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Build each feature frame once instead of re-dropping the columns per call
    train_features = train_df.drop(columns=non_feature_columns)
    val_features = val_df.drop(columns=non_feature_columns)

    # Train the model; the validation fold is supplied as eval_set and
    # use_best_model=False is passed explicitly
    model.fit(
        train_features, train_df['target'],
        cat_features=cat_features,
        text_features=text_features,
        eval_set=(val_features, val_df['target']),
        verbose=10,
        use_best_model=False,
    )

    # Make predictions on the validation set
    val_preds = model.predict(val_features)
    # Convert predicted labels to boolean values. Comparing through str works
    # whether predict() hands back the labels as 'True'/'False' strings or as
    # real booleans; a direct `== 'True'` on booleans would silently yield all False.
    val_preds = np.asarray(val_preds).ravel().astype(str) == 'True'
    # Calculate the F1 score for the current fold
    f1 = f1_score(val_df['target'], val_preds)

    # Save score
    f1_scores.append(f1)

    # Save this fold's model; the inference section loads the files by the same name pattern
    model_path = f'catboost_model_fold_new_{fold}.cbm'
    model.save_model(model_path)

# Sum of the per-fold scores (kept under the original name)
score = sum(f1_scores)

print("############")
print(f"Average f1 score for {gkf.n_splits} splits:", score / gkf.n_splits)

0:	learn: 0.7146067	test: 0.4923077	best: 0.4923077 (0)	total: 31.4ms	remaining: 6.25s
10:	learn: 0.7365439	test: 0.5185185	best: 0.5185185 (3)	total: 341ms	remaining: 5.86s
20:	learn: 0.7808989	test: 0.5185185	best: 0.5185185 (3)	total: 650ms	remaining: 5.54s
30:	learn: 0.7877095	test: 0.5185185	best: 0.5185185 (3)	total: 961ms	remaining: 5.24s
40:	learn: 0.7966102	test: 0.5283019	best: 0.5283019 (36)	total: 1.27s	remaining: 4.92s
50:	learn: 0.8225352	test: 0.5283019	best: 0.5283019 (36)	total: 1.58s	remaining: 4.62s
60:	learn: 0.8441926	test: 0.5000000	best: 0.5283019 (36)	total: 1.89s	remaining: 4.3s
70:	learn: 0.8563380	test: 0.5283019	best: 0.5283019 (36)	total: 2.2s	remaining: 3.99s
80:	learn: 0.8876081	test: 0.5660377	best: 0.5660377 (80)	total: 2.5s	remaining: 3.68s
90:	learn: 0.8979592	test: 0.5283019	best: 0.5660377 (80)	total: 2.81s	remaining: 3.37s
100:	learn: 0.8953488	test: 0.5454545	best: 0.5660377 (80)	total: 3.13s	remaining: 3.07s
110:	learn: 0.9154519	test: 0.5555556	

### Inference

In [103]:
# Load the test data with the same preprocessing.
# Note: this rebinds `df`, so the training frame is no longer available after this cell.
df = data_preprocessing(path="case_2_test_new.csv")

In [108]:
# Number of saved fold models to average; must match the number of folds
# trained above (GroupKFold was created with n_splits=5).
NUM_MODELS = 5

In [115]:
# Build the feature frame once, outside the loop. errors='ignore' lets this work
# when the test file has no 'target' column, and also when the cell is re-run after
# the submission cell has already added 'model_preds' to df.
inference_features = df.drop(
    columns=['target', 'vacancy_id', 'resume_id', 'resume_description', 'model_preds'],
    errors='ignore',
)

# Class-probability arrays, one per fold model
fold_probas = []

for fold in range(NUM_MODELS):
    # Specify the file path from which you want to load the model
    model_path = f'catboost_model_fold_new_{fold}.cbm'

    # Load the model from the specified file path
    inference_model = CatBoostClassifier()
    inference_model.load_model(model_path)

    # Predict with the model that was just loaded. The previous code called
    # `model.predict_proba`, i.e. the last model left over from training, so
    # every iteration produced the same probabilities.
    preds_proba = inference_model.predict_proba(inference_features)
    fold_probas.append(preds_proba)

# Average the per-fold probabilities element-wise
preds_proba_total = np.mean(fold_probas, axis=0)

In [116]:
# Process probabilities
preds = preds_proba_total[:, 1]
preds = preds > 0.5

In [117]:
# Save submission in csv format
df["model_preds"] = preds
df.to_csv("submission.csv", index=False)