### Import libraries

In [110]:
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
import os
import random
from sklearn.model_selection import GroupKFold
from sklearn.metrics import precision_score

In [226]:
def SeedEverything(seed=808):
    """Make the stochastic parts of the notebook repeatable.

    Seeds Python's built-in ``random`` module and NumPy's legacy global
    generator, and exports the seed as the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value handed to both generators and written (as a string)
            to ``PYTHONHASHSEED``. Defaults to 808.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so setting it
        here influences child processes launched afterwards, not the hash
        randomisation of the interpreter that is already running.
    """
    seed_text = str(seed)

    # Built-in generator first, then the environment, then NumPy.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = seed_text
    np.random.seed(seed)

In [227]:
def data_preprocessing(path="case_2_results.csv"):
    """Load the dataset from CSV and blank out missing keyword strings.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame in which missing values of
        ``resume_main_keywords`` and ``vacancy_main_keywords`` have been
        replaced by empty strings. All other columns are left as read.
    """
    frame = pd.read_csv(path)

    # Missing keywords become "" (an empty string, not a space).
    for keyword_col in ('resume_main_keywords', 'vacancy_main_keywords'):
        frame[keyword_col] = frame[keyword_col].fillna("")

    return frame

### Main part

In [228]:
# Seed Python's `random`, NumPy and PYTHONHASHSEED with the default seed (808) before any other step runs
SeedEverything()

In [229]:
# Load the dataset; data_preprocessing also replaces missing keyword strings with ""
df = data_preprocessing(path="case_2_results.csv")

### Create kfold object

In [221]:
# Use 'vacancy_id' as the grouping key for cross-validation: GroupKFold keeps
# all rows sharing a group value in the same fold, so a vacancy never appears
# in both the training and the validation split.
groups = df['vacancy_id']

# 5-fold group-aware splitter; the splits themselves are produced later by gkf.split(...)
gkf = GroupKFold(n_splits=5)

### Create catboost model

In [223]:
# Classifier trained with the Logloss objective and evaluated with Precision;
# trees of depth 3, 40 boosting iterations, learning rate 0.1.
model = CatBoostClassifier(loss_function='Logloss', eval_metric='Precision', depth=3, iterations=40, learning_rate=0.1)

# Columns passed to fit() as categorical features
cat_features = ['is_english', 'edu',]

# Columns passed to fit() as text features (the keyword columns whose missing
# values are filled with "" in data_preprocessing)
text_features = ['vacancy_main_keywords', 'resume_main_keywords']

### Train catboost and check precision score

In [224]:
# Label and identifier columns that must not be fed to the model as features
non_feature_cols = ['target', 'vacancy_id', 'resume_id']

precision_scores = []
score = 0

# One train/validate round per group-aware fold
for train_idx, val_idx in gkf.split(df, df['target'], groups=groups):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Build the feature frames once per fold and reuse them below
    train_features = train_df.drop(columns=non_feature_cols)
    val_features = val_df.drop(columns=non_feature_cols)

    # Refit the shared `model` on this fold's training rows; the validation
    # rows are only used for the logged eval metric (use_best_model is off)
    model.fit(
        train_features,
        train_df['target'],
        cat_features=cat_features,
        text_features=text_features,
        eval_set=(val_features, val_df['target']),
        verbose=10,
        use_best_model=False,
    )

    # Predicted labels are compared against the string 'True' to obtain a
    # boolean array that can be scored against the target column
    val_preds = model.predict(val_features) == 'True'

    # Per-fold precision, both stored and added to the running total
    precision = precision_score(val_df['target'], val_preds)
    precision_scores.append(precision)
    score += precision

print("############")
print(f"Average precision score for {gkf.n_splits} splits:", score / gkf.n_splits)

0:	learn: 0.5318352	test: 0.4029851	best: 0.4029851 (0)	total: 7.51ms	remaining: 293ms
10:	learn: 0.5946970	test: 0.4482759	best: 0.4482759 (10)	total: 59.7ms	remaining: 157ms
20:	learn: 0.6517413	test: 0.5666667	best: 0.5862069 (15)	total: 111ms	remaining: 100ms
30:	learn: 0.7093023	test: 0.5925926	best: 0.5925926 (25)	total: 162ms	remaining: 47.1ms
39:	learn: 0.7305389	test: 0.6000000	best: 0.6250000 (32)	total: 210ms	remaining: 0us

bestTest = 0.625
bestIteration = 32

0:	learn: 0.8571429	test: 0.0000000	best: 0.0000000 (0)	total: 6.18ms	remaining: 241ms
10:	learn: 0.6956522	test: 1.0000000	best: 1.0000000 (2)	total: 58.4ms	remaining: 154ms
20:	learn: 0.7349398	test: 0.6000000	best: 1.0000000 (2)	total: 110ms	remaining: 99.9ms
30:	learn: 0.7398374	test: 0.7037037	best: 1.0000000 (2)	total: 164ms	remaining: 47.7ms
39:	learn: 0.7837838	test: 0.7647059	best: 1.0000000 (2)	total: 212ms	remaining: 0us

bestTest = 1
bestIteration = 2

0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0

### Saving the model

In [230]:
# Destination file for the serialized model
model_path = 'catboost_model.cbm'

# Persist the current state of `model` to model_path.
# NOTE(review): the cross-validation loop refits this same `model` object on
# every fold, so this presumably saves the model fitted on the last fold's
# training split rather than on the full dataset — confirm this is intended.
model.save_model(model_path)
print("Model saved successfully to:", model_path)

Model saved successfully to: catboost_model.cbm


### Example of model loading

In [231]:
# Same file name that the saving step writes to
model_path = 'catboost_model.cbm'

# Create an empty classifier and populate it from the saved file; load_model is
# the last expression of the cell, so its return value is what gets displayed.
loaded_model = CatBoostClassifier()
loaded_model.load_model(model_path)

<catboost.core.CatBoostClassifier at 0x2b866cc2b4f0>