In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import optuna
from functools import cache
from joblib import dump, load
from scipy.sparse import hstack
from collections import Counter
from tqdm import tqdm

import filling_missing_data
import memory_optimization
import text_processing
import saving_data

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sentence_transformers import SentenceTransformer

In [51]:
def log10_positive(series: pd.Series) -> pd.Series:
  return np.where(series > 0, np.log10(series), series)

In [52]:
def fct_lump(data, n, other_category='Other'):
    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    counts = data.value_counts()
    categories_to_keep = counts.nlargest(n).index
    data_lumped = data.where(data.isin(categories_to_keep), other_category)
    return data_lumped

In [53]:
def get_text_embeddings(model, series: pd.Series, cache_ = False):

    @cache
    def _encode(sentence):
        return model.encode([sentence])[0]

    if cache_:
        embeddings = []
        for sentence in series.tolist():
            embeddings.append(_encode(sentence))
        return np.vstack(embeddings)
    else:
        embeddings = model.encode(series.tolist())
        return embeddings

In [54]:
def perform_pca(df, n_components, name):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)
    
    pca_df = pd.DataFrame(data=principal_components, columns=[f'PC_{name}_{i+1}' for i in range(n_components)])
    
    explained_variance = pca.explained_variance_ratio_
    
    cumulative_variance = explained_variance.cumsum()
    
    #print(f'Variance explained by the first {n_components} components: {cumulative_variance[-1]:.4f}')
    
    return pca_df

In [55]:
model = SentenceTransformer("all-mpnet-base-v2")

1. Чтение и форматирование данных

In [56]:
train = pd.read_csv('src\\train.csv')
test = pd.read_csv('src\\test_feat.csv')

In [57]:
columns_for_rename = {'лимит_нарушен': 'limit_exceeded', \
                      'пол': 'sex', \
                      'тип': 'type', \
                      'цель': 'purpose', \
                      'кредитоспособность': 'creditworthiness', \
                      'другие_кредиты': 'other_loans', \
                      'бизнес': 'buisness', \
                      'сумма': 'amount', \
                      'сбор': 'collection', \
                      'срок': 'time', \
                      'амортизация': 'amortization', \
                      'только_процент': 'percentage_only', \
                      'один_платеж': 'one_payment', \
                      'стоимость_имущества': 'property_price', \
                      'работа': 'housing_type', \
                      'тип_залога': 'deposite_type', \
                      'тип_кредита': 'credit_type', \
                      'кредитный_рейтинг': 'credit_score', \
                      'возраст': 'age_category', \
                      'прямой_залог': 'direct_deposite', \
                      'речь': 'speech'
}

train.rename(columns=columns_for_rename, inplace=True)
train.rename(columns={'дефолт': 'target'}, inplace=True)
test.rename(columns=columns_for_rename, inplace=True)

2. Общая информация

In [58]:
train

Unnamed: 0,ID,limit_exceeded,sex,type,purpose,creditworthiness,other_loans,buisness,amount,collection,...,one_payment,property_price,housing_type,deposite_type,credit_type,credit_score,age_category,direct_deposite,target,speech
0,9e0a294e-c3cc-4aac-a489-41f479a55d6f,0.0,м,3.0,0.0,1.0,,0.0,174765.264968,normal,...,0.0,244488.111841,осн,дом,2.0,709.508335,55-64,1.0,0,"""Hey there! I’m a guy with a solid job and a m..."
1,e1509425-891f-4683-9e12-dbf1493d6196,0.0,,,0.0,1.0,0.0,0.0,,want,...,0.0,,осн,дом,1.0,722.508335,55-64,,0,"""Hi there! We are a couple with a solid financ..."
2,10b6a662-2ac1-45ca-8e5d-8f1296b31f4a,0.0,м,3.0,1.0,1.0,0.0,0.0,,regular,...,,364488.111841,осн,дом,1.0,785.508335,55-64,1.0,0,"""I'm a male client with a modest income. I wor..."
3,3b895644-549d-43b5-8dfe-448cacb70476,0.0,,3.0,1.0,1.0,0.0,0.0,474765.264968,common,...,0.0,894488.111841,осн,дом,,594.508335,35-44,1.0,0,"""Hello, we’re a working couple enjoying the st..."
4,e0b641d0-4345-429c-9655-8df410af54cc,0.0,,3.0,1.0,,0.0,0.0,384765.264968,excessive,...,0.0,504488.111841,осн,дом,,589.508335,55-64,,0,"""Greetings, we're Chris and Sam. Both employed..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,dcadfda8-ece0-47bb-9835-9b4b4dfcf384,0.0,,3.0,1.0,,0.0,0.0,,not slight,...,0.0,744488.111841,осн,дом,2.0,,35-44,,0,"""Hello! I’m here to introduce myself. I have a..."
7996,2b39e995-c13e-40d3-823a-0b12d6baf0f2,,,,1.0,1.0,0.0,1.0,,mean,...,0.0,184488.111841,осн,дом,1.0,796.508335,>74,1.0,0,"""Hello! I’m excited to be here. My income is m..."
7997,b44aad77-b20c-4a3f-bee8-acf10b3e2b6e,0.0,м,3.0,3.0,1.0,,0.0,394765.264968,Whole lotta nothin',...,0.0,,осн,дом,4.0,584.508335,25-34,1.0,1,"""I'm just an ordinary guy trying to make ends ..."
7998,98cf1188-06d2-4913-a117-3719b023a0ea,,,,1.0,1.0,0.0,0.0,594765.264968,Total lack of presence,...,0.0,,осн,дом,,873.508335,35-44,1.0,0,"""Hello! We’re a couple looking to manage our f..."


In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                8000 non-null   object 
 1   limit_exceeded    6462 non-null   float64
 2   sex               3778 non-null   object 
 3   type              6466 non-null   float64
 4   purpose           6442 non-null   float64
 5   creditworthiness  6406 non-null   float64
 6   other_loans       6425 non-null   float64
 7   buisness          6369 non-null   float64
 8   amount            6414 non-null   float64
 9   collection        8000 non-null   object 
 10  time              6376 non-null   float64
 11  amortization      6338 non-null   float64
 12  percentage_only   6435 non-null   float64
 13  one_payment       6396 non-null   float64
 14  property_price    5790 non-null   float64
 15  housing_type      8000 non-null   object 
 16  deposite_type     8000 non-null   object 


In [60]:
train.describe()

Unnamed: 0,limit_exceeded,type,purpose,creditworthiness,other_loans,buisness,amount,time,amortization,percentage_only,one_payment,property_price,credit_type,credit_score,direct_deposite,target
count,6462.0,6466.0,6442.0,6406.0,6425.0,6369.0,6414.0,6376.0,6338.0,6435.0,6396.0,5790.0,6340.0,6386.0,6420.0,8000.0
mean,0.065305,2.664089,1.118131,0.96113,0.004669,0.134715,348441.6,340.620736,0.101925,0.050971,0.019856,538655.6,2.14858,710.309933,0.999844,0.23675
std,0.247082,0.653671,1.13839,0.1933,0.068178,0.341446,184294.4,59.079085,0.302573,0.219956,0.139517,360784.2,0.986423,115.213366,0.012481,0.425114
min,0.0,1.0,0.0,0.0,0.0,0.0,44765.26,101.885949,0.0,0.0,0.0,74488.11,1.0,511.508335,0.0,0.0
25%,0.0,3.0,0.0,1.0,0.0,0.0,214765.3,365.885949,0.0,0.0,0.0,314488.1,1.0,611.508335,1.0,0.0
50%,0.0,3.0,1.0,1.0,0.0,0.0,314765.3,365.885949,0.0,0.0,0.0,454488.1,2.0,708.508335,1.0,0.0
75%,0.0,3.0,2.0,1.0,0.0,0.0,452265.3,365.885949,0.0,0.0,0.0,664488.1,3.0,810.508335,1.0,0.0
max,1.0,3.0,3.0,1.0,1.0,1.0,3024765.0,365.885949,1.0,1.0,1.0,4914488.0,4.0,911.508335,1.0,1.0


3. Приведение series к нужным типам

In [62]:
train['time'].value_counts()

time
365.885949    5229
185.885949     575
245.885949     258
305.885949     116
329.885949      97
125.885949      17
149.885949      11
101.885949      11
353.885949      10
137.885949       9
317.885949       8
341.885949       8
173.885949       5
161.885949       4
269.885949       3
293.885949       2
233.885949       2
221.885949       2
257.885949       2
281.885949       2
209.885949       2
197.885949       1
170.885949       1
113.885949       1
Name: count, dtype: int64

In [13]:
def transform_time(series, n = 5):
    return fct_lump(series, n = n).astype(str).str.replace(r'\..*', '', regex=True)

In [14]:
loan_periods = {
    "365": "year",   
    "Other": "Other",         
    "185": "half",  
    "245": "nine_months",  
    "305": "ten_months",    
    "329": "eleven_months",   
}

In [15]:
transform_time(train['time']).map(loan_periods).value_counts()

time
year             5229
Other            1725
half              575
nine_months       258
ten_months        116
eleven_months      97
Name: count, dtype: int64

In [16]:
transform_time(test['time']).map(loan_periods).value_counts()

time
year             1301
Other             417
half              159
nine_months        66
ten_months         31
eleven_months      26
Name: count, dtype: int64

In [17]:
train['time'] = transform_time(train['time']).map(loan_periods)
test['time'] = transform_time(test['time']).map(loan_periods)

In [63]:
test['direct_deposite'].value_counts()

direct_deposite
1.0    1592
0.0       2
Name: count, dtype: int64

In [19]:
train['deposite_type'] = (train['deposite_type'] == 'дом').astype(int)
test['deposite_type'] = (test['deposite_type'] == 'дом').astype(int)

In [20]:
train['direct_deposite'] = (train['direct_deposite'] == 1.0).astype(int)
test['direct_deposite'] = (test['direct_deposite'] == 1.0).astype(int)

In [21]:
train['amount'] = log10_positive(train['amount'])
test['amount'] = log10_positive(test['amount'])

train['property_price'] = log10_positive(train['property_price'])
test['property_price'] = log10_positive(test['property_price'])

In [None]:
train.dtypes

In [23]:
memory_optimization.auto_optimize_dtypes(train, inplace=True)
memory_optimization.auto_optimize_dtypes(test, inplace=True)
train['target'] = train['target'].astype(int)

In [24]:
train.dtypes

ID                    object
limit_exceeded      category
sex                 category
type                category
purpose             category
creditworthiness    category
other_loans         category
buisness            category
amount               float32
collection            object
time                category
amortization        category
percentage_only     category
one_payment         category
property_price       float32
housing_type        category
deposite_type       category
credit_type         category
credit_score         float32
age_category        category
direct_deposite     category
target                 int32
speech                object
dtype: object

In [25]:
#filling_missing_data.filling_missing_data_train(train, inplace=True)
#filling_missing_data.filling_missing_data_test(test, inplace=True)

In [26]:
list(train['collection'].value_counts().to_dict().keys())[10:15]

['modest', 'normal', 'commonplace', 'average', 'adequate']

In [27]:
embeddings = model.encode(list(train['collection'].value_counts().to_dict().keys())[10:15])
similarities = model.similarity(embeddings, embeddings)
similarities

tensor([[1.0000, 0.3568, 0.4157, 0.3051, 0.4075],
        [0.3568, 1.0000, 0.5485, 0.5488, 0.3701],
        [0.4157, 0.5485, 1.0000, 0.4773, 0.3640],
        [0.3051, 0.5488, 0.4773, 1.0000, 0.5623],
        [0.4075, 0.3701, 0.3640, 0.5623, 1.0000]])

In [28]:
# collection_embed_train = get_text_embeddings(model, train['collection'], cache_ = True)
# speech_embed_train = get_text_embeddings(model, train['speech'])

# collection_embed_test = get_text_embeddings(model, test['collection'], cache_ = True)
# speech_embed_test = get_text_embeddings(model, test['speech'])

# saving_data.write_to_pickle_lzma(collection_embed_train, "collection_embed_train.lzma")
# saving_data.write_to_pickle_lzma(speech_embed_train, "speech_embed_train.lzma")

# saving_data.write_to_pickle_lzma(collection_embed_test, "collection_embed_test.lzma")
# saving_data.write_to_pickle_lzma(speech_embed_test, "speech_embed_test.lzma")

In [29]:
collection_embed_train = saving_data.read_from_pickle_lzma("collection_embed_train.lzma")
speech_embed_train = saving_data.read_from_pickle_lzma("speech_embed_train.lzma")

In [30]:
age_cats = {
    '45-54': '45-54',
    '35-44': '35-44',
    '55-64': '55-64',
    ">74": "65+", 
    "65-74": "65+", 
    "<25": "18-34", 
    "25-34": "18-34", 
    "-1": 'missing'
}

In [31]:
train['age_category'].value_counts()

age_category
45-54    1878
35-44    1777
55-64    1696
65-74    1137
25-34    1046
>74       376
<25        82
-1          8
Name: count, dtype: int64

In [32]:
train['age_category'].map(age_cats).value_counts()

age_category
45-54    1878
35-44    1777
55-64    1696
65+      1513
18-34    1128
Name: count, dtype: int64

In [33]:
X = train.drop(['ID', 'target', "collection", "speech"], axis=1)
test.drop('ID', axis=1, inplace=True)
y = train['target']

In [34]:
y.value_counts()

target
0    6106
1    1894
Name: count, dtype: int64

In [47]:
def objective_cb(trial):
    
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    use_collection_embed = trial.suggest_categorical("use_collection_embed", ["yes", "no"])
    use_speech_embed = trial.suggest_categorical("use_speech_embed", ["yes", "no"])
    trim_age_categories = trial.suggest_categorical("trim_age_categories", ["yes", "no"])

    dim_pca =  trial.suggest_int("dim_pca", low=5, high=35, step=5, log=False)

    X_trial = X.copy()

    if trim_age_categories == "yes":
        X_trial['age_category'] = X_trial['age_category'].map(age_cats).astype(str)

    X_trial = pd.get_dummies(X_trial, drop_first=True)
    X_trial.columns = X_trial.columns.astype(str).str.replace(r'[^\w]', '_', regex=True)

    if use_collection_embed == "yes":
        collection_pc = perform_pca(collection_embed_train, dim_pca, "collection")
        X_trial = pd.concat([X_trial, collection_pc], axis=1)

    if use_speech_embed == "yes":
        speech_pc = perform_pca(speech_embed_train, dim_pca, "speech")
        X_trial = pd.concat([X_trial, speech_pc], axis=1)

    kf = KFold(n_splits=5, shuffle=True)
    total_fbeta = 0
    total_samples = 0

    for train_index, val_index in tqdm(kf.split(X_trial)):
        X_train, X_val = X_trial.iloc[train_index], X_trial.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        rus = RandomUnderSampler(random_state=None)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

        cb = CatBoostClassifier(**params, silent=True)
        cb.fit(X_resampled, y_resampled)

        y_pred = cb.predict(X_val)

        y_pred_binary = np.where(y_pred > 0.5, 1, 0)

        f_beta = fbeta_score(y_val, y_pred_binary, beta=3)
        total_fbeta += f_beta

    average_fbeta = total_fbeta / 5 

    return average_fbeta


In [48]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_cb, n_trials = 100, n_jobs = 1, show_progress_bar = True)

[I 2025-01-09 16:26:14,350] A new study created in memory with name: no-name-a110f513-6eea-47dc-b50e-5dc7312c5d13


  0%|          | 0/100 [00:00<?, ?it/s]

5it [00:26,  5.30s/it]


[I 2025-01-09 16:26:41,416] Trial 0 finished with value: 0.9379374571165744 and parameters: {'learning_rate': 0.020646687589027046, 'depth': 2, 'subsample': 0.8297059858129878, 'colsample_bylevel': 0.5477244791895105, 'min_data_in_leaf': 64, 'use_collection_embed': 'no', 'use_speech_embed': 'yes', 'trim_age_categories': 'no', 'dim_pca': 25}. Best is trial 0 with value: 0.9379374571165744.


5it [00:54, 10.81s/it]


[I 2025-01-09 16:27:35,492] Trial 1 finished with value: 0.5963766988170387 and parameters: {'learning_rate': 0.01682561519307549, 'depth': 7, 'subsample': 0.5153666592053858, 'colsample_bylevel': 0.9304762096506769, 'min_data_in_leaf': 67, 'use_collection_embed': 'no', 'use_speech_embed': 'no', 'trim_age_categories': 'yes', 'dim_pca': 5}. Best is trial 0 with value: 0.9379374571165744.


5it [01:10, 14.01s/it]


[I 2025-01-09 16:28:46,110] Trial 2 finished with value: 0.9361767516211842 and parameters: {'learning_rate': 0.006476204749168591, 'depth': 6, 'subsample': 0.32193203697928985, 'colsample_bylevel': 0.20129274784840007, 'min_data_in_leaf': 10, 'use_collection_embed': 'no', 'use_speech_embed': 'yes', 'trim_age_categories': 'yes', 'dim_pca': 35}. Best is trial 0 with value: 0.9379374571165744.


5it [00:29,  5.86s/it]


[I 2025-01-09 16:29:16,126] Trial 3 finished with value: 0.9342721472725077 and parameters: {'learning_rate': 0.01136297203743071, 'depth': 3, 'subsample': 0.32683372000628036, 'colsample_bylevel': 0.21923619994852256, 'min_data_in_leaf': 40, 'use_collection_embed': 'no', 'use_speech_embed': 'yes', 'trim_age_categories': 'no', 'dim_pca': 35}. Best is trial 0 with value: 0.9379374571165744.


2it [02:08, 64.09s/it]


[W 2025-01-09 16:31:25,381] Trial 4 failed with parameters: {'learning_rate': 0.0037648146542459786, 'depth': 6, 'subsample': 0.6714897681580109, 'colsample_bylevel': 0.9794528376399229, 'min_data_in_leaf': 85, 'use_collection_embed': 'yes', 'use_speech_embed': 'yes', 'trim_age_categories': 'no', 'dim_pca': 30} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\Imani\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Imani\AppData\Local\Temp\ipykernel_1572\598441006.py", line 45, in objective_cb
    cb.fit(X_resampled, y_resampled)
  File "c:\Users\Imani\anaconda3\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\Imani\anaconda3\Lib\site-packages\catboo

KeyboardInterrupt: 