In [1]:
import polars as pl
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import re
import gc
import yake
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

# --- Notebook configuration -------------------------------------------------
# Russian stop words from NLTK (downloaded on demand).
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

# Used as the default `top_k` of process_chunk().
len_word = 10

# Seed for the cross-validation splitter.
RANDOM_STATE = 255

# Ask CatBoost how many GPUs it can see; treat a missing helper as "no GPU".
try:
    from catboost.utils import get_gpu_device_count
    gpu_count = get_gpu_device_count()
except ImportError:
    gpu_count = 0

task_type = 'GPU' if gpu_count > 0 else 'CPU'
if task_type == 'GPU':
    print(f"GPU доступен, количество устройств: {gpu_count}")
else:
    print("GPU недоступен, используем CPU")

from joblib import Parallel, delayed

# Number of joblib workers for keyword extraction.
n_jobs = 3

# Extra Russian/English function words added on top of the NLTK list.
custom_stopwords = {
    "для", "в", "с", "при", "вы", "не", "от", "что", "это", "на", "к",
    'from', 'and', 'on', 'to', 'for', 'with', 'that', 'this', 'what', 'who',
}

# Combined stop-word set (NLTK + custom) used by the text helpers.
stop_word = stop_words | custom_stopwords

GPU доступен, количество устройств: 1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rs_mi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_text(text):
    """Normalise a raw text value for keyword extraction.

    Every character that is not a Cyrillic/Latin letter, a digit, whitespace
    or one of the brackets ``() {} []`` is removed (not replaced by a space),
    then the result is lower-cased and stripped.

    Args:
        text: Value to clean; anything that is not a ``str`` is treated as missing.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if isinstance(text, str):
        kept = re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9\s\(\)\{\}\[\]]', '', text)
        return kept.lower().strip()
    return ''

def extract_words(text, max_len=100, stop_words=None, top_k=20):
    """Return the first keyword YAKE reports for ``text``.

    The text is cut to its first ``max_len`` characters before extraction.
    A fresh ``yake.KeywordExtractor`` (``n=5``, ``lan="ru"``) is built on
    every call. Although up to ``top_k`` candidates are requested, only the
    first entry of the result list is returned.

    Args:
        text: Input text; non-strings and blank strings yield ``''``.
        max_len: Maximum number of leading characters handed to YAKE.
        stop_words: Stop words for the extractor; a falsy value is replaced
            by an empty set.
        top_k: Number of candidates requested from YAKE.

    Returns:
        The keyword string of the first candidate, or ``''`` when the input
        is unusable or YAKE finds nothing.
    """
    if not (isinstance(text, str) and text.strip()):
        return ''

    head = text[:max_len] if len(text) > max_len else text
    extractor = yake.KeywordExtractor(
        n=5,
        top=top_k,
        lan="ru",
        stopwords=stop_words if stop_words else set(),
    )
    candidates = extractor.extract_keywords(head)
    # Each candidate is indexed as [0] for the keyword text.
    return candidates[0][0] if candidates else ''
    
def process_chunk(texts_chunk, stop_words=stop_word, top_k=len_word, n_jobs=3):
    """Run ``extract_words`` over one chunk of texts with joblib workers.

    The defaults for ``stop_words`` and ``top_k`` are taken from the
    module-level ``stop_word`` / ``len_word`` at the moment this function is
    defined; rebinding those names later does not change the defaults.

    Args:
        texts_chunk: Iterable of (already cleaned) texts.
        stop_words: Stop words forwarded to ``extract_words``.
        top_k: ``top_k`` forwarded to ``extract_words``.
        n_jobs: Number of joblib workers.

    Returns:
        The list produced by ``Parallel``: one extracted keyword per input text.
    """
    gc.collect()
    task = delayed(extract_words)
    pending = (
        task(item, max_len=100, stop_words=stop_words, top_k=top_k)
        for item in texts_chunk
    )
    extracted = Parallel(n_jobs=n_jobs)(pending)

    # Collect again once the batch is done.
    gc.collect()
    return extracted

def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` with at most ``size`` items.

    Args:
        seq: Any sliceable sequence supporting ``len()``.
        size: Step between slice starts (passed to ``range`` as the step).

    Yields:
        ``seq[start:start + size]`` for each start offset; the final slice
        may be shorter.
    """
    yield from (seq[start:start + size] for start in range(0, len(seq), size))

In [None]:
# Load the raw training data with Polars and build the model feature frame.
file_path = "OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_train.csv"
df = pl.read_csv(file_path)

target = 'resolution'
col_imp = ['id','resolution','PriceDiscounted','item_time_alive','seller_time_alive']
col_count = ['videos_published_count','photos_published_count','comments_published_count']
col_text = ['name_rus','CommercialTypeName4','brand_name']
col_total = col_imp + col_count + col_text

# Fill missing values in a single pass: counters become 0 (Int32),
# text columns become empty strings.
df_first = df.select(col_total).with_columns(
    [pl.col(c).fill_null(0).cast(pl.Int32) for c in col_count]
    + [pl.col(c).fill_null("").cast(pl.Utf8) for c in col_text]
)

chunk_size = 1000

# Replace every text column by the keyword extracted from its cleaned text.
# The stop-word set, top_k and worker count come from the configuration cell
# and are passed explicitly: process_chunk() captured its defaults when it was
# defined, so re-declaring the stop words here would silently have no effect.
for col in col_text:
    print(f"Обработка столбца {col}...")
    cleaned_texts = [clean_text(t) for t in df_first[col].to_list()]
    all_results = []
    for chunk in chunker(cleaned_texts, chunk_size):
        all_results.extend(
            process_chunk(chunk, stop_words=stop_word, top_k=len_word, n_jobs=n_jobs)
        )
    df_first = df_first.with_columns([pl.Series(name=col, values=all_results)])

gc.collect()

In [None]:
# Convert the Polars frame to pandas for CatBoost and cache it on disk.
# The index is written as well; it comes back as the 'Unnamed: 0' column
# when the file is read again.
df_pd = df_first.to_pandas()
df_pd.to_csv('df_first.csv', index=True)

In [2]:
# Recovery cell: run it after a kernel restart (e.g. when CatBoost crashes)
# to reload the cached features instead of repeating the preprocessing.
target = 'resolution'
col_imp = ['id','resolution','PriceDiscounted','item_time_alive','seller_time_alive']
col_count = ['videos_published_count','photos_published_count','comments_published_count']
col_text = ['name_rus','CommercialTypeName4','brand_name']
col_total = col_imp + col_count + col_text

# 'Unnamed: 0' is the index column stored in the CSV; it is not a feature.
df_pd = pd.read_csv('df_first.csv').drop(columns=['Unnamed: 0']).copy()

df_pd.info()
print(df_pd.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197198 entries, 0 to 197197
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        197198 non-null  int64  
 1   resolution                197198 non-null  int64  
 2   PriceDiscounted           197198 non-null  float64
 3   item_time_alive           197198 non-null  int64  
 4   seller_time_alive         197198 non-null  float64
 5   videos_published_count    197198 non-null  int64  
 6   photos_published_count    197198 non-null  int64  
 7   comments_published_count  197198 non-null  int64  
 8   name_rus                  197178 non-null  object 
 9   CommercialTypeName4       197191 non-null  object 
 10  brand_name                113654 non-null  object 
dtypes: float64(2), int64(6), object(3)
memory usage: 16.5+ MB
       id  resolution  PriceDiscounted  item_time_alive  seller_time_alive  \
0  159385           

In [None]:

# Text columns may contain NaN after the CSV round trip; CatBoost text
# features need plain strings.
df_pd[col_text] = df_pd[col_text].fillna('').astype('string')
X = df_pd.drop(columns=['id', target])
y = df_pd[target]

text_feature_indices = [X.columns.get_loc(c) for c in col_text]

# CatBoost cross-validation over a parameter grid.
# 'early_stopping_rounds' used to be given twice with different values
# (3 here, 50 in fit()); it is now defined only here, using the fit-time
# value of 50.
# NOTE(review): 'verbose' is still set both here and in fit(); the fit-time
# value is presumably the effective one - confirm against the CatBoost docs.
param_grid = {
    'depth': [12],
    'learning_rate': [0.1],
    'l2_leaf_reg': [1],
    'iterations': [200],
    #'task_type': [task_type],
    'gpu_ram_part': [0.8],
    'loss_function': ['Logloss'],
    'eval_metric': ['F1'],
    'custom_metric':['Recall'],
    'random_seed': [1],
    'early_stopping_rounds': [50],
    'use_best_model': [True],
    'verbose': [3],
    'thread_count': [3],
    'allow_writing_files':[False],
    'used_ram_limit': ['4gb']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_score = -np.inf
best_params = None
best_model = None
best_valid_idx = None  # hold-out rows of the fold that produced best_model

for params in ParameterGrid(param_grid):
    f1_scores = []
    current_best_model = None
    current_best_score = -np.inf
    current_best_valid_idx = None

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train, text_features=text_feature_indices)
        valid_pool = Pool(X_valid, y_valid, text_features=text_feature_indices)

        model = CatBoostClassifier(**params, class_weights=[1, 10])
        # The validation fold also drives early stopping / best-iteration
        # selection, so the fold score below is slightly optimistic.
        model.fit(train_pool, eval_set=valid_pool, verbose=False)

        preds = model.predict(X_valid)
        f1 = f1_score(y_valid, preds, average='weighted')
        f1_scores.append(f1)

        if f1 > current_best_score:
            current_best_score = f1
            current_best_model = model
            current_best_valid_idx = valid_idx

        # Only the pools are released here. `model` is simply rebound on the
        # next fold; deleting the name would leave it undefined for the
        # following cells.
        del train_pool, valid_pool
        gc.collect()

    mean_f1 = np.mean(f1_scores)
    print(f"Параметры: {params} -> Средний F1: {mean_f1:.4f}")

    if mean_f1 > best_score:
        best_score = mean_f1
        best_params = params
        best_model = current_best_model
        best_valid_idx = current_best_valid_idx

print(f"\nЛучшие параметры: {best_params}")
print(f"Лучший средний F1: {best_score:.4f}")

# Leave a consistent state for the following cells: `model` is the selected
# model and X_valid / y_valid are exactly the rows it did not train on.
model = best_model
X_valid, y_valid = X.iloc[best_valid_idx], y.iloc[best_valid_idx]

# This report covers the full sample, most of which best_model was trained on,
# so it is an optimistic sanity check rather than a generalisation estimate.
preds = best_model.predict(X)
print("\nClassification report на всей выборке:")
print(classification_report(y, preds))

ConfusionMatrixDisplay.from_estimator(best_model, X, y, cmap=plt.cm.Blues)
plt.title("Матрица ошибок")
plt.show()

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

# Use best_model: the name `model` is deleted at the end of every fold in the
# training loop and cannot be relied on here.
# NOTE(review): X_valid / y_valid are whatever split the training cell left
# behind; the threshold is only unbiased if those are rows best_model did not
# train on - verify.
probs = best_model.predict_proba(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, probs)

# Pick the threshold that maximises recall - (1 - precision).
# precision/recall have one more element than thresholds (the final point has
# no threshold), so that last point is excluded to keep the index valid.
balance = recall[:-1] - (1 - precision[:-1])
optimal_idx = np.argmax(balance)
optimal_threshold = thresholds[optimal_idx]

print(f"Оптимальный порог: {optimal_threshold}")

preds = (probs >= optimal_threshold).astype(int)
print(classification_report(y_valid, preds))

In [None]:
# Score the test set and write the submission file.
df_test = pd.read_csv('OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_test.csv')
df_test.info()

# The model must receive exactly the training features, in the training order
# (X has 'id' and the target removed and includes the count and text columns).
feature_cols = list(X.columns)
X_test = df_test[feature_cols].copy()

# Same preprocessing as for the training data: counters -> 0-filled integers,
# text columns -> cleaned text -> extracted keyword -> string dtype.
X_test[col_count] = X_test[col_count].fillna(0).astype('int64')

for col in col_text:
    cleaned_test_texts = [clean_text(t) for t in X_test[col].tolist()]
    test_keywords = []
    for chunk in chunker(cleaned_test_texts, 1000):
        test_keywords.extend(process_chunk(chunk))
    X_test[col] = test_keywords

X_test[col_text] = X_test[col_text].fillna('').astype('string')

# predict() applies the model's default decision rule; the class
# probabilities are kept for inspection.
predictions_test = best_model.predict(X_test)
probabilities_test = best_model.predict_proba(X_test)


submission = pd.DataFrame({
    'id': df_test['id'],
    'prediction': predictions_test
})

submission.to_csv('submission.csv', index=False)


print(f"Создан файл submission.csv с {len(submission)} предсказаниями")
print("Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()