In [1]:
import polars as pl
import polars.selectors as cs
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, cv
from pyod.models.cblof import CBLOF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import re
import gc
import yake
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
len_word = 10

RANDOM_STATE = 255

try:
    from catboost.utils import get_gpu_device_count
    gpu_count = get_gpu_device_count()
except ImportError:
    gpu_count = 0

if gpu_count > 0:
    print(f"GPU доступен, количество устройств: {gpu_count}")
    task_type = 'GPU'
else:
    print("GPU недоступен, используем CPU")
    task_type = 'CPU'

from joblib import Parallel, delayed
n_jobs = 3
custom_stopwords = {"для", "в", "с","при","вы","не","от","что","это","на","к",
                    'from','and','on','to','for','with','that','this','what','who'}

stop_word = stop_words.union(custom_stopwords)

chunk_size = 1000
pattern_clean = re.compile(r'[^а-яА-ЯёЁa-zA-Z0-9\s\(\)\{\}\[\]]')

GPU доступен, количество устройств: 1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rs_mi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def clean_text(text):
    if not isinstance(text, str):
        return ''
    cleaned = pattern_clean.sub('', text)
    return cleaned.lower().strip()

def extract_words(text, max_len=300, stop_words=None, top_k=20):
    
    if not isinstance(text, str) or not text.strip():
        return ''
    
    shortened = text if len(text) <= max_len else text[:max_len]
    kw_extractor = yake.KeywordExtractor(n=3,top=top_k, lan="ru", stopwords=stop_words or set())
    keywords = kw_extractor.extract_keywords(shortened)
    if not keywords:
        return ''
    return keywords[0][0]
    
def process_chunk(texts_chunk, stop_words=stop_word, top_k=len_word, n_jobs=3):
    gc.collect()
    results = Parallel(n_jobs=n_jobs)(delayed(extract_words)(text, max_len=100, stop_words=stop_words, top_k=top_k) for text in texts_chunk)
    
    gc.collect()
    return results

def chunker(seq, size):
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]

In [3]:
# Загрузка и предварительная обработка с Polars
file_path = "OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_train.csv"
df = pl.read_csv(file_path)

target = 'resolution'
col_drop = ['id','ItemID','SellerID',]
col_num = df.select(cs.numeric()).columns
col_text = ['name_rus','CommercialTypeName4','brand_name' ]

col_count = list(set(col_num)  - set(col_drop))
col_total= []
col_total.extend(col_count)
col_total.extend(col_text)

tfidf = TfidfVectorizer(max_features=200)       
scaler = StandardScaler()
df_first = df.select(col_total)

In [4]:
def prep_text(df,col_text):
   
    for col in col_text:
        df = df.with_columns([pl.col(col).fill_null("").cast(pl.Utf8).alias(col)])
        print(f"Обработка столбца {col}...")
        texts = df[col].to_list()
        cleaned_texts = [clean_text(t) for t in texts]
        if col=='name_rus':
            all_results = []
            for chunk in chunker(cleaned_texts, chunk_size):
                chunk_results = process_chunk(chunk)
                all_results.extend(chunk_results)
        else : all_results = cleaned_texts
    
        df = df.with_columns([pl.Series(all_results).alias(col)])
    return df

In [5]:
def prep_count(df,col_count):
    for col in col_count:
        df = df.with_columns(
            pl.col(col).fill_null(0).cast(pl.Int32).alias(col))
        df = df.with_columns([
            (pl.col('OrderAcceptedCountTotal30') - pl.col('OrderAcceptedCountTotal7')).alias('OrderAcceptedCountT_30_7'),
            (pl.col('OrderAcceptedCountTotal90') - pl.col('OrderAcceptedCountTotal30')).alias('OrderAcceptedCountT_90_30'),
            (pl.col('item_count_fake_returns30') - pl.col('item_count_fake_returns7')).alias('item_fake_returns_30_7'),
            (pl.col('item_count_fake_returns90') - pl.col('item_count_fake_returns30')).alias('item_fake_returns_90_30'),
            (pl.col('item_count_sales30') - pl.col('item_count_sales7')).alias('item_count_sales_30_7'),
            (pl.col('item_count_sales90') - pl.col('item_count_sales30')).alias('item_count_sales_90_30'),
            (pl.col('item_count_fake_returns30') - pl.col('item_count_fake_returns7')).alias('item_count_returns_30_7'),
            (pl.col('item_count_fake_returns90') - pl.col('item_count_fake_returns30')).alias('item_count_returns_90_30'),
            (pl.col('ExemplarReturnedCountTotal30') - pl.col('ExemplarReturnedCountTotal7')).alias('ExemplarReturnedCount30_7'),
            (pl.col('ExemplarReturnedCountTotal90') - pl.col('ExemplarReturnedCountTotal30')).alias('ExemplarReturnedCount90_30'),
            (pl.col('ExemplarReturnedValueTotal30') - pl.col('ExemplarReturnedValueTotal7')).alias('ExemplarReturnedValue30_7'),
            (pl.col('ExemplarReturnedValueTotal90') - pl.col('ExemplarReturnedValueTotal30')).alias('ExemplarReturnedValue90_30'),
            (pl.col('ExemplarAcceptedCountTotal30') - pl.col('ExemplarAcceptedCountTotal7')).alias('ExemplarAcceptedCount30_7'),
            (pl.col('ExemplarAcceptedCountTotal90') - pl.col('ExemplarAcceptedCountTotal30')).alias('ExemplarAcceptedCount90_30'),
            (pl.col('GmvTotal30') - pl.col('GmvTotal7')).alias('Gmv30_7'),
            (pl.col('GmvTotal90') - pl.col('GmvTotal30')).alias('Gmv90_30'),
        ])

    # Удаление ненужных колонок
    drop_col2 = ['OrderAcceptedCountTotal90','OrderAcceptedCountTotal30','OrderAcceptedCountTotal7',
                 'item_count_fake_returns7','item_count_fake_returns30','item_count_fake_returns90',
                 'item_count_sales7','item_count_sales90','item_count_sales30',
                 'item_count_returns30','item_count_returns90','item_count_returns7',
                 'ExemplarAcceptedCountTotal30','ExemplarAcceptedCountTotal90','ExemplarAcceptedCountTotal7',
                 'ExemplarReturnedCountTotal30','ExemplarReturnedCountTotal90','ExemplarReturnedCountTotal7',
                 'ExemplarReturnedValueTotal7','ExemplarReturnedValueTotal90','ExemplarReturnedValueTotal30',
                 'GmvTotal7','GmvTotal90','GmvTotal30']

    df = df.drop(drop_col2)
    return df

In [6]:
# Обработка текстовых колонок
df_first = prep_text(df_first,col_text)

# Обработка текстовых колонок
#display(sorted(col_count))
# Заполнение цифровых значений 
df_first = prep_count(df_first,col_count)
gc.collect()
df_pd = df_first.to_pandas()
#df_pd.info()

Обработка столбца name_rus...
Обработка столбца CommercialTypeName4...
Обработка столбца brand_name...


In [7]:
#  CBLOF # Добавляем аномалии как отдельный числовой признак

def add_CBLOF(df,col_count,col_text):
    X_full = df[col_count]
    for tc in  col_text:
        text_features = tfidf.fit_transform(df[tc]).toarray()    
        X_full = np.hstack([X_full, text_features])

    cblof = CBLOF(contamination=0.1, n_clusters=4, alpha=0.9, beta=5)#, use_weights=False)
    cblof.fit(X_full)
    anom_preds = cblof.predict(X_full)  # 0 - нормальный, 1 - аномалия
    
    return anom_preds.reshape(-1,1)


In [8]:
col_num2 = df_pd.select_dtypes(include=[np.number]).columns.tolist()
#display(col_num2)
df_pd['cblof'] = add_CBLOF(df_pd,col_num2,col_text)

for nc in  col_num2 :    
    df_pd[nc] = scaler.fit_transform(df_pd[[nc]]).ravel()
    

df_pd.to_csv('df_first.csv', index=False)
#df_pd.info()

In [9]:
X = df_pd.drop(target,axis=1 ).copy()
y = df[target].to_pandas()
gc.collect() 
text_feature_indices = [X.columns.get_loc(c) for c in col_text]
print(text_feature_indices)
#X.info() 


[13, 14, 15]


In [None]:
# Кросс-валидация CatBoost с параметрами старый вариант 
gc.collect() 
param_grid = {
    'depth': [12,14],
    'learning_rate': [0.1],
    'l2_leaf_reg': [1],
    'iterations': [300],
    'task_type': [task_type],
    'gpu_ram_part': [0.8],
    'loss_function': ['Logloss'],
    #'custom_loss': ['AUC'],
    'eval_metric': ['F1'],
    'custom_metric':['Recall'],
    'random_seed': [1],
    'early_stopping_rounds': [5],
    'use_best_model': [True],
    'verbose': [1],
    'thread_count': [3],
    'allow_writing_files':[False],
    'used_ram_limit': ['8gb']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_score = -np.inf
best_params = None
best_model = None

for params in ParameterGrid(param_grid):
    f1_scores = []
    current_best_model = None
    current_best_score = -np.inf

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
           
        train_pool = Pool(X_train, y_train, text_features=text_feature_indices)
        valid_pool = Pool(X_valid, y_valid, text_features=text_feature_indices)        
        
        model = CatBoostClassifier(**params,class_weights=[1, 14], text_features=text_feature_indices) #,class_weights=[1, 14]

        model.fit(train_pool, eval_set=valid_pool, verbose=False, early_stopping_rounds=10)             
            
        
        preds = model.predict(valid_pool)
        f1 = f1_score(y_valid, preds, average='macro')
        f1_scores.append(f1)

        if f1 > current_best_score:
            current_best_score = f1
            current_best_model = model

        del train_pool, valid_pool
        gc.collect()

    mean_f1 = np.mean(f1_scores)
    print(f"Параметры: {params} -> Средний F1: {mean_f1:.4f}")

    if mean_f1 > best_score:
        best_score = mean_f1
        best_params = params
        best_model = current_best_model
    
    gc.collect()    
   

Параметры: {'allow_writing_files': False, 'custom_metric': 'Recall', 'depth': 12, 'early_stopping_rounds': 5, 'eval_metric': 'F1', 'gpu_ram_part': 0.8, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'loss_function': 'Logloss', 'random_seed': 1, 'task_type': 'GPU', 'thread_count': 3, 'use_best_model': True, 'used_ram_limit': '8gb', 'verbose': 1} -> Средний F1: 0.8242


In [2]:
print(f"\nЛучшие параметры: {best_params}")
print(f"Лучший средний F1: {best_score:.4f}")
print(f"\nЛучшая модель: {best_model}")

preds = best_model.predict(X)
print("\nClassification report на всей выборке:")
print(classification_report(y, preds))

ConfusionMatrixDisplay.from_estimator(best_model, X, y, cmap=plt.cm.Blues)
plt.title("Матрица ошибок")
plt.show()

NameError: name 'params' is not defined

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

probs = model.predict_proba(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, probs)

# Найти порог с балансом между precision и recall
optimal_idx = np.argmax(recall - (1 - precision))
optimal_threshold = thresholds[optimal_idx]

print(f"Оптимальный порог: {optimal_threshold}")

preds = (probs >= optimal_threshold).astype(int)
print(classification_report(y_valid, preds))

In [None]:
df_test = pl.read_csv('OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_test.csv')
#df_test.info()


# Обработка текстовых колонок
df_test = prep_text(df_test,col_text)

# Заполнение цифровых значений 
df_test = prep_count(df_test,col_count)

gc.collect()
df_test_pd = df_test.to_pandas()

col_num2 = df_test.select_dtypes(include=[np.number]).columns.tolist()

df_pd['cblof'] = add_CBLOF(df_test_pd,col_num2,col_text)

for nc in  col_num2 :    
    df_test_pd[nc] = scaler.fit_transform(df_test_pd[[nc]]).ravel()    




In [None]:
predictions_test = best_model.predict(df_test_pd)
probabilities_test = best_model.predict_proba(df_test_pd)


submission = pd.DataFrame({
    'id': df_test['id'],
    'prediction': predictions_test
})

submission.to_csv('submission_co.csv', index=False)


print(f"Создан файл submission_co.csv с {len(submission)} предсказаниями")
print(f"Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()