In [2]:
!pip install --use-pep517 suod 



In [None]:
!conda install threadpoolctl

In [1]:
import polars as pl
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from pyod.models.cblof import CBLOF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import re
import gc
import yake
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
len_word = 10

RANDOM_STATE = 255

try:
    from catboost.utils import get_gpu_device_count
    gpu_count = get_gpu_device_count()
except ImportError:
    gpu_count = 0

if gpu_count > 0:
    print(f"GPU доступен, количество устройств: {gpu_count}")
    task_type = 'GPU'
else:
    print("GPU недоступен, используем CPU")
    task_type = 'CPU'

from joblib import Parallel, delayed
n_jobs = 3
custom_stopwords = {"для", "в", "с","при","вы","не","от","что","это","на","к",
                    'from','and','on','to','for','with','that','this','what','who'}

stop_word = stop_words.union(custom_stopwords)

GPU доступен, количество устройств: 1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rs_mi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def clean_text(text):
    if not isinstance(text, str):
        return ''
    cleaned = re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9\s\(\)\{\}\[\]]', '', text)
    return cleaned.lower().strip()

def extract_words(text, max_len=100, stop_words=None, top_k=20):
    
    if not isinstance(text, str) or not text.strip():
        return ''
    
    shortened = text if len(text) <= max_len else text[:max_len]
    kw_extractor = yake.KeywordExtractor(n=5,top=top_k, lan="ru", stopwords=stop_words or set())
    keywords = kw_extractor.extract_keywords(shortened)
    if not keywords:
        return ''
    return keywords[0][0]
    
def process_chunk(texts_chunk, stop_words=stop_word, top_k=len_word, n_jobs=3):
    gc.collect()
    results = Parallel(n_jobs=n_jobs)(delayed(extract_words)(text, max_len=100, stop_words=stop_words, top_k=top_k) for text in texts_chunk)
    
    gc.collect()
    return results

def chunker(seq, size):
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]

In [3]:
# Загрузка и предварительная обработка с Polars
file_path = "OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_train.csv"
df = pl.read_csv(file_path)

target = 'resolution'
col_imp = ['id','resolution','PriceDiscounted','item_time_alive','seller_time_alive']
col_count = ['videos_published_count','photos_published_count','comments_published_count']
col_num = ['PriceDiscounted','item_time_alive','seller_time_alive','videos_published_count','photos_published_count','comments_published_count']
col_text = ['name_rus','CommercialTypeName4','brand_name' ]
col_total = col_imp + col_count+col_text

df_first = df.select(col_total)

# Заполнение пропусков (Polars позволяет делать это эффективно)
for col in col_count:
    df_first = df_first.with_columns([pl.col(col).fill_null(0).cast(pl.Int32).alias(col)])

for col in col_text:
    df_first = df_first.with_columns([pl.col(col).fill_null("").cast(pl.Utf8).alias(col)])

custom_stopwords = {"для", "в", "с","при","вы","не","от","что","это","на","к",
                    'from','and','on','to','for','with','that','this','what','who'}

stop_word = stop_words.union(custom_stopwords)

chunk_size = 1000

# Обработка текстовых колонок
for col in col_text:
    print(f"Обработка столбца {col}...")
    texts = df_first[col].to_list()
    cleaned_texts = [clean_text(t) for t in texts]
    all_results = []
    for chunk in chunker(cleaned_texts, chunk_size):
        chunk_results = process_chunk(chunk)
        all_results.extend(chunk_results)
    #df_first = df_first.with_column(pl.Series(name=col, values=all_results))
    df_first = df_first.with_columns([pl.Series(all_results).alias(col)])

gc.collect()
df_pd = df_first.to_pandas()
df_pd.to_csv('df_first.csv', index=False)

Обработка столбца name_rus...
Обработка столбца CommercialTypeName4...
Обработка столбца brand_name...


In [7]:
# Конвертация Polars DataFrame в Pandas для CatBoost И CBLOF
#df_pd = df_first.to_pandas()

tfidf = TfidfVectorizer(max_features=300)       
scaler = StandardScaler()
# TF-IDF преобразование текстового столбца
X_full = df_pd[col_num].copy()

for nc in  col_num :    
    X_full[nc] = scaler.fit_transform(X_full[[nc]]).ravel()
    
for tc in  col_text:
    text_features = tfidf.fit_transform(df_pd[tc]).toarray()
    #print(text_features)
    X_full = np.hstack([X_full, text_features])

cblof = CBLOF(contamination=0.1, n_clusters=4, alpha=0.9, beta=5, use_weights=False)

cblof.fit(X_full)
anom_preds = cblof.predict(X_full)  # 0 - нормальный, 1 - аномалия

# Добавляем аномалии как отдельный числовой признак
X_full_with_anom = np.hstack([X_full, anom_preds.reshape(-1,1)])


np.savetxt('X_full_with_anom.txt', X_full_with_anom, delimiter=',')
#X_full_with_anom.to_csv('X_full_with_anom.csv', index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [None]:
X = X_full_with_anom
y = df_pd[target]
gc.collect()
#text_feature_indices = [X.columns.get_loc(c) for c in col_text]

# Кросс-валидация CatBoost с параметрами
param_grid = {
    'depth': [10,12],
    'learning_rate': [0.1],
    'l2_leaf_reg': [1],
    'iterations': [300],
    'task_type': [task_type],
    'gpu_ram_part': [0.8],
    'loss_function': ['Logloss'],
    'eval_metric': ['F1'],
    'custom_metric':['Recall'],
    'random_seed': [1],
    'early_stopping_rounds': [3],
    'use_best_model': [True],
    'verbose': [1],
    'thread_count': [3],
    'allow_writing_files':[False],
    'used_ram_limit': ['8gb']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
best_score = -np.inf
best_params = None
best_model = None

for params in ParameterGrid(param_grid):
    f1_scores = []
    current_best_model = None
    current_best_score = -np.inf

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
           
        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)        
        
        model = CatBoostClassifier(**params,class_weights=[1, 10])

        model.fit(train_pool, eval_set=valid_pool, verbose=False, early_stopping_rounds=30)             
            
        
        preds = model.predict(valid_pool)
        f1 = f1_score(y_valid, preds, average='macro')
        f1_scores.append(f1)

        if f1 > current_best_score:
            current_best_score = f1
            current_best_model = model

        del train_pool, valid_pool
        gc.collect()

    mean_f1 = np.mean(f1_scores)
    print(f"Параметры: {params} -> Средний F1: {mean_f1:.4f}")

    if mean_f1 > best_score:
        best_score = mean_f1
        best_params = params
        best_model = current_best_model
    
    gc.collect()    
   

Параметры: {'allow_writing_files': False, 'custom_metric': 'Recall', 'depth': 10, 'early_stopping_rounds': 3, 'eval_metric': 'F1', 'gpu_ram_part': 0.8, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'loss_function': 'Logloss', 'random_seed': 1, 'task_type': 'GPU', 'thread_count': 3, 'use_best_model': True, 'used_ram_limit': '8gb', 'verbose': 1} -> Средний F1: 0.8238


In [None]:
 
print(f"\nЛучшие параметры: {best_params}")
print(f"Лучший средний F1: {best_score:.4f}")
print(f"\nЛучшая модель: {best_model}")

preds = best_model.predict(X)
print("\nClassification report на всей выборке:")
print(classification_report(y, preds))

ConfusionMatrixDisplay.from_estimator(best_model, X, y, cmap=plt.cm.Blues)
plt.title("Матрица ошибок")
plt.show()

In [None]:
df_test = pd.read_csv('OZON/ml_ozon_сounterfeit_data/ml_ozon_сounterfeit_test.csv')
df_test.info()

col_imp = ['id','PriceDiscounted','item_time_alive','seller_time_alive']
col_count = ['videos_published_count','photos_published_count','comments_published_count']
col_num = ['PriceDiscounted','item_time_alive','seller_time_alive','videos_published_count','photos_published_count','comments_published_count']
col_text = ['name_rus','CommercialTypeName4','brand_name' ]
col_total = col_imp + col_count+col_text

df_test1 = df_test.select(col_total)

# Заполнение пропусков (Polars позволяет делать это эффективно)
for col in col_count:
    df_test1 = df_test1.with_columns([pl.col(col).fill_null(0).cast(pl.Int32).alias(col)])

for col in col_text:
    df_test1 = df_test1.with_columns([pl.col(col).fill_null("").cast(pl.Utf8).alias(col)])



# Обработка текстовых колонок
for col in col_text:
    print(f"Обработка столбца {col}...")
    texts = df_test1[col].to_list()
    cleaned_texts = [clean_text(t) for t in texts]
    all_results = []
    for chunk in chunker(cleaned_texts, chunk_size):
        chunk_results = process_chunk(chunk)
        all_results.extend(chunk_results)
    #df_first = df_first.with_column(pl.Series(name=col, values=all_results))
    df_test1 = df_test1.with_columns([pl.Series(all_results).alias(col)])

gc.collect()

X_test_f = df_test1.to_pandas()
X_test = X_test_f[col_num].copy()
for nc in  col_num :    
    X_test[nc] = scaler.fit_transform(X_test[nc])
    
for tc in  col_text:
    text_features = tfidf.fit_transform(X_test_f[tc]).toarray()
    #print(text_features)
    X_test = np.hstack([X_test, text_features])

cblof.fit(X_test)
anom_preds = cblof.predict(X_full)  # 0 - нормальный, 1 - аномалия

# Добавляем аномалии как отдельный числовой признак
X_test_with_anom = np.hstack([X_test, anom_preds.reshape(-1,1)])





predictions_test = best_model.predict(X_test_with_anom)
probabilities_test = best_model.predict_proba(X_test_with_anom)


submission = pd.DataFrame({
    'id': df_test['id'],
    'prediction': predictions_test
})

submission.to_csv('submission_co.csv', index=False)


print(f"Создан файл submission_co.csv с {len(submission)} предсказаниями")
print(f"Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()