In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 500
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, TransformerMixin
import streamlit as st
import joblib

In [5]:
class MixedCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode object-dtype columns with one-hot or mean-target encoding.

    For every object-dtype column seen in ``fit``:

    * if its number of distinct non-null values is below ``threshold`` the
      column is one-hot encoded (first level dropped);
    * otherwise each category is replaced by the mean of ``y`` observed for
      that category in the fitting data.

    Parameters
    ----------
    threshold : int, default=3
        Columns with ``nunique() < threshold`` are one-hot encoded; all other
        object columns are target encoded.

    Attributes
    ----------
    one_hot_cols : list of str
        Columns selected for one-hot encoding by the last ``fit``.
    one_hot_categories : dict
        Column name -> list of category levels learned in ``fit``. Used so
        that ``transform`` always emits the same dummy columns.
    target_encoding_maps : dict
        Column name -> ``{category: mean target}`` learned in ``fit``.
    target : str
        Name of the target (``y.name`` when available, else ``'target'``).
    """

    def __init__(self, threshold=3):
        self.threshold = threshold
        # Learned state is also initialised here so an unfitted instance
        # still exposes these attributes (transform is then a plain copy).
        self.one_hot_cols = []
        self.one_hot_categories = {}
        self.target_encoding_maps = {}

    def fit(self, X, y):
        """Learn the encoding for every object-dtype column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; only object-dtype columns are inspected.
        y : array-like of shape (len(X),)
            Target values, matched to the rows of ``X`` by position.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not have one value per row of ``X``.
        """
        # Start from a clean slate: without this, refitting would append
        # duplicate column names and keep stale maps from a previous fit.
        self.one_hot_cols = []
        self.one_hot_categories = {}
        self.target_encoding_maps = {}

        target_name = getattr(y, 'name', None)
        self.target = 'target' if target_name is None else target_name

        # Work positionally so that unnamed Series and plain arrays are
        # accepted, and a target name equal to a feature name cannot clash.
        y_values = np.asarray(y)
        if len(y_values) != len(X):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y_values)}"
            )

        for col in X.select_dtypes(include='object').columns:
            if X[col].nunique() < self.threshold:
                self.one_hot_cols.append(col)
                # Same level ordering get_dummies derives for an object column.
                self.one_hot_categories[col] = list(pd.Categorical(X[col]).categories)
            else:
                pairs = pd.DataFrame({'category': X[col].to_numpy(), 'target': y_values})
                self.target_encoding_maps[col] = (
                    pairs.groupby('category')['target'].mean().to_dict()
                )
        return self

    def transform(self, X):
        """Apply the learned encodings and return a new DataFrame.

        One-hot columns are replaced by dummy columns appended at the end of
        the frame; target-encoded columns are replaced in place. Categories
        not seen during ``fit`` become an all-zero dummy row or the value 0.
        """
        X_new = X.copy()
        # Encoders pickled before `one_hot_categories` existed lack the
        # attribute; fall back to data-derived dummies for them.
        learned_levels = getattr(self, 'one_hot_categories', {})

        for col in self.one_hot_cols:
            values = X_new[col]
            if col in learned_levels:
                # Fix the levels to those seen in fit so every call yields the
                # same dummy columns and drops the same baseline level.
                values = pd.Series(
                    pd.Categorical(values, categories=learned_levels[col]),
                    index=X_new.index,
                )
            dummies = pd.get_dummies(values, prefix=col, drop_first=True, dtype=int)
            X_new = X_new.drop(col, axis=1)
            X_new = pd.concat([X_new, dummies], axis=1)

        for col, mapping in self.target_encoding_maps.items():
            # Categories missing from the map are encoded as 0.
            X_new[col] = X_new[col].map(mapping).fillna(0)

        return X_new

In [6]:
# Raw string literal: the Windows path is full of backslashes that a normal
# literal would parse as (invalid) escape sequences. The path value itself is
# unchanged.
df = pd.read_excel(r'O:\Планирование\Папки сотрудников\Колчак\Bi\Данные для ВКР.xlsx')

In [7]:
# Remove columns that are not used as features further down.
df = df.drop(
    columns=[
        'атрибут5',
        'атрибут3',
        'Перенос',
        'Модель',
        'Полугодие',
        'Квартал',
    ]
)

In [8]:
# Use one placeholder, 'n/a', for missing values in every object-dtype column:
# both real NaNs and the literal string '0' are mapped to it.
# select_dtypes only inspects dtypes, so it lists the columns without
# computing summary statistics and is a no-op when there are no such columns.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna('n/a').replace('0', 'n/a')

In [9]:
# Derive the binary 'target' column from the 'Лидерство' label.
# Rows whose label is in neither list are not assigned here.
leadership = df['Лидерство']
positive_labels = ['Выше плана', 'Лидер', 'Остальные']
negative_labels = ['Ниже плана', 'Анти-лидер']
df.loc[leadership.isin(positive_labels), 'target'] = 1
df.loc[leadership.isin(negative_labels), 'target'] = 0

In [10]:
# Drop the source label column and two other columns so they are not used
# as features.
df = df.drop(
    columns=[
        'признак успешности модели',
        'Лидерство',
        'коэффициент успешности модели',
    ]
)

In [11]:
# Separate the label vector from the feature matrix.
Y = df['target']
X = df.drop(columns=['target'])

In [12]:
# Hold out 20% of the rows for evaluation. The seed is fixed (same value as
# the classifier below) so the split, and therefore the fitted encoder,
# scaler, model and reported metrics, are reproducible on Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [13]:
# Feature columns passed through the categorical encoder.
categorical_cols = [
    'Товарная категория',
    'Товарная группа',
    'Целевая группа',
    'Ассортимент',
    'Производство обобщенное',
    'атрибут1',
    'атрибут2',
    'атрибут4',
    'Страна оригинала',
    'Страна производства',
    'Тип ткани',
    'Цвет',
    'Однотонность',
    'Элементы дизайна',
    'Посадка',
    'Модность',
    'Тип продукта',
    'Коллекция',
    'Атрибут цены',
    'МЕСЯЦ PMM',
]

# Feature columns used as-is (numeric).
numeric_cols = [
    'Кол-во размеров',
    'Себестоимость',
    'Цена розничная',
    'IMU',
    'Год',
    'НЕДЕЛЯ PMM',
    'Заказ',
    'Глубина на модель',
]

In [14]:
# Fit the categorical encoder on the training split only.
# With threshold=1 the one-hot branch (nunique() < threshold) can only trigger
# for a column with no non-null values, so columns are target-encoded.
encoder = MixedCategoricalEncoder(threshold=1)
encoder.fit(X=X_train[categorical_cols], y=Y_train)

In [15]:
# Apply the fitted encoder to the categorical columns of both splits.
X_train_cat, X_test_cat = (
    encoder.transform(split[categorical_cols])
    for split in (X_train, X_test)
)

In [16]:
# Put numeric and encoded categorical features side by side for each split.
# Both parts get a fresh 0..n-1 index so the column-wise concat pairs rows by
# position.
X_train_full, X_test_full = (
    pd.concat(
        [
            split[numeric_cols].reset_index(drop=True),
            encoded.reset_index(drop=True),
        ],
        axis=1,
    )
    for split, encoded in ((X_train, X_train_cat), (X_test, X_test_cat))
)

In [17]:
# Fit the scaler on the training features only, then apply the same fitted
# scaler to both splits.
scaler = StandardScaler().fit(X_train_full)
X_train_scaled = scaler.transform(X_train_full)
X_test_scaled = scaler.transform(X_test_full)

In [18]:
# Train the ExtraTrees classifier on the scaled training features.
et = ExtraTreesClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, Y_train)

# Predicted labels and the second predict_proba column for the test split.
y_pred_et = et.predict(X_test_scaled)
y_proba_et = et.predict_proba(X_test_scaled)[:, 1]

report_et = classification_report(Y_test, y_pred_et, digits=3)
print("ExtraTreesClassifier:\n", report_et)

ExtraTreesClassifier:
               precision    recall  f1-score   support

         0.0      0.831     0.879     0.854      8689
         1.0      0.819     0.753     0.785      6315

    accuracy                          0.826     15004
   macro avg      0.825     0.816     0.820     15004
weighted avg      0.826     0.826     0.825     15004



In [24]:
# Persist the fitted classifier to disk.
joblib.dump(value=et, filename="C:\\Repoz\\Kolchak\\et.pkl")

['C:\\Repoz\\Kolchak\\et.pkl']

In [20]:
# Persist the fitted scaler to disk.
joblib.dump(value=scaler, filename="C:\\Repoz\\Kolchak\\scaler.pkl")

['C:\\Repoz\\Kolchak\\scaler.pkl']

In [21]:
# Persist the fitted categorical encoder to disk.
joblib.dump(value=encoder, filename="C:\\Repoz\\Kolchak\\encoder.pkl")

['C:\\Repoz\\Kolchak\\encoder.pkl']

In [22]:
# Preview the first rows of the encoded categorical training features.
X_train_cat.head(n=5)

Unnamed: 0,Товарная категория,Товарная группа,Целевая группа,Ассортимент,Производство обобщенное,атрибут1,атрибут2,атрибут4,Страна оригинала,Страна производства,Тип ткани,Цвет,Однотонность,Элементы дизайна,Посадка,Модность,Тип продукта,Коллекция,Атрибут цены,МЕСЯЦ PMM
46780,0.468691,0.427835,0.389118,0.390087,0.495462,0.537984,0.399093,0.47449,0.530112,0.393069,0.421644,0.391026,0.400147,0.508585,0.524158,0.405167,0.530954,0.407971,0.431497,0.333932
57354,0.414141,0.378689,0.411948,0.451565,0.387008,0.433325,0.469718,0.533898,0.461385,0.387753,0.436901,0.372093,0.471551,0.508585,0.425138,0.405167,0.398268,0.407971,0.431497,0.333932
43373,0.468691,0.427835,0.389118,0.390087,0.495462,0.537984,0.399093,0.47449,0.354747,0.393069,0.421644,0.453455,0.400147,0.508585,0.524158,0.405167,0.394162,0.407971,0.431497,0.333932
7500,0.466867,0.548292,0.445789,0.451565,0.387008,0.433325,0.469718,0.502846,0.461385,0.387753,0.542545,0.453455,0.400147,0.479167,0.425138,0.405167,0.48999,0.431087,0.383798,0.301202
55606,0.575949,0.548292,0.477205,0.451565,0.387008,0.433325,0.504429,0.442747,0.354747,0.387753,0.367568,0.506057,0.400147,0.380849,0.425138,0.522854,0.414286,0.403761,0.431497,0.449911
