In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
# import optuna


pd.set_option('display.max_columns', None)

In [5]:
rfm_df = pd.read_excel('rfm_df.xlsx')
rfm_df

Unnamed: 0,Телефон_new,quarter,total_orders,unique_sku,avg_sku_per_order,avg_check,med_check,avg_sku_check,med_sku_check,mean_days_between_orders,median_days_between_orders,total_margin,newyear_flag,weekend_order_share,peak_month_ratio,defect_ratio,delivery_method,good_type,Geo,ДЕТСКОЕ ПИТАНИЕ,ЖЕНСКИЕ ШТУЧКИ,ИГРУШКИ,"КАНЦТОВАРЫ, КНИГИ, ДИСКИ",КОСМЕТИКА/ГИГИЕНА,КРУПНОГАБАРИТНЫЙ ТОВАР,ОБУВЬ,ПОДГУЗНИКИ,СОПУТСТВУЮЩИЕ ТОВАРЫ,"ТЕКСТИЛЬ, ТРИКОТАЖ",ТЕХНИКА И ТОВАРЫ ДЛЯ ДОМА,ТОВАРЫ ДЛЯ ЖИВОТНЫХ,ТОВАРЫ ДЛЯ КОРМЛЕНИЯ,R_Score,F_Score,M_Score,RFM_Segment,segment_group,cluster_kmeans,cluster_gmm
0,55525753-50494856495470,2017Q2,1,2,1.00,4809.000000,4809.0,2404.500000,2404.5,0.0,0.0,600.00,0,0.0,0.0,0.500000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,2,312,Freshers,12,0
1,55525753-50535655495477,2017Q1,1,2,1.00,900.000000,900.0,300.000000,300.0,0.0,0.0,138.00,0,1.0,1.0,0.600000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,311,Freshers,5,5
2,55525753-53565350484974,2017Q1,2,3,1.00,2999.333333,3300.0,912.666667,1100.0,15.0,15.0,579.08,0,0.0,0.0,0.000000,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,1,2,2,122,At Churn Risk,5,5
3,55525753-54565357505774,2017Q2,1,3,1.00,3880.000000,3880.0,125.333333,122.0,0.0,0.0,444.55,0,0.0,0.0,0.400000,5,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,3,1,2,312,Freshers,12,4
4,55525753-55515454494872,2017Q3,1,6,1.00,3668.000000,3668.0,346.666667,199.5,0.0,0.0,628.40,0,0.0,0.0,0.538462,0,0,0,3,0,0,0,0,1,0,1,0,0,0,0,0,3,1,2,312,Freshers,3,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231695,57485355-52485257575698,2017Q1,1,1,1.00,2076.000000,2076.0,2076.000000,2076.0,0.0,0.0,238.00,0,0.0,0.0,0.000000,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,1,1,311,Freshers,5,5
231696,57495351-50535151545597,2017Q3,1,6,1.00,1827.000000,1827.0,523.000000,161.0,12.0,0.0,343.63,0,1.0,0.0,0.000000,1,0,0,0,0,2,0,1,0,0,0,0,0,0,2,0,2,1,2,212,Sleeping,0,0
231697,57505754-52565350494991,2017Q1,1,2,1.00,8303.000000,8303.0,4151.500000,4151.5,0.0,0.0,954.22,0,0.0,0.0,0.333333,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,1,3,313,Freshers,5,5
231698,57545549-57544957525799,2017Q2,1,1,1.00,2560.000000,2560.0,2560.000000,2560.0,0.0,0.0,294.00,0,1.0,0.0,0.909091,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,1,2,312,Freshers,1,0


------------------
# preprocessing

In [6]:
# Short Preprocessing 

rfm_model_df = rfm_df[rfm_df['quarter'] != '2017Q1'].copy()
rfm_model_df = rfm_model_df.dropna(subset=['segment_group'])

In [7]:
le = LabelEncoder()
rfm_model_df['segment_encoded'] = le.fit_transform(rfm_model_df['segment_group'])

In [8]:
drop_cols = ['Телефон_new', 'quarter', 
             'R_Score', 'F_Score', 'M_Score', 
             'RFM_Segment', 'segment_group', 
             'total_margin', 'total_orders'
            ]
feature_cols = [col for col in rfm_model_df.columns if col not in drop_cols + ['segment_encoded']]

X = rfm_model_df[feature_cols].select_dtypes(include=['number'])
y = rfm_model_df['segment_encoded']

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

------------------------
# svm

In [79]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [83]:
# Обучаем SVM
svm_model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)

In [86]:
y_pred = svm_model.predict(X_test)


In [87]:
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Classification Report:
                precision    recall  f1-score   support

At Churn Risk       0.92      0.82      0.87      2876
     Drifting       0.74      0.76      0.75      3510
     Freshers       0.87      0.96      0.91      8009
         Lost       0.96      0.97      0.97      8584
        Loyal       0.71      0.57      0.63      2207
     Sleeping       0.83      0.84      0.83      7620
          VIP       0.64      0.43      0.52      1118

     accuracy                           0.86     33924
    macro avg       0.81      0.77      0.78     33924
 weighted avg       0.86      0.86      0.86     33924



--------------
# MLP

In [11]:
from sklearn.neural_network import MLPClassifier

In [96]:
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

In [97]:
print(classification_report(y_test, y_pred_mlp, target_names=le.classes_))

               precision    recall  f1-score   support

At Churn Risk       0.93      0.97      0.95      2876
     Drifting       0.87      0.90      0.88      3510
     Freshers       0.98      0.97      0.98      8009
         Lost       0.99      0.98      0.98      8584
        Loyal       0.80      0.83      0.82      2207
     Sleeping       0.94      0.93      0.93      7620
          VIP       0.74      0.72      0.73      1118

     accuracy                           0.94     33924
    macro avg       0.89      0.90      0.90     33924
 weighted avg       0.94      0.94      0.94     33924



----------------
# NN with coordinate_descent tuning

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [None]:
def coordinate_descent_nn(X, y, param_grid, n_splits=3):
    best_score = 0
    best_params = {}

    for hl in param_grid['hidden_layer_sizes']:
        for alpha in param_grid['alpha']:
            scores = []
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
            for train_idx, val_idx in skf.split(X, y):
                X_train_cv, X_val_cv = X[train_idx], X[val_idx]
                y_train_cv, y_val_cv = y[train_idx], y[val_idx]

                model = MLPClassifier(
                            hidden_layer_sizes=hl,
                            alpha=alpha,
                            max_iter=100,
                            early_stopping=True,
                            n_iter_no_change=5,
                            random_state=42
                        )
                model.fit(X_train_cv, y_train_cv)
                y_pred = model.predict(X_val_cv)

                score = f1_score(y_val_cv, y_pred, average='macro')
                scores.append(score)

            avg_score = np.mean(scores)
            print(f"Params that have been tested: hidden_layer_sizes={hl}, alpha={alpha}, avg_f1={avg_score:.4f}")

            if avg_score > best_score:
                best_score = avg_score
                best_params = {'hidden_layer_sizes': hl, 'alpha': alpha}

    return best_params, best_score

In [12]:
X_train_np = np.asarray(X_train)
y_train_np = np.asarray(y_train)

params, score = coordinate_descent_nn(X_train_np, y_train_np, {
    'hidden_layer_sizes': [(64,), (64, 32)],
    'alpha': [0.0001, 0.01]
})


Params that have been tested: hidden_layer_sizes=(64,), alpha=0.0001, avg_f1=0.8532
Params that have been tested: hidden_layer_sizes=(64,), alpha=0.01, avg_f1=0.8536
Params that have been tested: hidden_layer_sizes=(64, 32), alpha=0.0001, avg_f1=0.8605
Params that have been tested: hidden_layer_sizes=(64, 32), alpha=0.01, avg_f1=0.8593


In [13]:
print(params)
print(f"Best F1-score (macro): {score:.4f}")

{'hidden_layer_sizes': (64, 32), 'alpha': 0.0001}
Best F1-score (macro): 0.8605


In [14]:
mlp_b = MLPClassifier(hidden_layer_sizes=(64, 32), 
                    alpha=0.0001,
                    max_iter=100,
                    early_stopping=True,
                    n_iter_no_change=5,
                    random_state=42
                    )
mlp_b.fit(X_train, y_train)
y_pred_mlp = mlp_b.predict(X_test)

In [15]:
print(classification_report(y_test, y_pred_mlp, target_names=le.classes_))

               precision    recall  f1-score   support

At Churn Risk       0.93      0.93      0.93      2876
     Drifting       0.85      0.84      0.84      3510
     Freshers       0.95      0.98      0.96      8009
         Lost       0.98      0.98      0.98      8584
        Loyal       0.79      0.75      0.77      2207
     Sleeping       0.91      0.92      0.91      7620
          VIP       0.72      0.64      0.68      1118

     accuracy                           0.92     33924
    macro avg       0.88      0.86      0.87     33924
 weighted avg       0.92      0.92      0.92     33924



In [None]:
X_train_np = np.asarray(X_train)
y_train_np = np.asarray(y_train)

params, score = coordinate_descent_nn(X_train_np, y_train_np, {
    'hidden_layer_sizes': [(128, 64), (64,), (64, 32)],
    'alpha': [0.0001, 0.01]
})
