In [None]:
!pip install pandas numpy scipy matplotlib seaborn scikit-learn keras tensorflow

# Исследование предсказуемости потребительского поведения по оценке Колмогоровской сложности

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.linalg import hankel

import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dropout, Dense

from sklearn.model_selection import train_test_split

In [30]:
from lempel_ziv import *
from huffman import *

In [31]:
data = pd.read_csv('data/transact_18_22.csv', parse_dates=[2])
data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
0,390989,3048567,2018-01-01,878.0,5814,food,survival
1,1309946,3226559,2018-01-01,249.0,5411,food,survival
2,1386278,2715990,2018-01-01,50.0,5499,food,survival


## Предварительная обработка данных

In [32]:
filtered_data = data[(data['date'] >= '2019-01-15') & (data['date'] <= '2019-12-14')]
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [33]:
filtered_data['value'].unique()

array(['survival', 'socialization', 'money', 'self_realization', '7372',
       '4411', '2741', '5681', '6211', '8249', '5094', '5983', '7278',
       '7699', '5734', '7210', '7998', '7393', '5310', '3008', '5074',
       '7216', '7251', '5950', '5713', '7629', '5047', '5733', '7929',
       '7394', '3301', '5111', '3211', '5946', '8351', '5697', '5611',
       '5131', '7622', '5976', '5970', '7379', '5655', '6513', '4214',
       '5967', '5122', '5521', '5932', '5065', '5044', '5013', '7375',
       '5811', '5933', '5099', '8111', '5072', '8661', '5300', '5532',
       '7395', '8641', '5818', '5994', '8050', '7535', '5971', '7338',
       '5045', '7996', '5172', '5051', '5817', '7333', '4582', '3010',
       '5021', '5137', '7911', '7392', '3586', '7933', '5231', '5718',
       '780', '7297', '7032', '5085', '1711', '8211', '1799', '7534',
       '8699', '5046', '5940', '1731', '7829', '3553', '4119', '3042',
       '5960', '3005', '5169', '5972', '5571', '3533', '3831', '7273',
     

In [34]:
values_to_keep = ['socialization', 'survival', 'money', 'self_realization']
filtered_data = filtered_data[filtered_data['value'].isin(values_to_keep)]

In [35]:
filtered_data['group'].unique()

array(['food', 'nonfood', 'money', 'outfit', 'travel', 'health', 'fun',
       'dwelling', 'beauty', 'telecom', 'kids', 'misc', 'remote',
       'charity'], dtype=object)

In [36]:
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [37]:
def binary_to_hex(row):
    binary_string = ''.join(map(str, row))
    decimal_value = int(binary_string, 2)
    return hex(decimal_value)[2:]

In [38]:
grouped = filtered_data.groupby(['client', 'date'])
new_data = []
for (client, date), group in grouped:
    row = {'client': client, 'date': date}
    for val in values_to_keep:
        row[val] = 1 if val in group['value'].values else 0  
    new_data.append(row)

new_df = pd.DataFrame(new_data)
new_df['code'] = new_df[['socialization', 'survival', 'money', 'self_realization']].apply(binary_to_hex, axis=1)

In [39]:
new_df.head(10)

Unnamed: 0,client,date,socialization,survival,money,self_realization,code
0,224,2019-01-15,1,0,0,0,8
1,224,2019-01-16,1,1,0,1,d
2,224,2019-01-17,1,1,0,0,c
3,224,2019-01-18,1,0,0,0,8
4,224,2019-01-19,1,0,0,1,9
5,224,2019-01-21,1,1,0,1,d
6,224,2019-01-22,1,0,0,0,8
7,224,2019-01-23,1,0,0,0,8
8,224,2019-01-24,1,0,0,0,8
9,224,2019-01-25,1,0,0,0,8


## Алгоритм оценки Колмогоровской сложности 

### Пример работы для одного клиента

In [40]:
selected_columns = ['date', 'socialization', 'survival', 'money', 'self_realization', 'code']
client_224_transactions = new_df[new_df['client'] == 224].loc[:, selected_columns]

In [41]:
client_224_transactions.shape

(259, 6)

In [42]:
text = client_224_transactions['code'].values
huffman_codes = huffman_code(text)
encoded_string = ''.join(huffman_codes[symbol] for symbol in text)

In [43]:
print("Результаты кодирования Хаффмана:")
print("-" * 30)
print(f"Закодированная строка:\n{encoded_string}")
print("Таблица кодов Хаффмана:")
print(pd.DataFrame.from_dict(huffman_codes, orient='index', columns=['Код Хаффмана']))
complexity = len(encoded_string) / len(text)
print(f"Сложность: {complexity:.3f}")

Результаты кодирования Хаффмана:
------------------------------
Закодированная строка:
100101011110010100001010110010010010000000001011000000000000000000110110000111010010111110001011010010011101011001000100110011000000011000100000000010001001001111101111101000110001110010000000001110110011000001011111011010100101000111011000010000110111001001110001100100100001010000101010000000000010000000010111001011000101001101100001011000100110001110011100011110011010011111011100100100010010000110100100111010000111001001001111010011100110111110011111110101111010001001100010011110011001111000100100110
Таблица кодов Хаффмана:
  Код Хаффмана
c           11
5         1011
d        10101
6      1010011
1      1010010
e      1010001
9      1010000
8          100
4            0
Сложность: 1.958


In [44]:
def create_dataset(time_series, lzw_depth, forecast_horizon):
    """
    Создает обучающий набор данных для прогнозирования временных рядов.

    Args:
        time_series: Временной ряд (numpy array).
        lzw_depth: Глубина Lempel-Ziv кодирования (int).
        forecast_horizon: Горизонт прогнозирования (int).

    Returns:
        Кортеж (X, y):  X - входные данные, y - целевые значения.
    """
    hankel_matrix = hankel(time_series) 
    X0 = hankel_matrix[:-lzw_depth - forecast_horizon + 1, :lzw_depth]
    X = []
    for i in range(X0.shape[0] - forecast_horizon - 1):
        X.append(X0[i:i + forecast_horizon + 1, :].T)
    X = np.array(X)
    y = hankel_matrix[:-lzw_depth - 2 * forecast_horizon, lzw_depth + forecast_horizon:lzw_depth + 2 * forecast_horizon]
    return X, y

In [45]:
text = client_224_transactions['survival'].values
forecast_horizon = 7 
lzw_depth = 56   
X, y = create_dataset(text, lzw_depth, forecast_horizon)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X.shape, y.shape

((189, 56, 8), (189, 7))

In [46]:
model = keras.Sequential([
    LSTM(X_train.shape[1], return_sequences=True),
    Dropout(0.3),
    LSTM(X_train.shape[1], return_sequences=False),
    Dense(X_train.shape[1], activation='tanh'),
    Dense(y_train.shape[1], activation='sigmoid')
])

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [48]:
model.fit(X_train, y_train, epochs=15, batch_size=1, verbose=0) 

<keras.src.callbacks.history.History at 0x217dc8094c0>

In [23]:
def calculate_f1_accuracy(y_pred, y_true):
    """Вычисляет F1-меру и точность.

    Args:
        y_pred: Предсказанные бинарные метки (массив NumPy).
        y_true: Истинные бинарные метки (массив NumPy).

    Returns:
        Кортеж: (F1-мера, точность). Возвращает (0, 0), если нет положительных предсказаний.
    """
    y_pred = y_pred.astype(int)  
    y_true = y_true.astype(int)  

    tp = np.sum((y_pred == 1) & (y_true == 1)) 
    tn = np.sum((y_pred == 0) & (y_true == 0))  
    fp = np.sum((y_pred == 1) & (y_true == 0))  
    fn = np.sum((y_pred == 0) & (y_true == 1))  

    if tp + fp == 0 or tp + fn == 0:
        precision = 0.0
        recall = 0.0
        f1 = 0.0
    else:
        precision = tp / (tp + fp)  
        recall = tp / (tp + fn)     
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0.0 

    accuracy = (tp + tn) / (tp + tn + fp + fn) * 100 if (tp + tn + fp + fn) != 0 else 0.0

    return f1, accuracy

In [24]:
predictions = model.predict(X_test) 

for i in range(y_test.shape[1]):
    f1, accuracy = calculate_f1_accuracy(predictions[:, i].round(), y_test[:, i])
    print(f'День {i+1}: F1={f1:.2f}; Точность: {accuracy:.1f}%')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 335ms/step
День 1: F1=0.90; Точность: 81.6%
День 2: F1=0.88; Точность: 78.9%
День 3: F1=0.84; Точность: 73.7%
День 4: F1=0.75; Точность: 60.5%
День 5: F1=0.88; Точность: 78.9%
День 6: F1=0.86; Точность: 76.3%
День 7: F1=0.83; Точность: 71.1%


### Пример работы для нескольких клиентов

In [50]:
num_clients = 3  # Количество клиентов (обработка одного занимает 4-5 минут)
prediction_start_index = 52  # Индекс, с которого начинаем прогнозирование
forecast_horizons = [1, 7, 14, 28]  # Горизонты прогнозирования (в днях)
client_ids = list(data['client'].value_counts().index)  

results_df = pd.DataFrame(columns=['client_id'] + [f'f1_{horizon}' for horizon in forecast_horizons])
results_df['client_id'] = client_ids  

In [53]:
for i in range(num_clients):
    client_transactions = new_df[new_df['client'] == client_ids[i]].loc[:, selected_columns]['survival'].values  

    for j, forecast_horizon in enumerate(forecast_horizons):
        X, y = create_dataset(client_transactions, lzw_depth, forecast_horizon)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = keras.Sequential([
            LSTM(X_train.shape[1], return_sequences=True),
            Dropout(0.3),
            LSTM(X_train.shape[1], return_sequences=False),
            Dense(X_train.shape[1], activation='tanh'),
            Dense(y_train.shape[1], activation='sigmoid')
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        model.fit(X_train, y_train, epochs=15, batch_size=1, verbose=0) 
        
        predictions = model.predict(X_test) 
        f1, accuracy = calculate_f1_accuracy(predictions[:, -1].round().astype(int), y_test[:, -1])
        results_df.iloc[i, 0]= client_ids[i]
        results_df.iloc[i, 1 + j]= f1
    del model

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 316ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 338ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 335ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 300ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 304ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 305ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 300ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 349ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 319ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 326ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 309ms/step


In [55]:
results_df.head(3)

Unnamed: 0,client_id,f1_1,f1_7,f1_14,f1_28
0,224763,0.960784,0.901099,0.921348,0.95
1,498117,0.571429,0.666667,0.457143,0.787879
2,2472046,0.977778,0.925,0.90411,0.955224
