In [None]:
!pip install pandas numpy scipy matplotlib seaborn scikit-learn keras tensorflow

# Исследование предсказуемости потребительского поведения по оценке Колмогоровской сложности

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.linalg import hankel

import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dropout, Dense

from sklearn.model_selection import train_test_split

In [2]:
from lempel_ziv import *
from huffman import *
from utils import *

In [4]:
data = pd.read_csv('data/transact_18_22.csv', parse_dates=[2])
data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
0,390989,3048567,2018-01-01,878.0,5814,food,survival
1,1309946,3226559,2018-01-01,249.0,5411,food,survival
2,1386278,2715990,2018-01-01,50.0,5499,food,survival


## Предварительная обработка данных

In [5]:
filtered_data = data[(data['date'] >= '2019-01-15') & (data['date'] <= '2019-12-14')]
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [None]:
filtered_data['value'].unique()

In [None]:
values_to_keep = ['socialization', 'survival', 'money', 'self_realization']
filtered_data = filtered_data[filtered_data['value'].isin(values_to_keep)]

In [None]:
filtered_data.head(3)

In [None]:
grouped = filtered_data.groupby(['client', 'date'])
new_data = []
for (client, date), group in grouped:
    row = {'client': client, 'date': date}
    for val in values_to_keep:
        row[val] = 1 if val in group['value'].values else 0  
    new_data.append(row)

new_df = pd.DataFrame(new_data)
new_df['code'] = new_df[['socialization', 'survival', 'money', 'self_realization']].apply(binary_to_hex, axis=1)

In [None]:
new_df.head(5)

In [3]:
#new_df.to_csv('output.csv')
new_df = pd.read_csv('output.csv', parse_dates=[2], index_col=0)
new_df.head(3)

Unnamed: 0,client,date,socialization,survival,money,self_realization,code
0,224,2019-01-15,1,0,0,0,8
1,224,2019-01-16,1,1,0,1,d
2,224,2019-01-17,1,1,0,0,c


## Алгоритм оценки Колмогоровской сложности 

### Пример работы для одного клиента

In [4]:
selected_columns = ['date', 'socialization', 'survival', 'money', 'self_realization', 'code']
client_224_transactions = new_df[new_df['client'] == 224].loc[:, selected_columns]

In [5]:
client_224_transactions.shape

(259, 6)

In [6]:
text = client_224_transactions['code'].values
huffman_codes = huffman_code(text)
encoded_string = ''.join(huffman_codes[symbol] for symbol in text)

In [7]:
print("Результаты кодирования Хаффмана:")
print("-" * 30)
print(f"Закодированная строка:\n{encoded_string}")
print("Таблица кодов Хаффмана:")
print(pd.DataFrame.from_dict(huffman_codes, orient='index', columns=['Код Хаффмана']))
complexity = len(encoded_string) / len(text)
print(f"Сложность: {complexity:.3f}")

Результаты кодирования Хаффмана:
------------------------------
Закодированная строка:
100101011110010100001010110010010010000000001011000000000000000000110110000111010010111110001011010010011101011001000100110011000000011000100000000010001001001111101111101000110001110010000000001110110011000001011111011010100101000111011000010000110111001001110001100100100001010000101010000000000010000000010111001011000101001101100001011000100110001110011100011110011010011111011100100100010010000110100100111010000111001001001111010011100110111110011111110101111010001001100010011110011001111000100100110
Таблица кодов Хаффмана:
  Код Хаффмана
c           11
5         1011
d        10101
6      1010011
1      1010010
e      1010001
9      1010000
8          100
4            0
Сложность: 1.958


In [9]:
text = client_224_transactions['survival'].values
forecast_horizon = 7 
lzw_depth = 56   
X, y = create_dataset(text, lzw_depth, forecast_horizon)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X.shape, y.shape

((189, 56, 8), (189, 7))

In [10]:
model = keras.Sequential([
    LSTM(X_train.shape[1], return_sequences=True),
    Dropout(0.3),
    LSTM(X_train.shape[1], return_sequences=False),
    Dense(X_train.shape[1], activation='tanh'),
    Dense(y_train.shape[1], activation='sigmoid')
])

In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
model.fit(X_train, y_train, epochs=15, batch_size=1, verbose=0) 

<keras.src.callbacks.history.History at 0x1bffb80de20>

In [13]:
predictions = model.predict(X_test) 

for i in range(y_test.shape[1]):
    f1, accuracy = calculate_f1_accuracy(predictions[:, i].round(), y_test[:, i])
    print(f'День {i+1}: F1={f1:.2f}; Точность: {accuracy:.1f}%')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 300ms/step
День 1: F1=0.91; Точность: 84.2%
День 2: F1=0.85; Точность: 73.7%
День 3: F1=0.85; Точность: 73.7%
День 4: F1=0.73; Точность: 57.9%
День 5: F1=0.88; Точность: 78.9%
День 6: F1=0.85; Точность: 73.7%
День 7: F1=0.85; Точность: 73.7%


### Пример работы для нескольких клиентов

In [None]:
prediction_start_index = 52  # Индекс, с которого начинаем прогнозирование

In [None]:
'''results_df = pd.DataFrame(
        columns=['client_id'] + [f'f1_{horizon}' for horizon in forecast_horizons] +
    [f'huffman_{col}' for col in predicted_columns]
)
results_df['client_id'] = client_ids  
'''
result_df = pd.read_csv('out_1.csv', index_col=0)
result_df.head(5)

Unnamed: 0,client_id,f1_1,f1_7,f1_14,f1_28,huffman_complexity
0,2190618,,,,,2.373494
1,208625,,,,,2.706061
2,3483687,,,,,1.449848
3,3441340,,,,,2.231003
4,1421807,,,,,2.454268


In [18]:
lzw_depth = 56

In [20]:
clients = list(new_df['client'].value_counts().index)

In [None]:
#num_clients = len(clients)
num_clients = 10  # Количество клиентов (обработка одного занимает 4-5 минут)
forecast_horizons = [1, 7, 14, 28]  # Горизонты прогнозирования (в днях)
start = 10

In [None]:
for i in range(start, start + num_clients):
    client_transactions = new_df[new_df['client'] == clients[i]].loc[:, selected_columns]
    text = client_transactions['socialization'].values
    # print(text)

    for forecast_horizon in forecast_horizons:
        X, y = create_dataset(text, lzw_depth, forecast_horizon)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = keras.Sequential([
            LSTM(X_train.shape[1], return_sequences=True),
            Dropout(0.3),
            LSTM(X_train.shape[1], return_sequences=False),
            Dense(X_train.shape[1], activation='tanh'),
            Dense(y_train.shape[1], activation='sigmoid')
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        model.fit(X_train, y_train, epochs=15, batch_size=1, verbose=0) 
        
        predictions = model.predict(X_test) 
        f1, accuracy = calculate_f1_accuracy(predictions[:, -1].round().astype(int), y_test[:, -1])
        result_df.loc[result_df['client_id'] == clients[i], f'f1_{forecast_horizon}'] = f1
        # print(f'f1_{forecast_horizon} for client_{clients[i]} = {f1}')
    del model

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 287ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 313ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 299ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 316ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 301ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 336ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 301ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 313ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 317ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [34]:
result_df.head(15)

Unnamed: 0,client_id,f1_1,f1_7,f1_14,f1_28,huffman_complexity
0,2190618,0.910891,0.822222,0.924731,0.886076,0.593373
1,208625,0.952381,0.96,0.93617,0.964706,0.676515
2,3483687,1.0,1.0,1.0,1.0,0.362462
3,3441340,0.444444,0.375,0.428571,0.352941,0.557751
4,1421807,0.461538,0.590909,0.627451,0.627451,0.613567
5,2117702,0.0,0.095238,0.4,0.428571,0.66997
6,576313,0.813187,0.790698,0.804878,0.830769,0.659021
7,3115758,0.0,0.628571,0.594595,0.6,0.684816
8,1680201,0.746988,0.775,0.769231,0.793103,0.644939
9,1385906,0.0,0.0,0.0,0.0,0.446759


In [35]:
result_df.to_csv('out_1.csv')