In [None]:
!pip install pandas numpy scipy matplotlib seaborn scikit-learn keras tensorflow

In [None]:
!pip install tensorflow==2.18
!pip install tensorflow[and-cuda]==2.18.0

# Исследование предсказуемости потребительского поведения по оценке Колмогоровской сложности

In [1]:
import pandas as pd
import numpy as np
from scipy.linalg import hankel

import tensorflow as tf 
from tensorflow import keras
from keras.layers import LSTM, Dropout, Dense

2025-01-14 17:24:07.631637: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-14 17:24:07.640068: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736864647.650496    4377 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736864647.653519    4377 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-14 17:24:07.664449: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print(len(gpu_devices))

1


In [3]:
from lempel_ziv import *
from huffman import *
from utils import *

In [18]:
# Load the raw transaction log; the column at position 2 is parsed as datetime.
transactions_path = 'data/transact_18_22.csv'
data = pd.read_csv(transactions_path, parse_dates=[2])
data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
0,390989,3048567,2018-01-01,878.0,5814,food,survival
1,1309946,3226559,2018-01-01,249.0,5411,food,survival
2,1386278,2715990,2018-01-01,50.0,5499,food,survival


## Предварительная обработка данных

Фильтрация по дате: оставляем только период с 15.01.2019 по 14.12.2019 (включительно), чтобы не учитывать новогодние праздники и период COVID-19.

In [19]:
# Keep only rows dated from 2019-01-15 to 2019-12-14, both bounds inclusive.
in_period = data['date'].between('2019-01-15', '2019-12-14')
filtered_data = data[in_period]
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [20]:
# List the distinct labels in the `value` column, in order of first appearance.
value_column = filtered_data['value']
value_column.unique()

array(['survival', 'socialization', 'money', 'self_realization', '7372',
       '4411', '2741', '5681', '6211', '8249', '5094', '5983', '7278',
       '7699', '5734', '7210', '7998', '7393', '5310', '3008', '5074',
       '7216', '7251', '5950', '5713', '7629', '5047', '5733', '7929',
       '7394', '3301', '5111', '3211', '5946', '8351', '5697', '5611',
       '5131', '7622', '5976', '5970', '7379', '5655', '6513', '4214',
       '5967', '5122', '5521', '5932', '5065', '5044', '5013', '7375',
       '5811', '5933', '5099', '8111', '5072', '8661', '5300', '5532',
       '7395', '8641', '5818', '5994', '8050', '7535', '5971', '7338',
       '5045', '7996', '5172', '5051', '5817', '7333', '4582', '3010',
       '5021', '5137', '7911', '7392', '3586', '7933', '5231', '5718',
       '780', '7297', '7032', '5085', '1711', '8211', '1799', '7534',
       '8699', '5046', '5940', '1731', '7829', '3553', '4119', '3042',
       '5960', '3005', '5169', '5972', '5571', '3533', '3831', '7273',
     

In [21]:
# Rows whose `value` is not one of these four named categories are dropped.
values_to_keep = ['socialization', 'survival', 'money', 'self_realization']
keep_mask = filtered_data['value'].isin(values_to_keep)
filtered_data = filtered_data[keep_mask]

Бинаризация: для каждой пары (клиент, день) флагом 0/1 отмечается наличие транзакций каждой из четырёх категорий, после чего четыре флага кодируются одной шестнадцатеричной цифрой.

In [22]:
# Build one row per (client, date) with a 0/1 flag for every category in
# `values_to_keep`: 1 if the client has at least one transaction of that
# category on that day, otherwise 0.
# This is a vectorized equivalent of iterating over every (client, date) group
# in Python, which is very slow for millions of groups.
category_counts = (
    filtered_data
    .groupby(['client', 'date', 'value'], observed=True)
    .size()
    .unstack('value', fill_value=0)
)
new_df = (
    category_counts
    .reindex(columns=values_to_keep, fill_value=0)  # fixed column order; absent categories become 0
    .gt(0)
    .astype('int64')
    .rename_axis(columns=None)
    .reset_index()  # rows stay sorted by (client, date), as with the grouped loop
)

# Collapse the four flags of each row into a single symbol with `binary_to_hex` (from `utils`).
new_df['code'] = new_df[['socialization', 'survival', 'money', 'self_realization']].apply(binary_to_hex, axis=1)
new_df

Unnamed: 0,client,date,socialization,survival,money,self_realization,code
0,224,2019-01-15,1,0,0,0,8
1,224,2019-01-16,1,1,0,1,d
2,224,2019-01-17,1,1,0,0,c
3,224,2019-01-18,1,0,0,0,8
4,224,2019-01-19,1,0,0,1,9
...,...,...,...,...,...,...,...
1588462,3564569,2019-12-10,0,1,0,1,5
1588463,3564569,2019-12-11,1,1,0,0,c
1588464,3564569,2019-12-12,0,1,0,0,4
1588465,3564569,2019-12-13,0,1,0,1,5


Для экспериментов берётся только 1000 клиентов с наибольшим числом дней, в которые были транзакции (то есть с наибольшим числом строк «клиент–день» в `new_df`).

In [23]:
# Rank clients by their number of rows in `new_df` (one row per client-day)
# and keep only the 1000 highest-ranked ones.
sorted_clients = (
    new_df
    .groupby('client')
    .size()
    .reset_index(name='transaction_count')
    .sort_values('transaction_count', ascending=False)
)
clients_counts = sorted_clients.head(1000)
new_df = new_df[new_df['client'].isin(clients_counts['client'])]
new_df

Unnamed: 0,client,date,socialization,survival,money,self_realization,code
0,224,2019-01-15,1,0,0,0,8
1,224,2019-01-16,1,1,0,1,d
2,224,2019-01-17,1,1,0,0,c
3,224,2019-01-18,1,0,0,0,8
4,224,2019-01-19,1,0,0,1,9
...,...,...,...,...,...,...,...
1588462,3564569,2019-12-10,0,1,0,1,5
1588463,3564569,2019-12-11,1,1,0,0,c
1588464,3564569,2019-12-12,0,1,0,0,4
1588465,3564569,2019-12-13,0,1,0,1,5


In [None]:
# Optional checkpoint (disabled): write the prepared frame to 'output.csv' and
# read it back, instead of recomputing the preprocessing cells above.
#new_df.to_csv('output.csv')
#new_df = pd.read_csv('output.csv', parse_dates=[2], index_col=0)
#new_df.head(3)

In [28]:
# Columns taken from `new_df` for each client inside the training loop.
selected_columns = [
    'date',
    'socialization',
    'survival',
    'money',
    'self_realization',
    'code',
]

## Параметры для обучения

In [29]:
# Experiment settings.
start = 0                # position in `clients` at which the training loop begins
forecast_horizons = [7]  # one model per horizon; the horizon is the model's output width
split = 52               # NOTE(review): not referenced in the cells visible here
lzw_depth = 56           # NOTE(review): not referenced in the cells visible here

In [37]:
# Client ids ordered by how many rows each has in `new_df` (most rows first).
clients = new_df['client'].value_counts().index.tolist()

In [54]:
# Empty results table: one row per client; the metric columns start out
# missing and are filled in by the training loop.
hp_columns = [f'hp_{horizon}' for horizon in forecast_horizons]
result_columns = ['client_id'] + hp_columns + ['huffman_complexity', 'lz_complexity']
results_df = pd.DataFrame(columns=result_columns)
results_df['client_id'] = clients
results_df.head(5)

Unnamed: 0,client_id,hp_7,huffman_complexity,lz_complexity
0,2190618,,,
1,208625,,,
2,3441340,,,
3,3483687,,,
4,1421807,,,


In [55]:
# Client ids ordered by row count in `new_df` (most rows first), and how many there are.
clients = new_df['client'].value_counts().index.tolist()
num_clients = len(clients)

In [56]:
# Number of leading entries of each client's series used for training;
# the remainder of the series is used for testing.
train_days = 180

## Создание моделей

In [None]:
def _build_model(horizon):
    """Return a compiled stacked-LSTM classifier with `horizon` sigmoid outputs."""
    model = keras.Sequential([
        LSTM(28, return_sequences=True),
        Dropout(0.3),
        LSTM(28, return_sequences=False),
        Dense(28, activation='tanh'),
        Dense(horizon, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# One model per forecast horizon, in the same order as `forecast_horizons`.
models = [_build_model(horizon) for horizon in forecast_horizons]

## Обучение и сохранение результатов

In [None]:
# For every client: compute the Huffman and Lempel-Ziv complexity of their
# sequences, then fit and evaluate the per-horizon models on the binary
# `self_realization` series. Results are checkpointed to 'self_realization.csv'.
with tf.device('/GPU:0'):
    # Clamp the upper bound so that a non-zero `start` cannot index past the end of `clients`.
    stop = min(start + num_clients, len(clients))
    for i in range(start, stop):
        client_id = clients[i]
        client_transactions = new_df.loc[new_df['client'] == client_id, selected_columns]
        client_row = results_df['client_id'] == client_id

        # Mean Huffman code length per symbol of the daily codes, divided by 4.0
        # (presumably the 4 bits of one hex symbol -- confirm against `binary_to_hex`).
        text = client_transactions['code'].values
        huffman_codes = huffman_code(text)
        encoded_string = ''.join(huffman_codes[symbol] for symbol in text)
        complexity = len(encoded_string) / len(text) / 4.0

        text = client_transactions['self_realization'].values
        lzc = lempel_ziv_complexity(text)

        # The complexity metrics do not depend on the horizon: store them once per client.
        results_df.loc[client_row, 'huffman_complexity'] = complexity
        results_df.loc[client_row, 'lz_complexity'] = lzc

        train_data = text[:train_days]
        test_data = text[train_days:]

        for j, forecast_horizon in enumerate(forecast_horizons):
            X_train, y_train = create_dataset(train_data, lzc, forecast_horizon)
            X_test, y_test = create_dataset(test_data, lzc, forecast_horizon)

            # NOTE(review): models[j] is shared by all clients, so each fit continues from
            # the weights left by the previous clients -- confirm this is intended.
            models[j].fit(X_train, y_train, epochs=15, batch_size=1, verbose=0)

            f1_scores = []
            # NOTE(review): the `- 1` skips the last test window; kept as in the original -- confirm.
            for k in range(y_test.shape[0] - 1):
                predictions = models[j].predict(X_test[k:k+1], verbose=0)
                f1, _ = calculate_f1_accuracy(predictions.flatten().round(0).astype(int), y_test[k].flatten())
                f1_scores.append(f1)

            # Share of evaluated test windows whose F1 exceeds 0.75 (0.0 when there are none).
            if len(f1_scores) > 0:
                hp_score = np.mean(np.array(f1_scores) > 0.75)
            else:
                hp_score = 0.0

            results_df.loc[client_row, f'hp_{forecast_horizon}'] = hp_score

        # Periodic checkpoint so a crash does not lose all progress.
        if i % 5 == 0:
            results_df.to_csv('self_realization.csv')

    # Final save: without it, clients processed after the last `i % 5 == 0`
    # checkpoint would never reach the CSV.
    results_df.to_csv('self_realization.csv')

Аналогично для остальных категорий