In [None]:
!pip install pandas numpy lempel-ziv-complexity dahuffman

# Исследование предсказуемости потребительского поведения по оценке Колмогоровской сложности

In [120]:
import pandas as pd
import numpy as np

In [121]:
data = pd.read_csv('transaction18_22/transact_18_22.csv', parse_dates=[2])
data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
0,390989,3048567,2018-01-01,878.0,5814,food,survival
1,1309946,3226559,2018-01-01,249.0,5411,food,survival
2,1386278,2715990,2018-01-01,50.0,5499,food,survival


## Предварительная обработка данных

In [122]:
filtered_data = data[(data['date'] >= '2019-01-15') & (data['date'] <= '2019-12-14')]
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [123]:
filtered_data['value'].unique()

array(['survival', 'socialization', 'money', 'self_realization', '7372',
       '4411', '2741', '5681', '6211', '8249', '5094', '5983', '7278',
       '7699', '5734', '7210', '7998', '7393', '5310', '3008', '5074',
       '7216', '7251', '5950', '5713', '7629', '5047', '5733', '7929',
       '7394', '3301', '5111', '3211', '5946', '8351', '5697', '5611',
       '5131', '7622', '5976', '5970', '7379', '5655', '6513', '4214',
       '5967', '5122', '5521', '5932', '5065', '5044', '5013', '7375',
       '5811', '5933', '5099', '8111', '5072', '8661', '5300', '5532',
       '7395', '8641', '5818', '5994', '8050', '7535', '5971', '7338',
       '5045', '7996', '5172', '5051', '5817', '7333', '4582', '3010',
       '5021', '5137', '7911', '7392', '3586', '7933', '5231', '5718',
       '780', '7297', '7032', '5085', '1711', '8211', '1799', '7534',
       '8699', '5046', '5940', '1731', '7829', '3553', '4119', '3042',
       '5960', '3005', '5169', '5972', '5571', '3533', '3831', '7273',
     

In [124]:
values_to_keep = ['socialization', 'survival', 'money', 'self_realization']
filtered_data = filtered_data[filtered_data['value'].isin(values_to_keep)]

In [125]:
filtered_data['group'].unique()

array(['food', 'nonfood', 'money', 'outfit', 'travel', 'health', 'fun',
       'dwelling', 'beauty', 'telecom', 'kids', 'misc', 'remote',
       'charity'], dtype=object)

In [126]:
filtered_data.head(3)

Unnamed: 0,client,card,date,amt,mcc,group,value
3278420,1378858,2391344,2019-01-15,350.87,5499,food,survival
3278421,3103680,3187433,2019-01-15,2660.0,5331,nonfood,socialization
3278422,2136426,3762147,2019-01-15,485.0,5331,nonfood,socialization


In [127]:
def binary_to_hex(row):
    binary_string = ''.join(map(str, row))
    decimal_value = int(binary_string, 2)
    return hex(decimal_value)[2:]

In [128]:
grouped = filtered_data.groupby(['client', 'date'])
new_data = []
for (client, date), group in grouped:
    row = {'client': client, 'date': date}
    for val in values_to_keep:
        row[val] = 1 if val in group['value'].values else 0  
    new_data.append(row)

new_df = pd.DataFrame(new_data)
new_df['code'] = new_df[['socialization', 'survival', 'money', 'self_realization']].apply(binary_to_hex, axis=1)

In [129]:
new_df.head(10)

Unnamed: 0,client,date,socialization,survival,money,self_realization,code
0,224,2019-01-15,1,0,0,0,8
1,224,2019-01-16,1,1,0,1,d
2,224,2019-01-17,1,1,0,0,c
3,224,2019-01-18,1,0,0,0,8
4,224,2019-01-19,1,0,0,1,9
5,224,2019-01-21,1,1,0,1,d
6,224,2019-01-22,1,0,0,0,8
7,224,2019-01-23,1,0,0,0,8
8,224,2019-01-24,1,0,0,0,8
9,224,2019-01-25,1,0,0,0,8


## Алгоритм оценки Колмогоровской сложности 

### Оценка сложности для one dimensional измерений

In [130]:
from lempel_ziv_complexity import lempel_ziv_complexity

In [139]:
def calculate_lz_complexity(series):
    return lempel_ziv_complexity(series.astype(str).str.cat()) / (len(series) * 4)

In [140]:
client_lz_complexity = new_df.groupby('client')[['socialization', 'survival', 'money', 'self_realization']].agg(calculate_lz_complexity)

client_lz_complexity.head(5)

Unnamed: 0_level_0,socialization,survival,money,self_realization
client,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
224,0.053089,0.044402,0.023166,0.034749
1108,0.03178,0.065678,0.065678,0.057203
1117,0.058511,0.058511,0.058511,0.039007
1197,0.040842,0.058168,0.038366,0.050743
1223,0.0625,0.060268,0.053571,0.060268


### Оценка сложности для multidimensional измерений

#### Алгоритм Хаффмана

In [141]:
from dahuffman import HuffmanCodec

In [142]:
def huffman_compression_ratio(text):
    codec = HuffmanCodec.from_data(text)
    compressed_text = codec.encode(text)
    return len(compressed_text) / (len(text) * 4)

In [143]:
huffman_compression = new_df.groupby('client')[['code']].agg(huffman_compression_ratio)

In [144]:
huffman_compression.head(5)

Unnamed: 0_level_0,code
client,Unnamed: 1_level_1
224,0.061776
1108,0.074153
1117,0.085106
1197,0.07302
1223,0.084821


#### Алгоритм Лемпеля-Зива-Велча (LZW)

In [145]:
client_lz_compression = new_df.groupby('client')[['code']].agg(calculate_lz_complexity)

client_lz_compression.head(5)

Unnamed: 0_level_0,code
client,Unnamed: 1_level_1
224,0.076255
1108,0.103814
1117,0.10461
1197,0.085396
1223,0.109375
