In [2]:
import pandas as pd
import numpy as np

In [31]:
df = pd.read_csv("/Users/tima/Downloads/dataset_telemetry.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [57]:
snapshot_date = df['timestamp'].max() + pd.Timedelta(days=1)

recency = (
    df.groupby('userid')['timestamp']
      .max()
      .apply(lambda x: (snapshot_date - x).days)
      .rename('Recency')
)


In [59]:
significant_actions = ['category', 'search', 'product', 'cart', 'checkout', 'confirmation']

frequency = (
    df[df['action'].isin(significant_actions)]
    .groupby('userid')
    .size()
    .rename('Frequency')
)


In [61]:
# Частоты действий
action_freq = df['action'].value_counts(normalize=True)

# Глубина воронки
funnel_order = {
    'mainpage': 0,
    'category': 1,
    'search': 2,
    'product': 3,
    'cart': 4,
    'checkout': 5,
    'confirmation': 6
}

# Автоматические веса
weights = {}

for action in funnel_order:
    if action in action_freq:
        weights[action] = funnel_order[action] * (1 / action_freq[action])

weights = pd.Series(weights)

# Нормализация → минимальный вес = 1 (кроме mainpage)
weights = weights / weights[weights > 0].min()
weights = weights.round().astype(int)

weights['mainpage'] = 0  # жёстко

print(weights)


mainpage         0
category         1
search           2
product          2
cart             9
checkout        42
confirmation    99
dtype: int64


In [63]:
df['m_score'] = df['action'].map(weights).fillna(0)

monetary = (
    df.groupby('userid')['m_score']
      .sum()
      .rename('Monetary')
)


In [65]:
rfm = pd.concat([recency, frequency, monetary], axis=1).fillna(0)
rfm.head()


Unnamed: 0_level_0,Recency,Frequency,Monetary
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
user_10000,1,2572,13258
user_10001,2,2679,13899
user_10002,2,2484,11978
user_10003,3,2615,13394
user_10004,1,2678,13266


In [67]:
rfm['R_rank'] = pd.cut(
    rfm['Recency'],
    bins=[-1, 1, 3, 7, 14, 10**9],
    labels=[5, 4, 3, 2, 1]
).astype(int)


In [69]:
rfm['F_rank'] = pd.cut(
    rfm['Frequency'],
    bins=[-1, 1, 3, 5, 10, 10**9],
    labels=[1, 2, 3, 4, 5]
).astype(int)


In [71]:
rfm['M_rank'] = pd.cut(
    rfm['Monetary'],
    bins=[-1, 5, 15, 40, 100, 10**9],
    labels=[1, 2, 3, 4, 5]
).astype(int)


In [76]:
rfm['RFM_score'] = rfm[['R_rank', 'F_rank', 'M_rank']].sum(axis=1)

rfm['RFM_segment'] = (
    rfm['R_rank'].astype(str) +
    rfm['F_rank'].astype(str) +
    rfm['M_rank'].astype(str)
)


In [78]:
rfm[['Recency','Frequency','Monetary',
     'R_rank','F_rank','M_rank','RFM_score']].describe()

rfm[['R_rank','F_rank','M_rank']].value_counts().head(10)


R_rank  F_rank  M_rank
4       5       5         1203
5       5       5         1056
3       5       5          238
2       5       5            3
Name: count, dtype: int64

In [80]:
rfm.head(10)


Unnamed: 0_level_0,Recency,Frequency,Monetary,R_rank,F_rank,M_rank,RFM_score,RFM_segment
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
user_10000,1,2572,13258,5,5,5,15,555
user_10001,2,2679,13899,4,5,5,14,455
user_10002,2,2484,11978,4,5,5,14,455
user_10003,3,2615,13394,4,5,5,14,455
user_10004,1,2678,13266,5,5,5,15,555
user_10005,2,2691,13347,4,5,5,14,455
user_10006,1,2808,14219,5,5,5,15,555
user_10007,3,2744,13779,4,5,5,14,455
user_10008,3,2784,13705,4,5,5,14,455
user_10009,3,2720,12344,4,5,5,14,455
