In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.notebook import trange

In [2]:
# Load the source table from a ';'-separated CSV file.
# NOTE(review): the path is hard-coded to a local Downloads folder — adjust it for your environment.
data = pd.read_csv('~/Downloads/out-4.csv', sep=';')

Сначала надо немного предобработать данные, чтобы колонка 'action' стала индексом, а также поправить таблицу так, чтобы сессия начиналась с page_viewed_1.

In [4]:
# Clean the loaded table in a single chain: missing cells become 0, the
# 'index' column is dropped, rows are keyed by 'action', and +inf cells become 0.
data = (
    data
    .fillna(0)
    .drop(columns=['index'])
    .set_index('action')
    .replace([np.inf], 0)
)
# Set the session_start -> page_viewed_1 entry to 1 so that sessions begin
# with page_viewed_1 (the intent stated in the notebook's notes).
data.loc['session_start', 'page_viewed_1'] = 1

Так как начальных данных нет, они генерируются случайно. Шансы переходов кумулятивно суммируются по строке текущего события, затем генерируется случайное число на \[0, 1). Далее по тому, в какой промежуток попало число, выбирается следующее событие. И так до тех пор, пока сессия не попадёт в 'session_end'.

In [7]:
def get_next_action(current_action, rnd, transitions=None):
    """Pick the action that follows ``current_action`` for a random draw.

    The row of ``current_action`` is read as a list of transition weights.
    The weights are cumulatively summed, and the first column whose running
    total reaches ``rnd`` is selected (inverse-CDF sampling).

    Parameters
    ----------
    current_action : label
        Row label of the transition table to sample from.
    rnd : float
        Random draw, normally produced by ``random.random()``.
    transitions : pandas.DataFrame, optional
        Transition table (rows: current action, columns: next action).
        Defaults to the notebook-level ``data`` frame.
        Weights are assumed to be non-negative — TODO confirm against the data.

    Returns
    -------
    label
        Column label of the chosen next action.

    Raises
    ------
    ValueError
        If the row has no positive weight, i.e. nothing can follow
        ``current_action``.
    """
    if transitions is None:
        transitions = data
    probabilities = transitions.loc[current_action, :]
    positive = (probabilities > 0).to_numpy()
    if not positive.any():
        raise ValueError(
            f"action {current_action!r} has no outgoing transitions"
        )
    # Only columns with a positive weight may be chosen; this also covers
    # rnd == 0 when the leading columns have zero weight.
    reached = (rnd <= probabilities.cumsum()).to_numpy() & positive
    if reached.any():
        position = reached.argmax()
    else:
        # rnd is above the row total (rounding error or an under-normalised
        # row). argmax() on an all-False mask would return 0 and silently pick
        # the first column, so fall back to the last reachable action instead.
        position = np.flatnonzero(positive)[-1]
    return transitions.columns[position]

def generate_logs(number_of_sessions=10, seed=None):
    """Simulate sessions as random walks and return them as an event log.

    Each session starts at 'session_start'. The next action is drawn
    repeatedly with ``get_next_action`` until 'session_end' is reached.

    Parameters
    ----------
    number_of_sessions : int, default 10
        How many sessions to simulate; sessions are numbered from 0.
    seed : int, optional
        When given, a private ``random.Random(seed)`` supplies the draws so
        the same logs can be regenerated. When ``None`` the module-level
        ``random`` generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per visited action, with columns 'session', 'action' and
        'previous_action'. The 'session_start' row of every session has NaN
        as its previous action.
    """
    rng = random if seed is None else random.Random(seed)
    # Rows are collected in a plain list and turned into a frame once at the
    # end: appending to a DataFrame inside the loop copies the whole frame on
    # every row, and DataFrame.append no longer exists in pandas 2.0.
    records = []
    # Maybe you need to add 'date' column with random timestamp and random timedelta
    for session in trange(number_of_sessions):
        current_action = 'session_start'
        records.append({'session': session,
                        'action': current_action,
                        'previous_action': np.nan})
        while current_action != 'session_end':
            next_action = get_next_action(current_action, rng.random())
            records.append({'session': session,
                            'action': next_action,
                            'previous_action': current_action})
            current_action = next_action
    # Passing the columns explicitly keeps the schema for zero sessions too.
    return pd.DataFrame(records, columns=['session', 'action', 'previous_action'])

In [9]:
# Simulate 50,000 sessions (5 * 10**4).
logs_data = generate_logs(number_of_sessions=5 * 10**4)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




По получившимся логам генерируется граф. Чем больше переходов между событиями, тем толще линия.

In [35]:
def generate_graph_data(logs_data):
    """Build a transition-count matrix from a session log.

    Parameters
    ----------
    logs_data : pandas.DataFrame
        Log with at least the columns 'session', 'action' and
        'previous_action'. Rows whose 'previous_action' is NaN are ignored by
        the grouping.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by 'previous_action' with one column per 'action'.
        Each cell holds the number of logged transitions for that pair, and
        pairs that never occur are 0. A 'session_end' row and a
        'session_start' column are always present.
    """
    transition_counts = (
        logs_data
        .groupby(['previous_action', 'action'])['session']
        .count()
        .reset_index()
    )
    # Hack: a zero-count session_end -> session_start pair guarantees that the
    # pivot contains a 'session_start' column and a 'session_end' row.
    # It is added only when the pair is absent, so pivot never sees a duplicate.
    has_sentinel = (
        (transition_counts['previous_action'] == 'session_end')
        & (transition_counts['action'] == 'session_start')
    ).any()
    if not has_sentinel:
        sentinel = pd.DataFrame([{'previous_action': 'session_end',
                                  'action': 'session_start',
                                  'session': 0}])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        transition_counts = pd.concat([transition_counts, sentinel],
                                      ignore_index=True)
    result = transition_counts.pivot(index='previous_action',
                                     columns='action',
                                     values='session')
    return result.fillna(0)

In [36]:
# Aggregate the simulated logs into a previous_action x action count matrix.
new_data = generate_graph_data(logs_data)

In [37]:
# Display the transition-count matrix.
new_data

action,Q1_q1a1,Q1_q1a2,Q1_q1a3,Q2_q2a1,Q2_q2a2,Q2_q2a3,Q3_q3a1,Q3_q3a2,Q3_q3a3,Q4_q4a1,...,page_viewed_5,page_viewed_6,page_viewed_7,page_viewed_8,page_viewed_9,session_end,session_start,system_favorite,system_optout_CREATIVE,system_share
previous_action,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q1_q1a1,0.0,11.0,0.0,18.0,24.0,2.0,1.0,3.0,1.0,0.0,...,48.0,0.0,0.0,9.0,0.0,13.0,0.0,0.0,0.0,0.0
Q1_q1a2,0.0,0.0,0.0,33.0,56.0,18.0,3.0,0.0,0.0,0.0,...,62.0,1.0,5.0,8.0,0.0,18.0,0.0,2.0,0.0,1.0
Q1_q1a3,1.0,6.0,0.0,13.0,24.0,4.0,2.0,1.0,0.0,1.0,...,39.0,0.0,2.0,3.0,0.0,21.0,0.0,0.0,0.0,1.0
Q2_q2a1,7.0,0.0,1.0,0.0,0.0,0.0,32.0,27.0,23.0,1.0,...,2850.0,2987.0,33.0,63.0,5.0,5.0,0.0,0.0,0.0,0.0
Q2_q2a2,2.0,4.0,1.0,21.0,0.0,0.0,38.0,32.0,19.0,2.0,...,4756.0,1.0,3966.0,101.0,4.0,19.0,0.0,0.0,0.0,0.0
Q2_q2a3,2.0,0.0,0.0,10.0,0.0,0.0,11.0,10.0,1.0,1.0,...,1455.0,2.0,1169.0,22.0,4.0,8.0,0.0,0.0,0.0,0.0
Q3_q3a1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,15.0,1.0,34.0,...,6.0,5.0,6.0,3282.0,2.0,12.0,0.0,3.0,0.0,0.0
Q3_q3a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,...,3.0,6.0,1.0,3448.0,3238.0,11.0,0.0,0.0,0.0,0.0
Q3_q3a3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.0,0.0,26.0,...,4.0,0.0,0.0,2122.0,0.0,8.0,0.0,0.0,0.0,0.0
Q4_q4a1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,3.0,2.0,18.0,0.0,0.0,0.0,0.0


In [39]:
# Save the matrix to 'out.csv' using ';' as the separator.
new_data.to_csv('out.csv', sep=';')

![Screenshot%20from%202021-12-13%2021-45-43.png](attachment:Screenshot%20from%202021-12-13%2021-45-43.png)

Получилась вот такая вермишель.