In [1]:
import datetime

import numpy as np
import pandas as pd

In [2]:
# Load the raw event log and prepare two columns in a single chain:
#   * date - underscores in the raw strings are replaced by spaces
#            (regex replacement, string values only);
#   * id   - 0-based running number of the row in file order.
# index_col=False tells pandas not to use the first CSV column as the index.
data = (
    pd.read_csv('log.csv', dayfirst=True, index_col=False)
    .assign(
        date=lambda frame: frame['date'].replace('_', ' ', regex=True),
        id=lambda frame: np.arange(len(frame)),
    )
)

In [3]:
# Preview the first five rows of the loaded log.
data.head(n=5)

Unnamed: 0,date,user,event_type,parameter,id
0,2020-04-01 00:05:46,180397,3,,0
1,2020-04-01 00:00:21,662939,4,music,1
2,2020-04-01 00:20:50,703220,2,,2
3,2020-04-01 00:22:31,405034,1,unknown,3
4,2020-04-01 00:30:21,662939,0,text,4


In [4]:
# Parse the textual `date` column into a real datetime column named
# `DateTime`, then drop the now-redundant string column.
data = (
    data
    .assign(DateTime=pd.to_datetime(data['date']))
    .drop(columns=['date'])
)
data.head()

Unnamed: 0,user,event_type,parameter,id,DateTime
0,180397,3,,0,2020-04-01 00:05:46
1,662939,4,music,1,2020-04-01 00:00:21
2,703220,2,,2,2020-04-01 00:20:50
3,405034,1,unknown,3,2020-04-01 00:22:31
4,662939,0,text,4,2020-04-01 00:30:21


In [6]:
# Order events per user chronologically and renumber the index 0..n-1,
# so that consecutive rows of one user are consecutive events in time.
data = data.sort_values(by=['user', 'DateTime']).reset_index(drop=True)
data.head()

Unnamed: 0,user,event_type,parameter,id,DateTime
0,100392,1,text,18912,2020-04-03 13:03:14
1,100392,0,unknown,39539,2020-04-07 03:22:44
2,100392,0,unknown,45393,2020-04-07 17:56:25
3,100392,2,unknown,61641,2020-04-09 20:01:41
4,100392,4,unknown,75764,2020-04-12 22:06:12


In [7]:
# Compute the difference between the time of a page visit and the time of the
# same user's previous page visit. The first event of every user has no
# predecessor and therefore gets NaT.
data['diff'] = data['DateTime'] - data.groupby('user')['DateTime'].shift(1)
data.head()

Unnamed: 0,user,event_type,parameter,id,DateTime,diff
0,100392,1,text,18912,2020-04-03 13:03:14,NaT
1,100392,0,unknown,39539,2020-04-07 03:22:44,3 days 14:19:30
2,100392,0,unknown,45393,2020-04-07 17:56:25,0 days 14:33:41
3,100392,2,unknown,61641,2020-04-09 20:01:41,2 days 02:05:16
4,100392,4,unknown,75764,2020-04-12 22:06:12,3 days 02:04:31


Датафрейм `sessions_start` будет содержать события, которые считаются первыми событиями сессий. К ним относятся все события, произошедшие более чем через 30 минут после предыдущего события того же пользователя, а также события, которые были первыми для пользователя (`NaT` в колонке `diff`).


In [8]:
# Select the events that open a session: either the user's very first event
# (`diff` is NaT) or an event that happened strictly more than 30 minutes
# after that user's previous one.
#
# The frame is built with `.assign`, which returns a new DataFrame, instead of
# adding a column to a filtered slice of `data`; this avoids pandas'
# SettingWithCopyWarning. The threshold is an explicit Timedelta rather than
# the string '1800 seconds', so the comparison does not rely on implicit
# string-to-timedelta conversion.
#
# Each opening event's own `id` becomes the identifier of its session.
sessions_start = (
    data.loc[data['diff'].isnull() | (data['diff'] > pd.Timedelta(minutes=30))]
    .assign(session_id=lambda starts: starts['id'])
)
sessions_start.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,user,event_type,parameter,id,DateTime,diff,session_id
0,100392,1,text,18912,2020-04-03 13:03:14,NaT,18912
1,100392,0,unknown,39539,2020-04-07 03:22:44,3 days 14:19:30,39539
2,100392,0,unknown,45393,2020-04-07 17:56:25,0 days 14:33:41,45393
3,100392,2,unknown,61641,2020-04-09 20:01:41,2 days 02:05:16,61641
4,100392,4,unknown,75764,2020-04-12 22:06:12,3 days 02:04:31,75764


In [9]:
# Attach to every event the `session_id` of the most recent session start of
# the same user (backward as-of join).
#
# Session starts were derived from events ordered by (user, DateTime), so the
# join key has to be `DateTime` as well. `id` is only the row number in the
# log file, and the file is not in chronological order (e.g. id 0 is stamped
# later than id 1), so joining on `id` could attach an event to the wrong
# session whenever one user's events appear out of order in the file.
#
# merge_asof requires both inputs to be sorted by the `on` key.
# NOTE(review): merge_asof rejects null keys - assumes `DateTime` has no NaT.
sessions_start = sessions_start.sort_values('id')  # keep the id ordering this cell has always left behind
data = pd.merge_asof(
    data.sort_values('DateTime'),
    sessions_start.sort_values('DateTime')[['DateTime', 'user', 'session_id']],
    on='DateTime',
    by='user',
)
# Restore file order with a fresh 0..n-1 index, as before.
data = data.sort_values('id', ignore_index=True)

In [10]:
# Preview the first five rows, now including the `session_id` column.
data.head(n=5)

Unnamed: 0,user,event_type,parameter,id,DateTime,diff,session_id
0,180397,3,,0,2020-04-01 00:05:46,NaT,0
1,662939,4,music,1,2020-04-01 00:00:21,NaT,1
2,703220,2,,2,2020-04-01 00:20:50,NaT,2
3,405034,1,unknown,3,2020-04-01 00:22:31,NaT,3
4,662939,0,text,4,2020-04-01 00:30:21,0 days 00:30:00,1


Создадим колонку `is_first_event_in_session`, которая отмечает события, бывшие первыми в своих сессиях.


In [11]:
# An event opens its session exactly when its own id is the session id
# (session ids are the ids of the opening events).
data['is_first_event_in_session'] = data['id'].eq(data['session_id'])
data.head()

Unnamed: 0,user,event_type,parameter,id,DateTime,diff,session_id,is_first_event_in_session
0,180397,3,,0,2020-04-01 00:05:46,NaT,0,True
1,662939,4,music,1,2020-04-01 00:00:21,NaT,1,True
2,703220,2,,2,2020-04-01 00:20:50,NaT,2,True
3,405034,1,unknown,3,2020-04-01 00:22:31,NaT,3,True
4,662939,0,text,4,2020-04-01 00:30:21,0 days 00:30:00,1,False


In [12]:
# Day for which sessions are counted, as a half-open window
# [session_test_min, session_test_max) written as 'YYYY-MM-DD' strings.
# `datetime` is imported in the notebook's import cell at the top.
session_test_min = '2020-04-14'
session_test_max = '2020-04-15'

# Parse both bounds with the same format into datetime.datetime objects
# (midnight of the given day).
session_date_min, session_date_max = (
    datetime.datetime.strptime(day, '%Y-%m-%d')
    for day in (session_test_min, session_test_max)
)

In [13]:
# Keep only session-opening events whose timestamp falls inside the
# half-open window [session_date_min, session_date_max).
sessins_amount = data.loc[
    data['is_first_event_in_session']
    & data['DateTime'].ge(session_date_min)
    & data['DateTime'].lt(session_date_max)
]
sessins_amount.head()

Unnamed: 0,user,event_type,parameter,id,DateTime,diff,session_id,is_first_event_in_session
84290,783636,2,video,84290,2020-04-14 00:04:00,0 days 06:25:55,84290,True
84307,728097,3,,84307,2020-04-14 00:07:46,0 days 06:05:17,84307,True
84312,612033,0,video,84312,2020-04-14 00:16:00,1 days 01:38:20,84312,True
84313,265944,4,music,84313,2020-04-14 00:19:16,0 days 06:42:01,84313,True
84314,980582,4,,84314,2020-04-14 00:25:09,0 days 06:11:16,84314,True


In [14]:
# Number of sessions that started inside the window (one row per session).
len(sessins_amount)

4066