In [1]:
import pandas as pd

In [2]:
# Load the tab-separated feed log. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
views = pd.read_csv(
    "../../datasets/feed-views.log",
    sep="\t",
    names=["datetime", "user"],
)

In [3]:
# Convert the raw timestamp strings into pandas datetimes.
views["datetime"] = pd.to_datetime(views["datetime"])

## Create views

In [4]:
# Parse the timestamp with an explicit format, then split it into one
# integer column per calendar/clock component.
views['datetime'] = pd.to_datetime(
    views['datetime'],
    format='%Y-%m-%d %H:%M:%S.%f',
)
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    # Each name is both the new column and the matching `.dt` accessor.
    views[part] = getattr(views['datetime'].dt, part)
views.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52


In [5]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and the memory footprint.
views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1076 entries, 0 to 1075
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  1076 non-null   datetime64[ns]
 1   user      1076 non-null   object        
 2   year      1076 non-null   int32         
 3   month     1076 non-null   int32         
 4   day       1076 non-null   int32         
 5   hour      1076 non-null   int32         
 6   minute    1076 non-null   int32         
 7   second    1076 non-null   int32         
dtypes: datetime64[ns](1), int32(6), object(1)
memory usage: 42.2+ KB


## Create daytime

In [6]:
# Hour boundaries of the day parts. With right=False every interval is
# closed on the left and open on the right: [0, 4), [4, 7), ..., [20, 24),
# so each hour 0..23 falls into exactly one bucket.
bins = [0, 4, 7, 11, 17, 20, 24]
labels = [
    'night',
    'early morning',
    'morning',
    'afternoon',
    'early evening',
    'evening',
]
views['daytime'] = pd.cut(
    views['hour'],
    bins=bins,
    labels=labels,
    right=False,
    ordered=True,
)
views.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second,daytime
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8,afternoon
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23,afternoon
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30,afternoon
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44,afternoon
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52,afternoon


In [7]:
# Move `user` from a regular column into the index, so that label-based
# lookups on this frame (e.g. idxmax / idxmin) report the user name.
# The guard keeps the cell re-runnable: once `user` is the index it is no
# longer a column, and an unguarded set_index('user') would raise KeyError.
if 'user' in views.columns:
    views = views.set_index('user')
views.head()

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon


## Calculate count

In [8]:
# Number of non-null values in each column.
views.count()

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

In [9]:
# Number of views per daytime bucket, most frequent first.
views['daytime'].value_counts()

daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64

In [10]:
# Order the rows by clock time only (hour, then minute, then second);
# the calendar date does not take part in the ordering.
views = views.sort_values(
    by=['hour', 'minute', 'second'],
)
print(views)

                            datetime  year  month  day  hour  minute  second  \
user                                                                           
valentina 2020-05-15 00:00:13.222265  2020      5   15     0       0      13   
valentina 2020-05-15 00:01:05.153738  2020      5   15     0       1       5   
pavel     2020-05-12 00:01:27.764025  2020      5   12     0       1      27   
pavel     2020-05-12 00:01:38.444917  2020      5   12     0       1      38   
pavel     2020-05-12 00:01:55.395042  2020      5   12     0       1      55   
...                              ...   ...    ...  ...   ...     ...     ...   
artem     2020-05-21 23:49:22.386789  2020      5   21    23      49      22   
anatoliy  2020-05-09 23:53:55.599821  2020      5    9    23      53      55   
pavel     2020-05-09 23:54:54.260791  2020      5    9    23      54      54   
valentina 2020-05-14 23:58:56.754866  2020      5   14    23      58      56   
alexander 2020-05-14 23:59:38.758438  20

## Night max 

# Largest hour that still belongs to the 'night' bucket, followed by the
# timestamp of the first row recorded in that hour.
print('Максимальный час ночью')
night_mask = views['daytime'] == 'night'
night_max = views.loc[night_mask, 'hour'].max()
print(night_max)
print('Пример пользователя в этот час')
night_sample = views[night_mask & (views['hour'] == night_max)].iloc[0]
print(night_sample[['datetime']])

## Morning

In [11]:
# Smallest hour that belongs to the 'morning' bucket, followed by the
# timestamp of the first row recorded in that hour.
print('минимальный час утром')
morning_mask = views['daytime'] == 'morning'
morning_min = views.loc[morning_mask, 'hour'].min()
print(morning_min)
print('Пример пользователя в этот час')
morning_sample = views[morning_mask & (views['hour'] == morning_min)].iloc[0]
print(morning_sample[['datetime']])

минимальный час утром
8
Пример пользователя в этот час
datetime    2020-05-15 08:16:03.918402
Name: alexander, dtype: object


## Mode

In [12]:
# mode() returns a Series of the most frequent value(s); take the first one.
hour_mode = views['hour'].mode().iloc[0]
daytime_mode = views['daytime'].mode().iloc[0]
print(f'Самое частое время дня: {daytime_mode}')
print(f'Самый частый час: {hour_mode}')

Самое частое время дня: evening
Самый частый час: 22


## Earliest hours

In [13]:
# Three rows with the smallest `hour` value (ties keep their current row order).
earliest_rows = views[['hour', 'datetime']].nsmallest(n=3, columns='hour')
print(earliest_rows)

           hour                   datetime
user                                      
valentina     0 2020-05-15 00:00:13.222265
valentina     0 2020-05-15 00:01:05.153738
pavel         0 2020-05-12 00:01:27.764025


## Latest hours

In [14]:
# Three rows with the largest `hour` value (ties keep their current row order).
latest_rows = views[['hour', 'datetime']].nlargest(n=3, columns='hour')
print(latest_rows)

           hour                   datetime
user                                      
ekaterina    23 2020-05-14 23:02:11.327532
ekaterina    23 2020-05-14 23:02:14.494985
ekaterina    23 2020-05-14 23:02:15.588808


### Statistics from describe()

In [15]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# datetime and numeric columns. `stat` is kept because the next cell reads
# the hour quartiles from it.
stat = views.describe()
print(stat)

                            datetime    year        month          day  \
count                           1076  1076.0  1076.000000  1076.000000   
mean   2020-05-10 09:00:41.211420672  2020.0     4.870818    13.552974   
min       2020-04-17 12:01:08.463179  2020.0     4.000000     1.000000   
25%       2020-05-10 01:13:49.857472  2020.0     5.000000    11.000000   
50%    2020-05-11 22:48:35.302552832  2020.0     5.000000    13.000000   
75%    2020-05-14 14:44:34.749530624  2020.0     5.000000    15.000000   
max       2020-05-22 10:36:14.662600  2020.0     5.000000    30.000000   
std                              NaN     0.0     0.335557     4.906567   

              hour       minute       second  
count  1076.000000  1076.000000  1076.000000  
mean     16.249071    29.629182    29.500929  
min       0.000000     0.000000     0.000000  
25%      13.000000    14.000000    14.000000  
50%      19.000000    29.000000    30.000000  
75%      22.000000    46.000000    45.000000  
max 

In [16]:
# Interquartile range of the viewing hour, taken from the describe() table.
hour_stats = stat['hour']
q1, q3 = hour_stats['25%'], hour_stats['75%']
iqr = q3 - q1
print(f'IQR: {int(iqr)} часов')
print(f'Самый популярный интервал c {int(q1)} до {int(q3)} часов')

IQR: 9 часов
Самый популярный интервал c 13 до 22 часов


In [17]:
# Index label (the user) of the first night-time row that has the largest hour.
views.loc[views['daytime'] == 'night', 'hour'].idxmax()

'konstantin'

In [18]:
# Index label (the user) of the first morning row that has the smallest hour.
views.loc[views['daytime'] == 'morning', 'hour'].idxmin()

'alexander'

In [19]:
# Most frequent viewing hour(s), returned as a Series.
views['hour'].mode()

0    22
Name: hour, dtype: int32

In [20]:
# Most frequent daytime bucket(s), returned as a categorical Series.
views['daytime'].mode()

0    evening
Name: daytime, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening']