In [1]:
import pandas as pd

## Ex 01

Создадим DataFrame с именем `views` с двумя столбцами: `datetime` и `user`, прочитав файл `feed-views.log`.

In [2]:
df = pd.read_csv('../data/feed-views.log', names=['datetime', 'user'], sep='\t', engine='python')
views = pd.DataFrame(df)
views

Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem
...,...,...
1071,2020-05-21 18:45:20.441142,valentina
1072,2020-05-21 23:03:06.457819,maxim
1073,2020-05-21 23:23:49.995349,pavel
1074,2020-05-21 23:49:22.386789,artem


Преобразуем datetime к datetime64[ns] Dtype

In [3]:
views['datetime'] = pd.to_datetime(views['datetime'], errors='coerce')
views.dtypes

datetime    datetime64[ns]
user                object
dtype: object

Извлечем год, месяц, день, час, минуту и секунду из значений столбца datetime в новые столбцы

In [4]:
views['year'] = views['datetime'].dt.year
views['month'] = views['datetime'].dt.month
views['day'] = views['datetime'].dt.day
views['hour'] = views['datetime'].dt.hour
views['minute'] = views['datetime'].dt.minute
views['second'] = views['datetime'].dt.second

views[['datetime', 'year', 'month', 'day', 'hour', 'minute', 'second']]
views

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020.0,4.0,17.0,12.0,1.0,8.0
1,2020-04-17 12:01:23.743946,artem,2020.0,4.0,17.0,12.0,1.0,23.0
2,2020-04-17 12:27:30.646665,artem,2020.0,4.0,17.0,12.0,27.0,30.0
3,2020-04-17 12:35:44.884757,artem,2020.0,4.0,17.0,12.0,35.0,44.0
4,2020-04-17 12:35:52.735016,artem,2020.0,4.0,17.0,12.0,35.0,52.0
...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020.0,5.0,21.0,18.0,45.0,20.0
1072,2020-05-21 23:03:06.457819,maxim,2020.0,5.0,21.0,23.0,3.0,6.0
1073,2020-05-21 23:23:49.995349,pavel,2020.0,5.0,21.0,23.0,23.0,49.0
1074,2020-05-21 23:49:22.386789,artem,2020.0,5.0,21.0,23.0,49.0,22.0


Создадим новый столбец daytime с определенным значением времени дня

In [5]:
views['daytime'] = pd.cut(x=views['datetime'].dt.hour, 
                          bins=[0, 4, 7, 11, 17, 20, 24], 
                          labels=['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening'], right=False)
views

Unnamed: 0,datetime,user,year,month,day,hour,minute,second,daytime
0,2020-04-17 12:01:08.463179,artem,2020.0,4.0,17.0,12.0,1.0,8.0,afternoon
1,2020-04-17 12:01:23.743946,artem,2020.0,4.0,17.0,12.0,1.0,23.0,afternoon
2,2020-04-17 12:27:30.646665,artem,2020.0,4.0,17.0,12.0,27.0,30.0,afternoon
3,2020-04-17 12:35:44.884757,artem,2020.0,4.0,17.0,12.0,35.0,44.0,afternoon
4,2020-04-17 12:35:52.735016,artem,2020.0,4.0,17.0,12.0,35.0,52.0,afternoon
...,...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020.0,5.0,21.0,18.0,45.0,20.0,early evening
1072,2020-05-21 23:03:06.457819,maxim,2020.0,5.0,21.0,23.0,3.0,6.0,evening
1073,2020-05-21 23:23:49.995349,pavel,2020.0,5.0,21.0,23.0,23.0,49.0,evening
1074,2020-05-21 23:49:22.386789,artem,2020.0,5.0,21.0,23.0,49.0,22.0,evening


Назначим столбец user в качестве индекса

In [6]:
views.index = views['user']

Рассчитаем количество элементов в DataFrame 

In [7]:
views.count()

datetime    1075
user        1075
year        1075
month       1075
day         1075
hour        1075
minute      1075
second      1075
daytime     1075
dtype: int64

Рассчитаем количество элементов в каждой категории времени дня

In [8]:
views['daytime'].value_counts()

daytime
evening          509
afternoon        252
early evening    145
night            129
morning           35
early morning      5
Name: count, dtype: int64

Отсортируем значения в DataFrame по часу, минуте и секунде одновременно в восходящем порядке

In [9]:
views_sorted = views.sort_values(by=['hour', 'minute', 'second'])
views_sorted

Unnamed: 0_level_0,datetime,user,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
valentina,2020-05-15 00:00:13.222265,valentina,2020.0,5.0,15.0,0.0,0.0,13.0,night
valentina,2020-05-15 00:01:05.153738,valentina,2020.0,5.0,15.0,0.0,1.0,5.0,night
pavel,2020-05-12 00:01:27.764025,pavel,2020.0,5.0,12.0,0.0,1.0,27.0,night
pavel,2020-05-12 00:01:38.444917,pavel,2020.0,5.0,12.0,0.0,1.0,38.0,night
pavel,2020-05-12 00:01:55.395042,pavel,2020.0,5.0,12.0,0.0,1.0,55.0,night
...,...,...,...,...,...,...,...,...,...
anatoliy,2020-05-09 23:53:55.599821,anatoliy,2020.0,5.0,9.0,23.0,53.0,55.0,evening
pavel,2020-05-09 23:54:54.260791,pavel,2020.0,5.0,9.0,23.0,54.0,54.0,evening
valentina,2020-05-14 23:58:56.754866,valentina,2020.0,5.0,14.0,23.0,58.0,56.0,evening
alexander,2020-05-14 23:59:38.758438,alexander,2020.0,5.0,14.0,23.0,59.0,38.0,evening


Рассчитаем минимум и максимум для часов и моду для категорий времени дня

In [10]:
views['daytime'].mode()[0]

'evening'

In [11]:
f"Минимальный час: {min(views['hour'])}, максимальный: {max(views['hour'])}"

'Минимальный час: 0.0, максимальный: 23.0'

Рассчитаем максимальный час для строк, где время дня - "night"

In [12]:
views['time'] = views['datetime'].dt.time
views['tmp_col_night'] = views.apply(lambda x: x['time'] if x['daytime'] == 'night' else None, axis=1)
night_records = views.dropna(subset=['tmp_col_night'])
del views['tmp_col_night']
del night_records['tmp_col_night']
max(night_records['hour'])

3.0

Рассчитаем минимальный час для строк, где время дня - "morning"

In [13]:
views['tmp_col_morning'] = views.apply(lambda x: x['time'] if x['daytime'] == 'morning' else None, axis=1)
morning_records = views.dropna(subset=['tmp_col_morning'])
del views['tmp_col_morning']
del morning_records['tmp_col_morning']
min(morning_records['hour'])

8.0

Выясним, кто посещал страницу в эти часы

In [14]:
night_records[['datetime', 'daytime']].head()

Unnamed: 0_level_0,datetime,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1
artem,2020-04-18 00:30:45.051569,night
konstantin,2020-04-19 00:17:22.720860,night
konstantin,2020-04-19 00:17:28.624573,night
konstantin,2020-04-19 00:43:03.766276,night
konstantin,2020-04-19 00:44:18.631684,night


In [15]:
morning_records[['datetime', 'daytime', 'time']].head()

Unnamed: 0_level_0,datetime,daytime,time
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
konstantin,2020-04-18 10:53:52.623447,morning,10:53:52.623447
maxim,2020-04-18 10:56:55.833899,morning,10:56:55.833899
konstantin,2020-04-19 10:48:18.715956,morning,10:48:18.715956
aleksey,2020-04-22 10:45:25.601360,morning,10:45:25.601360
artem,2020-04-24 09:42:47.598208,morning,09:42:47.598208


Рассчитаем моду для часа и времени дня

In [16]:
print(f"Мода часа: {views['hour'].mode()[0]}, мода времени дня: {views['time'].mode()[0]}")

Мода часа: 22.0, мода времени дня: 00:00:13.222265


Покажем три самых ранних и самых поздних часа дня и соответствующие им имена пользователей 

In [17]:
views['time_second'] = views['time'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
print('Самые ранние:')
print((views.nsmallest(3, 'time_second'))[['time']])
print('Самые поздние:')
print((views.nlargest(3, 'time_second'))[['time']])

Самые ранние:
                      time
user                      
valentina  00:00:13.222265
valentina  00:01:05.153738
pavel      00:01:27.764025
Самые поздние:
                      time
user                      
alexander  23:59:38.758438
valentina  23:58:56.754866
pavel      23:54:54.260791


Используем метод для получения базовой статистики по столбцам

In [18]:
views.describe()

Unnamed: 0,datetime,year,month,day,hour,minute,second,time_second
count,1075,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0,1075.0
mean,2020-05-10 09:30:02.870843904,2020.0,4.871628,13.548837,16.254884,29.603721,29.493953,60323.298605
min,2020-04-17 12:01:08.463179,2020.0,4.0,1.0,0.0,0.0,0.0,13.0
25%,2020-05-10 01:13:52.574879488,2020.0,5.0,11.0,13.0,14.0,14.0,49846.5
50%,2020-05-11 22:48:40.637413888,2020.0,5.0,13.0,19.0,29.0,30.0,69585.0
75%,2020-05-14 14:44:35.500612608,2020.0,5.0,15.0,22.0,46.0,45.0,79789.5
max,2020-05-22 10:36:14.662600,2020.0,5.0,30.0,23.0,59.0,59.0,86378.0
std,,0.0,0.334659,4.906973,6.956112,17.677885,17.412102,25026.754532


Рассчитаем наиболее популярный интервал посещений, используя межквартильные интервалы

In [19]:
hour_stats = views['hour'].describe()
q1 = hour_stats['25%']
q3 = hour_stats['75%']  
iqr = q3 - q1
print(f"Первый квартиль: {q1}")
print(f"Третий квартиль: {q3}")
print(f"Межквартильный размах: {iqr}")

Первый квартиль: 13.0
Третий квартиль: 22.0
Межквартильный размах: 9.0
