In [25]:
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import re

from google.colab import drive
import warnings


warnings.filterwarnings("ignore")
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Универсальное регулярное выражение для вашего формата
log_pattern = r'(\S+) - - \[([^\]]+)\] "(\S+) (\S+) ([^"]+)" (\d+) (\d+) (\S+)'

# Создаем пустой список для хранения распарсенных данных
parsed_data = []

# Открываем файл и читаем построчно
with open('/content/drive/MyDrive/access.log', 'r') as file:
    for line in file:
        # Ищем совпадения с нашим шаблоном в каждой строке
        match = re.match(log_pattern, line.strip())
        if match:
            # Если строка соответствует шаблону, извлекаем группы
            parsed_data.append(match.groups())
        else:
            print(f"Не удалось распарсить строку: {line.strip()}")  # Для отладки

# Определяем названия колонок для будущего DataFrame
column_names = ['ip', 'timestamp', 'http_method', 'endpoint', 'http_version', 'status_code', 'response_size', 'session_id']

# Создаем DataFrame
if parsed_data:
    df = pd.DataFrame(parsed_data, columns=column_names)
    print(f"Успешно распарсено {len(df)} строк")
    df.head()
else:
    print("Не удалось распарсить ни одной строки. Проверьте формат файла.")
df

Успешно распарсено 239202 строк


Unnamed: 0,ip,timestamp,http_method,endpoint,http_version,status_code,response_size,session_id
0,179.203.10.126,1/Jan/2015:00:00:02 +03:00,POST,/catalog.phtml,HTTP 1.1,200,4407,ID4887
1,179.203.10.126,1/Jan/2015:00:00:13 +03:00,POST,/search.phtml,HTTP 1.1,200,4025,ID4887
2,179.203.10.126,1/Jan/2015:00:00:15 +03:00,POST,/search.phtml,HTTP 1.1,200,3482,ID4887
3,179.203.10.126,1/Jan/2015:00:00:24 +03:00,POST,/catalog.phtml,HTTP 1.1,200,4904,ID4887
4,179.203.10.126,1/Jan/2015:00:00:30 +03:00,POST,/search.phtml,HTTP 1.1,200,2104,ID4887
...,...,...,...,...,...,...,...,...
239197,121.64.17.46,31/Jan/2015:23:59:40 +03:00,POST,/catalog.phtml,HTTP 1.1,200,2760,ID55681
239198,13.190.220.64,31/Jan/2015:23:59:47 +03:00,POST,/search.phtml,HTTP 1.1,200,2778,ID55720
239199,121.64.17.46,31/Jan/2015:23:59:51 +03:00,POST,/search.phtml,HTTP 1.1,200,2881,ID55681
239200,2.57.156.197,31/Jan/2015:23:59:56 +03:00,POST,/search.phtml,HTTP 1.1,200,3288,ID55721


In [27]:
df.describe()

Unnamed: 0,ip,timestamp,http_method,endpoint,http_version,status_code,response_size,session_id
count,239202,239202,239202,239202,239202,239202,239202,239202
unique,50850,239202,2,102,1,1,3000,50850
top,143.167.87.213,31/Jan/2015:23:59:57 +03:00,POST,/catalog.phtml,HTTP 1.1,200,4026,ID22347
freq,39,1,219864,110410,239202,239202,112,39


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239202 entries, 0 to 239201
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ip             239202 non-null  object
 1   timestamp      239202 non-null  object
 2   http_method    239202 non-null  object
 3   endpoint       239202 non-null  object
 4   http_version   239202 non-null  object
 5   status_code    239202 non-null  object
 6   response_size  239202 non-null  object
 7   session_id     239202 non-null  object
dtypes: object(8)
memory usage: 14.6+ MB


In [29]:
df.drop(['http_version', 'status_code'], inplace=True, axis=1)

df.head()

Unnamed: 0,ip,timestamp,http_method,endpoint,response_size,session_id
0,179.203.10.126,1/Jan/2015:00:00:02 +03:00,POST,/catalog.phtml,4407,ID4887
1,179.203.10.126,1/Jan/2015:00:00:13 +03:00,POST,/search.phtml,4025,ID4887
2,179.203.10.126,1/Jan/2015:00:00:15 +03:00,POST,/search.phtml,3482,ID4887
3,179.203.10.126,1/Jan/2015:00:00:24 +03:00,POST,/catalog.phtml,4904,ID4887
4,179.203.10.126,1/Jan/2015:00:00:30 +03:00,POST,/search.phtml,2104,ID4887


In [30]:
df['http_method'].value_counts()

Unnamed: 0_level_0,count
http_method,Unnamed: 1_level_1
POST,219864
GET,19338


In [31]:
df['session_id'].value_counts()

Unnamed: 0_level_0,count
session_id,Unnamed: 1_level_1
ID22347,39
ID52666,35
ID21489,33
ID38085,33
ID13435,33
...,...
ID55685,1
ID55686,1
ID4831,1
ID4926,1


In [32]:
df['ip'].value_counts()

Unnamed: 0_level_0,count
ip,Unnamed: 1_level_1
143.167.87.213,39
128.6.12.28,35
94.214.254.139,33
185.59.50.102,33
170.48.140.210,33
...,...
146.95.216.18,1
53.186.86.51,1
136.149.171.73,1
74.56.147.165,1


In [33]:
df['endpoint'].value_counts()

Unnamed: 0_level_0,count
endpoint,Unnamed: 1_level_1
/catalog.phtml,110410
/search.phtml,109454
/order.phtml,1801
/addbasket.phtml?id_book=47,1258
/addbasket.phtml?id_book=72,1248
...,...
/addbasket.phtml?id_book=94,96
/addbasket.phtml?id_book=86,96
/addbasket.phtml?id_book=27,94
/addbasket.phtml?id_book=6,90


#Создание признаков

session_duration: session_end - session_start (в секундах) - 1 признак.

num_requests: count() количество запросов в сессии - 2 признак.

num_unique_endpoints: nunique() количество уникальных страниц - 3 признак.

ratio_search_to_catalog: отношение запросов к поиску к запросам к каталогу - 4 признак.

total_traffic: sum(size) общего трафика сессии - 5 признак.

is_conversion: был ли запрос к addbasket.phtml или order.phtml (если есть) - 6 признак.

session_weekday: день недели сессии - 7 признак.

avg_time_between_requests: среднее время между запросами пользователя - 8 признак.

activity_intensity: интенсивность сессии - 9 признак

In [34]:
df['datetime'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')

df

Unnamed: 0,ip,timestamp,http_method,endpoint,response_size,session_id,datetime
0,179.203.10.126,1/Jan/2015:00:00:02 +03:00,POST,/catalog.phtml,4407,ID4887,2015-01-01 00:00:02+03:00
1,179.203.10.126,1/Jan/2015:00:00:13 +03:00,POST,/search.phtml,4025,ID4887,2015-01-01 00:00:13+03:00
2,179.203.10.126,1/Jan/2015:00:00:15 +03:00,POST,/search.phtml,3482,ID4887,2015-01-01 00:00:15+03:00
3,179.203.10.126,1/Jan/2015:00:00:24 +03:00,POST,/catalog.phtml,4904,ID4887,2015-01-01 00:00:24+03:00
4,179.203.10.126,1/Jan/2015:00:00:30 +03:00,POST,/search.phtml,2104,ID4887,2015-01-01 00:00:30+03:00
...,...,...,...,...,...,...,...
239197,121.64.17.46,31/Jan/2015:23:59:40 +03:00,POST,/catalog.phtml,2760,ID55681,2015-01-31 23:59:40+03:00
239198,13.190.220.64,31/Jan/2015:23:59:47 +03:00,POST,/search.phtml,2778,ID55720,2015-01-31 23:59:47+03:00
239199,121.64.17.46,31/Jan/2015:23:59:51 +03:00,POST,/search.phtml,2881,ID55681,2015-01-31 23:59:51+03:00
239200,2.57.156.197,31/Jan/2015:23:59:56 +03:00,POST,/search.phtml,3288,ID55721,2015-01-31 23:59:56+03:00


In [35]:
df['response_size'] = df['response_size'].astype(int)
session_features = df.groupby('session_id').agg({
    'datetime': ['min', 'max'],  # session_start и session_end
    'endpoint': ['count', 'nunique'],  # num_requests и num_unique_endpoints
    'response_size': 'sum'  # total_traffic
}).reset_index()

session_features.columns = [
    'session_id',
    'session_start',
    'session_end',
    'num_requests',
    'num_unique_endpoints',
    'total_traffic'
]

# Создаем session_duration (в секундах)
session_features['session_duration'] = (
    session_features['session_end'] - session_features['session_start']
).dt.total_seconds()

# Создаем ratio_search_to_catalog
search_requests = df[df['endpoint'].str.contains('search', case=False, na=False)]
catalog_requests = df[df['endpoint'].str.contains('catalog', case=False, na=False)]

search_counts = search_requests.groupby('session_id').size()
catalog_counts = catalog_requests.groupby('session_id').size()

# Объединяем с основным DataFrame
session_features = session_features.merge(
    search_counts.rename('search_count'),
    on='session_id',
    how='left'
).merge(
    catalog_counts.rename('catalog_count'),
    on='session_id',
    how='left'
)

# Заполняем NaN нулями и вычисляем отношение
session_features['search_count'] = session_features['search_count'].fillna(0)
session_features['catalog_count'] = session_features['catalog_count'].fillna(0)
session_features['ratio_search_to_catalog'] = session_features.apply(
    lambda x: x['search_count'] / x['catalog_count'] if x['catalog_count'] > 0 else 0,
    axis=1
)

# Создаем is_conversion
conversion_sessions = df[
    df['endpoint'].str.contains('addbasket|order', case=False, na=False)
]['session_id'].unique()

session_features['is_conversion'] = session_features['session_id'].isin(conversion_sessions)

# Убираем временные колонки
session_features = session_features.drop(['search_count', 'catalog_count'], axis=1)

session_features

Unnamed: 0,session_id,session_start,session_end,num_requests,num_unique_endpoints,total_traffic,session_duration,ratio_search_to_catalog,is_conversion
0,ID10000,2015-01-04 02:57:26+03:00,2015-01-04 02:58:38+03:00,5,3,19741,72.0,3.0,True
1,ID10001,2015-01-04 02:58:32+03:00,2015-01-04 03:00:01+03:00,6,2,23735,89.0,0.5,False
2,ID10002,2015-01-04 02:58:40+03:00,2015-01-04 02:58:40+03:00,1,1,2513,0.0,0.0,False
3,ID10003,2015-01-04 03:00:25+03:00,2015-01-04 03:00:50+03:00,3,2,9111,25.0,0.5,False
4,ID10004,2015-01-04 03:01:28+03:00,2015-01-04 03:01:28+03:00,1,1,3483,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...
50845,ID9995,2015-01-04 02:50:30+03:00,2015-01-04 02:53:02+03:00,4,3,14170,152.0,0.5,True
50846,ID9996,2015-01-04 02:53:37+03:00,2015-01-04 02:53:48+03:00,3,2,9306,11.0,2.0,False
50847,ID9997,2015-01-04 02:53:45+03:00,2015-01-04 02:57:02+03:00,7,3,24475,197.0,2.0,True
50848,ID9998,2015-01-04 02:54:26+03:00,2015-01-04 02:56:11+03:00,2,1,6863,105.0,0.0,False


In [38]:
# День недели (0-6, где 0 - понедельник)
session_features['session_weekday'] = session_features['session_start'].dt.weekday

# Среднее время между запросами
session_features['avg_time_between_requests'] = session_features['session_duration'] / session_features['num_requests']

# Интенсивность сессии
session_features['activity_intensity'] = session_features['num_requests'] * session_features['total_traffic'] / (session_features['session_duration'] + 1)
session_features

Unnamed: 0,session_id,session_start,session_end,num_requests,num_unique_endpoints,total_traffic,session_duration,ratio_search_to_catalog,is_conversion,session_weekday,avg_time_between_requests,activity_intensity
0,ID10000,2015-01-04 02:57:26+03:00,2015-01-04 02:58:38+03:00,5,3,19741,72.0,3.0,True,6,14.400000,1352.123288
1,ID10001,2015-01-04 02:58:32+03:00,2015-01-04 03:00:01+03:00,6,2,23735,89.0,0.5,False,6,14.833333,1582.333333
2,ID10002,2015-01-04 02:58:40+03:00,2015-01-04 02:58:40+03:00,1,1,2513,0.0,0.0,False,6,0.000000,2513.000000
3,ID10003,2015-01-04 03:00:25+03:00,2015-01-04 03:00:50+03:00,3,2,9111,25.0,0.5,False,6,8.333333,1051.269231
4,ID10004,2015-01-04 03:01:28+03:00,2015-01-04 03:01:28+03:00,1,1,3483,0.0,0.0,False,6,0.000000,3483.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
50845,ID9995,2015-01-04 02:50:30+03:00,2015-01-04 02:53:02+03:00,4,3,14170,152.0,0.5,True,6,38.000000,370.457516
50846,ID9996,2015-01-04 02:53:37+03:00,2015-01-04 02:53:48+03:00,3,2,9306,11.0,2.0,False,6,3.666667,2326.500000
50847,ID9997,2015-01-04 02:53:45+03:00,2015-01-04 02:57:02+03:00,7,3,24475,197.0,2.0,True,6,28.142857,865.277778
50848,ID9998,2015-01-04 02:54:26+03:00,2015-01-04 02:56:11+03:00,2,1,6863,105.0,0.0,False,6,52.500000,129.490566


In [39]:
session_features['session_weekday'].value_counts()

Unnamed: 0_level_0,count
session_weekday,Unnamed: 1_level_1
4,8193
3,8146
5,8145
1,6670
6,6622
0,6542
2,6532
