In [60]:

import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
import scipy.stats as scs


In [61]:
file_path=Path().cwd().joinpath('data').joinpath('test_data.xlsx')

In [62]:
def getyour_xls(path:str=file_path):
    df_dirt=pd.read_excel(file_path)
    df=df_dirt[['date','visiters']]
    df = df.set_index('date')
    df=df.loc[df['visiters'] != 'close']
    df.index = pd.to_datetime(df.index)
    df["hour"] = df.index.hour
    df["weekday"] = df.index.weekday
    df['is_weekend'] = df.weekday.isin([5,6])*1
    df=df.dropna()
    return df

data=(getyour_xls())
data

Unnamed: 0_level_0,visiters,hour,weekday,is_weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01 08:00:00,2,8,6,1
2023-01-01 08:15:00,0,8,6,1
2023-01-01 08:30:00,9,8,6,1
2023-01-01 08:45:00,0,8,6,1
2023-01-01 09:00:00,8,9,6,1
...,...,...,...,...
2023-01-14 17:00:00,30,17,5,1
2023-01-14 17:15:00,26,17,5,1
2023-01-14 17:30:00,22,17,5,1
2023-01-14 17:45:00,18,17,5,1


In [63]:
def fill_dataframe(df, start_date, end_date, frequency='M', value=0):
    """
    """
    date_range = pd.date_range(start=start_date, end=end_date, freq=frequency)
    for date in date_range:
        if date not in df.index:
            df.loc[date] = value


In [64]:
fill_dataframe(data,'2023-01-14 18:00:00.00','2023-01-14 22:00:00','15T')

In [65]:
data

Unnamed: 0_level_0,visiters,hour,weekday,is_weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01 08:00:00,2,8,6,1
2023-01-01 08:15:00,0,8,6,1
2023-01-01 08:30:00,9,8,6,1
2023-01-01 08:45:00,0,8,6,1
2023-01-01 09:00:00,8,9,6,1
...,...,...,...,...
2023-01-14 21:00:00,0,0,0,0
2023-01-14 21:15:00,0,0,0,0
2023-01-14 21:30:00,0,0,0,0
2023-01-14 21:45:00,0,0,0,0


In [66]:
def code_mean(data, cat_feature, real_feature):
    """
    Возвращает словарь, где ключами являются уникальные категории признака cat_feature, 
    а значениями - средние по real_feature
    """
    return dict(data.groupby(cat_feature)[real_feature].mean())

def prepareData(data, split_date, lag_start=5, lag_end=20, test_size=0.15):

    data = pd.DataFrame(data.copy())

    # добавляем лаги исходного ряда в качестве признаков
    for i in range(lag_start, lag_end):
        data["lag_{}".format(i)] = data.visiters.shift(i)

    data["hour"] = data.index.hour
    data["weekday"] = data.index.weekday
    data['is_weekend'] = data.weekday.isin([5, 6]) * 1

    # считаем средние только по тренировочной части, чтобы избежать лика
    data['weekday_average'] = data['weekday'].map(code_mean(data.loc[:split_date], 'weekday', "visiters"))
    data["hour_average"] = data['hour'].map(code_mean(data.loc[:split_date], 'hour', "visiters"))

    # выкидываем закодированные средними признаки
    data.drop(["hour", "weekday"], axis=1, inplace=True)

    data = data.dropna()
    
    # разбиваем весь датасет на тренировочную и тестовую выборку
    X_train = data.loc[data.index <= split_date].drop(["visiters"], axis=1)
    y_train = data.loc[data.index <= split_date]["visiters"]
    X_test = data.loc[data.index > split_date].drop(["visiters"], axis=1)
    y_test = data.loc[data.index > split_date]["visiters"]

    return X_train, X_test, y_train, y_test


In [67]:
code_mean(data, "weekday", "visiters")

{0: 0.0,
 2: 8.768292682926829,
 3: 10.304878048780488,
 4: 9.707317073170731,
 5: 9.878048780487806,
 6: 9.0}

In [76]:
from sklearn.linear_model import LassoLars
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
X_train, X_test, y_train, y_test = prepareData(data,split_date="2023-01-14 18:00:00.00" ,test_size=0.3, lag_start=12, lag_end=48)
lr = RandomForestRegressor()
lr.fit(X_train, y_train)
prediction = lr.predict(X_test)
print(f"Ошибка в кол-во человек составляет {round(mean_absolute_error(prediction, y_test))}")

Ошибка составляет 10


In [77]:
lr.predict(X_test)

array([10.28, 11.17,  9.32])

In [70]:
X_test.index.strftime('%Y-%m-%d %H:%M:%S').tolist()

['2023-01-14 18:15:00', '2023-01-14 18:30:00', '2023-01-14 18:45:00']