In [138]:
import json
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import prophet
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [139]:
def exponential_smoothing(series, alpha):
    """Apply simple exponential smoothing to a sequence of numbers.

    The first output equals the first input; every following output is
    ``alpha * current + (1 - alpha) * previous_output``.

    Parameters
    ----------
    series : iterable of numbers
        Values in chronological order (list, numpy array, pandas Series, ...).
        The values are consumed positionally, so a pandas Series with a
        non-zero-based index is handled correctly.
    alpha : float
        Smoothing factor; larger values give more weight to the newest point.

    Returns
    -------
    list
        Smoothed values, same length as ``series`` (empty list for empty input).
    """
    values = iter(series)
    try:
        smoothed = [next(values)]
    except StopIteration:
        # Nothing to smooth: return an empty result instead of raising IndexError.
        return []

    for value in values:
        smoothed.append(alpha * value + (1 - alpha) * smoothed[-1])
    return smoothed

def get_first_day_of_month(date):
    """Return a copy of ``date`` with its day component set to 1.

    ``date`` must provide a ``replace(day=...)`` method (for example
    ``datetime.date``, ``datetime.datetime`` or ``pandas.Timestamp``).
    All other components of the value are left as they were.
    """
    month_start = date.replace(day=1)
    return month_start

In [140]:
def get_all_dates():
    """Return the fixed history window as a pandas ``DatetimeIndex``.

    The index runs from 2022-01-01 00:00:00 to 2023-01-31 00:00:00 inclusive,
    one entry per calendar day (the default frequency of ``pd.date_range``).
    """
    return pd.date_range(start='2022-01-01 00:00:00', end='2023-01-31 00:00:00')

# Get the first day of the month as a datetime.datetime (placeholder: no implementation follows in this cell)



In [141]:
def rolling_mean(series, value):
    """Compute a moving average over a fixed-size window.

    Parameters
    ----------
    series : pandas Series/DataFrame, or any 1-D array-like
        Input data. Objects that do not provide ``.rolling`` (lists, numpy
        arrays, ...) are wrapped in a ``pd.Series`` first; pandas objects are
        used as they are, so their index is preserved.
    value : int
        Window size, i.e. the number of observations averaged per output point.

    Returns
    -------
    pandas object
        Rolling mean with the same length as the input. The first
        ``value - 1`` entries are NaN because their window is incomplete.
    """
    if not hasattr(series, 'rolling'):
        # Plain sequences and numpy arrays have no .rolling(); wrap them.
        series = pd.Series(series)
    return series.rolling(window=value).mean()

In [142]:
def make_series(df,
                category,
                is_exponential_smoothing=False,
                alpha=0.25,
                is_rolling_mean=False,
                rolling_value=20
                ):
    """Build a daily series of mean normalized cashback for the given categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``category``, ``day`` and
        ``normalized_cashback``. ``day`` may hold datetimes or date strings;
        it is parsed with ``pd.to_datetime``.
    category : str or list of str
        One category name or several; rows of any listed category are used.
    is_exponential_smoothing : bool
        Smooth the result with ``exponential_smoothing(values, alpha)``.
    alpha : float
        Smoothing factor used when ``is_exponential_smoothing`` is true.
    is_rolling_mean : bool
        Smooth the result with ``rolling_mean`` (only consulted when
        ``is_exponential_smoothing`` is false).
    rolling_value : int
        Window size used when ``is_rolling_mean`` is true.

    Returns
    -------
    numpy.ndarray or list
        One value per date returned by ``get_all_dates()``, in the same order.
        Days without any rows get 0. With exponential smoothing the result is
        whatever ``exponential_smoothing`` returns (a list); otherwise it is a
        float numpy array. With the rolling mean the first
        ``rolling_value - 1`` entries are NaN.
    """
    if isinstance(category, str):
        category = [category]

    # Keep only the requested categories and the two columns we need.
    selected = df.loc[df['category'].isin(category), ['day', 'normalized_cashback']]

    # Compare real datetimes instead of string representations, so the match
    # does not depend on how the 'day' column happens to be formatted.
    days = pd.to_datetime(selected['day'])

    # One pass over the data: day -> array of cashback values for that day.
    cashback_by_day = {
        day: group.to_numpy()
        for day, group in selected['normalized_cashback'].groupby(days)
    }

    # Average each day's values; days absent from the data contribute 0.
    values = np.array(
        [
            np.mean(cashback_by_day[date]) if date in cashback_by_day else 0.0
            for date in get_all_dates()
        ],
        dtype=float,
    )

    if is_exponential_smoothing:
        values = exponential_smoothing(values, alpha)
    elif is_rolling_mean:
        # rolling_mean needs a pandas object; convert back to a numpy array so
        # the unsmoothed and rolling branches return the same type.
        values = np.asarray(rolling_mean(pd.Series(values), rolling_value))
    return values


In [143]:
def test_stationarity(timeseries):
    """Run the augmented Dickey-Fuller test and return its p-value.

    ``timeseries`` is converted to a ``pd.Series`` and passed to
    ``adfuller(..., autolag='AIC')``. A header line is printed; the collected
    statistics themselves are not printed, only the p-value is returned.
    """
    observations = pd.Series(timeseries)

    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(observations, autolag='AIC')

    # The first four items of the result are the headline statistics.
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = dict(zip(labels, adf_result[0:4]))

    # The fifth item maps significance levels to critical values.
    summary.update({
        'Critical Value (%s)' % level: critical_value
        for level, critical_value in adf_result[4].items()
    })

    report = pd.Series(summary)
    return report['p-value']

In [144]:
def get_predictions(data, predictions, forecast_start='2023-02-08'):
    """Fit a Prophet model on the history window and return the forecast tail.

    Parameters
    ----------
    data : array-like
        One observation per date returned by ``get_all_dates()``, in the same
        order (the lengths must match, otherwise building the frame fails).
    predictions : int
        Number of daily steps to forecast past the end of the history.
    forecast_start : str or datetime-like, default '2023-02-08'
        Only forecast rows dated on or after this value are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds`` (date) and ``y`` (Prophet's ``yhat``), restricted to
        ``ds >= forecast_start``. The index of the forecast frame is kept.
    """
    history = pd.DataFrame({'ds': get_all_dates(), 'y': data})

    model = prophet.Prophet()
    # NOTE(review): the seasonality is named 'monthly' but uses a 300-day
    # period with 50 Fourier terms — confirm this is intentional tuning and
    # not a typo for a ~30.5-day period.
    model.add_seasonality(name='monthly', period=300, fourier_order=50)
    model.fit(history)

    # Extend the history by `predictions` daily steps and forecast them all.
    future = model.make_future_dataframe(periods=predictions, freq='D')
    forecast = model.predict(future)

    df_forecast = pd.DataFrame({'ds': pd.to_datetime(forecast['ds']), 'y': forecast['yhat']})

    # Drop the in-sample part and any forecast days before the cutoff.
    return df_forecast[df_forecast['ds'] >= pd.Timestamp(forecast_start)]
        


In [145]:
def denormalize(value, mean, std):
    """Undo a z-score normalization and round the result to two decimals.

    Computes ``value * std + mean`` and rounds it with the built-in ``round``.
    """
    restored = value * std + mean
    return round(restored, 2)

In [146]:
# Load the data
# ans_df: table read from the task CSV; its 'day' column is parsed to datetimes.
# Later cells also read its 'merchant_name' and 'cashback' columns.
ans_df = pd.read_csv('tasks/merch_cb_hack_8_9.csv')
ans_df['day'] = pd.to_datetime(ans_df['day'])
# df: source data passed to make_series, which reads 'category', 'day' and 'normalized_cashback'.
df = pd.read_csv('data.csv')

# categories: JSON object indexed by merchant name in the forecasting loop
# (presumably merchant -> category name or list of names; verify against the file).
with open('test_categories.json', 'r', encoding="utf-8") as file:
    categories = json.load(file)

# Distinct merchant names, in order of first appearance in ans_df.
organisations = ans_df['merchant_name'].unique()

In [147]:
# Forecast every merchant and add the days that are missing from ans_df.
# New rows are collected in a list and concatenated once at the end: this
# avoids the private DataFrame._append API and the quadratic cost of growing
# a frame row by row.
new_rows = []
for organisation in organisations:
    # 28 daily steps are forecast from a series smoothed with alpha=0.2.
    series = make_series(df, categories[organisation], is_exponential_smoothing=True, alpha=0.2)
    predictions = get_predictions(series, 28)

    # Mean and standard deviation of this merchant's known cashback values,
    # used to map the normalized forecast back to the cashback scale.
    # NOTE(review): std is NaN for a merchant with a single known row, which
    # would make every denormalized value NaN — confirm this cannot happen.
    organisation_rows = ans_df[ans_df['merchant_name'] == organisation]
    mean = organisation_rows['cashback'].mean()
    std = organisation_rows['cashback'].std()

    # Keep only forecast days that the merchant does not already have.
    missing = predictions[~predictions['ds'].isin(organisation_rows['day'])]

    # NOTE(review): only 'day' was parsed to datetime on load, so existing
    # 'month' values are presumably strings while new ones are Timestamps —
    # check the format of that column in the saved CSV.
    for date, value in zip(missing['ds'], missing['y']):
        new_rows.append(
            {'merchant_name': organisation,
             'day': date,
             'month': get_first_day_of_month(date),
             'cashback': denormalize(value, mean, std)})

if new_rows:
    ans_df = pd.concat([ans_df, pd.DataFrame(new_rows)], ignore_index=True)

# Sort by merchant name, then by ascending date.
ans_df = ans_df.sort_values(by=['merchant_name', 'day'])

20:57:17 - cmdstanpy - INFO - Chain [1] start processing
20:57:17 - cmdstanpy - INFO - Chain [1] done processing
20:57:18 - cmdstanpy - INFO - Chain [1] start processing
20:57:18 - cmdstanpy - INFO - Chain [1] done processing
20:57:18 - cmdstanpy - INFO - Chain [1] start processing
20:57:19 - cmdstanpy - INFO - Chain [1] done processing
20:57:19 - cmdstanpy - INFO - Chain [1] start processing
20:57:19 - cmdstanpy - INFO - Chain [1] done processing
20:57:20 - cmdstanpy - INFO - Chain [1] start processing
20:57:20 - cmdstanpy - INFO - Chain [1] done processing
20:57:20 - cmdstanpy - INFO - Chain [1] start processing
20:57:21 - cmdstanpy - INFO - Chain [1] done processing
20:57:21 - cmdstanpy - INFO - Chain [1] start processing
20:57:21 - cmdstanpy - INFO - Chain [1] done processing
20:57:22 - cmdstanpy - INFO - Chain [1] start processing
20:57:22 - cmdstanpy - INFO - Chain [1] done processing
20:57:22 - cmdstanpy - INFO - Chain [1] start processing
20:57:22 - cmdstanpy - INFO - Chain [1]

In [148]:
# Save the result (index=False keeps the DataFrame's row index out of the file)
ans_df.to_csv('ans.csv', index=False)