In [None]:
# Линейная регрессия с измененной функцией и расчетом пропущенных значений в процентах. Случайная вырезка
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import random
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split



'''
Блок функций
'''

def try_parsing_date(text):
    """Parse ``text`` as a timestamp using the first matching known format.

    The formats are tried in order: ``'%Y-%m-%d %H:%M:%S'`` and then
    ``'%Y-%m-%d'``.

    Returns:
        The value produced by ``pd.to_datetime`` for the first format that
        does not raise ``ValueError``.

    Raises:
        ValueError: if none of the formats can parse ``text``.
    """
    candidate_formats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    for pattern in candidate_formats:
        try:
            parsed = pd.to_datetime(text, format=pattern)
        except ValueError:
            # This pattern does not fit; move on to the next one.
            continue
        return parsed
    raise ValueError('no valid date format found')


def _neighbouring_year_values(frame, idx, years=3):
    """Collect ``P_l`` readings at the same timestamp in adjacent years.

    For every offset of 1..``years`` years, the value at ``idx`` minus the
    offset is taken when that timestamp exists in ``frame.index``; otherwise
    the value at ``idx`` plus the offset; otherwise ``np.nan``. A timestamp
    that exists but holds NaN contributes NaN (no further fallback is tried).

    Returns:
        A list of ``years`` values, one per offset.
    """
    values = []
    for i in range(1, years + 1):
        earlier = idx - pd.DateOffset(years=i)
        if earlier in frame.index:
            values.append(frame.loc[earlier, 'P_l'])
            continue
        later = idx + pd.DateOffset(years=i)
        if later in frame.index:
            values.append(frame.loc[later, 'P_l'])
        else:
            values.append(np.nan)
    return values


def fill_with_linear_regression(chunk):
    """Fill missing ``P_l`` values from readings taken in neighbouring years.

    A ``LinearRegression`` is trained on every row whose own ``P_l`` and all
    three neighbouring-year values are present (features: the neighbouring
    values, target: the row's ``P_l``). Each missing row is then filled, in
    index order, by the first strategy that applies:

    1. the regression prediction, when all three neighbouring values exist
       and a model could be trained;
    2. the mean of whichever neighbouring values exist;
    3. linear interpolation of the whole ``P_l`` column.

    Args:
        chunk: DataFrame with a ``P_l`` column, indexed by timestamps that
            support ``pd.DateOffset`` arithmetic.

    Returns:
        A filled copy of ``chunk``; the input is not modified.
    """
    filled_chunk = chunk.copy()
    data, target = [], []

    # Build the training set from rows that are fully observed.
    for idx in tqdm(filled_chunk.index, total=len(filled_chunk)):
        current_value = filled_chunk.loc[idx, 'P_l']
        if np.isnan(current_value):
            continue
        neighbours = _neighbouring_year_values(filled_chunk, idx)
        if not np.isnan(neighbours).any():
            data.append(neighbours)
            target.append(current_value)

    # Fitting on an empty training set raises, so the model is only built
    # when at least one complete row exists; otherwise the fallbacks below
    # handle every gap.
    regressor = None
    if data:
        regressor = LinearRegression()
        regressor.fit(data, target)

    missing_index = filled_chunk.index[filled_chunk['P_l'].isnull()]
    for idx in tqdm(missing_index, total=len(missing_index)):
        neighbours = _neighbouring_year_values(filled_chunk, idx)
        available_values = [val for val in neighbours if not np.isnan(val)]

        if regressor is not None and len(available_values) == len(neighbours):
            filled_chunk.at[idx, 'P_l'] = regressor.predict([neighbours])[0]
        elif available_values:
            # Not enough data for the regression: average the known years.
            filled_chunk.at[idx, 'P_l'] = np.mean(available_values)
        else:
            # No neighbouring data at all: interpolate linearly. The result is
            # assigned back explicitly because an in-place call on the
            # selected column does not update the frame under Copy-on-Write.
            filled_chunk['P_l'] = filled_chunk['P_l'].interpolate(method='linear')

    return filled_chunk


def calculate_mape(df_orig, df_test, drop_index):
    """Return the mean absolute percentage error over the removed timestamps.

    The error is measured relative to the true (original) values:
    ``mean(|actual - imputed| / |actual|) * 100``.

    Args:
        df_orig: frame holding the true ``P_l`` values, with ``time`` either
            as the index name or as a column.
        df_test: frame holding the imputed ``P_l`` values, same layout and
            row order as ``df_orig``.
        drop_index: iterable of ``time`` values that were blanked out.

    Returns:
        The MAPE in percent; NaN when ``drop_index`` selects no rows.

    Neither input frame is modified.
    """
    # Work on positionally indexed copies so the caller's frames keep their
    # index. An existing 'time' column is kept and the old index discarded.
    orig = df_orig.reset_index(drop='time' in df_orig.columns)
    test = df_test.reset_index(drop='time' in df_test.columns)

    actual = orig.loc[orig['time'].isin(drop_index), 'P_l']
    imputed = test.loc[test['time'].isin(drop_index), 'P_l']

    return ((actual - imputed) / actual).abs().mean() * 100

'''
Блок расчетов
'''

df = pd.read_csv('/content/filled_P_l.csv')


df['time'] = df['time'].apply(try_parsing_date)
df.set_index('time', inplace=True)
# Put the series on a regular 5-minute grid. '5min' is used instead of the
# deprecated 'T' minute alias.
df = df.asfreq('5min')
df_without_nan = df.dropna()

# Share of rows to blank out, in percent of all rows (e.g. 10 for 10%);
# 0 removes nothing.
percentage_of_missing_values = 0

# Total number of rows
total_values = len(df)

# Number of values to remove
num_values_to_remove = int(total_values * percentage_of_missing_values / 100)

# Pick the timestamps to remove at random among rows without NaN
drop_index = random.sample(df_without_nan.index.tolist(), num_values_to_remove)

# Copy of the DataFrame used for the experiment
df_test = df.copy()

# Blank out 'P_l' at the selected timestamps
df_test.loc[drop_index, 'P_l'] = np.nan

# Fill the gaps
df_test = fill_with_linear_regression(df_test)

# Compute MAPE
mape = calculate_mape(df, df_test, drop_index)
print(f'MAPE = {mape}')

# Visualisation
plt.figure(figsize=(14, 7))
df_test['P_l'].plot(title="Dataset with Filled Values using Linear Regression")
plt.xlabel("Time")
plt.ylabel("P_l Value")
plt.show()

# Save to CSV
df_test.to_csv('/content/imputed_filled_P_l_LR.csv')

In [None]:
# Расчет метрик
def calculate_metrics(df_orig, df_imputed, drop_index):
    """Compute error metrics of the imputation over the removed timestamps.

    Args:
        df_orig: frame holding the true ``P_l`` values, with ``time`` either
            as the index name or as a column.
        df_imputed: frame holding the imputed ``P_l`` values, same layout and
            row order as ``df_orig``.
        drop_index: iterable of ``time`` values that were blanked out.

    Returns:
        Tuple ``(rmse, r2, mae, mape, wmape)``; MAPE and WMAPE are in percent.
        All five are NaN when ``drop_index`` selects no rows.

    Neither input frame is modified, so the function can be called repeatedly
    on the same frames.
    """
    # Work on positionally indexed copies so the caller's frames keep their
    # index. An existing 'time' column is kept and the old index discarded.
    orig = df_orig.reset_index(drop='time' in df_orig.columns)
    imputed = df_imputed.reset_index(drop='time' in df_imputed.columns)

    y_true = orig.loc[orig['time'].isin(drop_index), 'P_l']
    y_pred = imputed.loc[imputed['time'].isin(drop_index), 'P_l']

    # Nothing was removed: the metrics are undefined, and the sklearn helpers
    # reject empty input.
    if y_true.empty:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    y_true_mean = y_true.mean()

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # R^2
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true_mean) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    # MAE
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    # WMAPE
    wmape = np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true)) * 100

    return rmse, r2, mae, mape, wmape

# Evaluate the imputation with the function defined above and report each metric
rmse, r2, mae, mape, wmape = calculate_metrics(df, df_test, drop_index)
for metric_label, metric_value in (
    ('RMSE', rmse),
    ('R-squared', r2),
    ('MAE', mae),
    ('MAPE', mape),
    ('WMAPE', wmape),
):
    print(f'{metric_label} = {metric_value}')

In [None]:
# Линейная регрессия с измененной функцией и расчетом пропущенных значений в процентах. Продолжительная вырезка
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from datetime import datetime
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.dates import date2num

# strptime-style pattern: date, time and UTC offset (%z).
# NOTE(review): not referenced anywhere in the visible code — confirm before removing.
format_string = '%Y-%m-%d %H:%M:%S%z'

def create_error_rate_data(initial_df, percent_gaps):
    """Blank out one run of consecutive valid rows to simulate a long gap.

    ``percent_gaps`` percent of the rows without NaN are selected as a single
    run of consecutive valid rows starting at a random position, and ``P_l``
    is set to NaN in the test copy for every row whose ``time`` matches one
    of the selected rows.

    Args:
        initial_df: DataFrame with ``time`` and ``P_l`` columns.
        percent_gaps: share of the valid rows to remove, in percent.

    Returns:
        Tuple ``(df_orig, df_test, drop_indexes)``: an untouched copy of the
        input, the copy with the gap, and the index labels of the blanked rows.
    """
    df_orig = initial_df.copy()
    df_test = initial_df.copy()

    valid_rows = df_orig.dropna()
    len_deleted = int(len(valid_rows) * (percent_gaps / 100))
    start_position = random.randint(0, len(valid_rows) - len_deleted)

    # The window is positional. After dropna() the index labels are no longer
    # 0..n-1, so matching the positions against labels would remove fewer
    # rows than requested and could never reach the end of the data.
    removed_times = valid_rows['time'].iloc[start_position:start_position + len_deleted].tolist()

    gap_mask = df_test['time'].isin(removed_times)
    drop_indexes = df_test.index[gap_mask].tolist()
    df_test.loc[gap_mask, 'P_l'] = np.nan
    return df_orig, df_test, drop_indexes

def fill_with_linear_regression(df, time_window_years=3):
    """Fill missing ``P_l`` values by regressing over neighbouring years.

    For every timestamp with a missing ``P_l``, the non-NaN values found at
    the same timestamp 1..``time_window_years`` years earlier and later are
    collected. A ``LinearRegression`` of value against signed year offset is
    fitted to them and evaluated at offset 0. When no neighbouring value
    exists, the whole ``P_l`` column is linearly interpolated instead.

    Args:
        df: DataFrame with ``time`` and ``P_l`` columns.
        time_window_years: how many years to look in each direction.

    Returns:
        A filled copy of ``df`` with ``time`` converted to datetimes and kept
        as a column; the input is not modified.
    """
    filled_df = df.copy()
    filled_df['time'] = pd.to_datetime(filled_df['time'])
    filled_df = filled_df.set_index('time')

    missing_times = filled_df.index[filled_df['P_l'].isna()]
    for time_point in tqdm(missing_times):
        data = []
        target = []
        for i in range(1, time_window_years + 1):
            neighbours = (
                (-i, time_point - pd.DateOffset(years=i)),
                (i, time_point + pd.DateOffset(years=i)),
            )
            for year_offset, neighbour_time in neighbours:
                if neighbour_time not in filled_df.index:
                    continue
                neighbour_value = filled_df.loc[neighbour_time, 'P_l']
                if not np.isnan(neighbour_value):
                    data.append([year_offset])
                    target.append(neighbour_value)

        if data:
            regressor = LinearRegression()
            regressor.fit(np.array(data), target)
            filled_df.at[time_point, 'P_l'] = regressor.predict(np.array([[0]]))[0]
        else:
            # No neighbouring data: interpolate linearly. The result is
            # assigned back explicitly because an in-place call on the
            # selected column does not update the frame under Copy-on-Write.
            filled_df['P_l'] = filled_df['P_l'].interpolate(method='linear')

    return filled_df.reset_index()


def calculate_metrics(df_orig, df_test, drop_index):
    """Compute error metrics of the imputation over the removed rows.

    Args:
        df_orig: frame holding the true ``P_l`` values.
        df_test: frame holding the imputed ``P_l`` values, sharing index
            labels with ``df_orig``.
        drop_index: index labels of the rows that were blanked out.

    Returns:
        Tuple ``(rmse, r2, mae, mape, wmape)``; MAPE and WMAPE are in percent.
        All five are NaN when ``drop_index`` is empty.
    """
    y_true = df_orig.loc[drop_index, 'P_l']
    y_pred = df_test.loc[drop_index, 'P_l']

    # Nothing was removed: the metrics are undefined, and the sklearn helpers
    # reject empty input.
    if y_true.empty:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    # R^2 computed from the residual and total sums of squares.
    y_true_mean = y_true.mean()
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true_mean) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    wmape = np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true)) * 100

    return rmse, r2, mae, mape, wmape

# Read the data
df_with_season = pd.read_csv('/content/filled_P_l.csv')
# Percentage of the valid rows to cut out; the plot title below reuses it
percent_gaps_with_season = 10
df_with_season, df_test_with_season, drop_index_with_season = create_error_rate_data(df_with_season, percent_gaps_with_season)

# Fill the gaps with linear regression
filled_df_with_season = fill_with_linear_regression(df_test_with_season)

# Compute the metrics
rmse, r2, mae, mape, wmape = calculate_metrics(df_with_season, filled_df_with_season, drop_index_with_season)

# Print the results
print(f'RMSE = {rmse}')
print(f'R-squared = {r2}')
print(f'MAE = {mae}')
print(f'MAPE = {mape}')
print(f'WMAPE = {wmape}')

# Plot the results
plt.figure(figsize=(14, 7))
plt.plot(date2num(filled_df_with_season['time']), filled_df_with_season['P_l'], linewidth=0.1)
# The title reports the gap share actually used rather than a hard-coded figure
plt.title(f"Dataset with Filled Values using Linear Regression. Count gaps = {percent_gaps_with_season}%")
plt.xlabel("Time")
plt.ylabel("P_l Value")
plt.show()
