In [7]:
# KNN with season

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import holidays
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import random
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm
from scipy.interpolate import make_interp_spline
   


def add_additional_features(df):
    """Return a copy of ``df`` extended with calendar and holiday features.

    Every feature is derived from the index, which therefore has to be a
    ``DatetimeIndex``. The caller's frame is left untouched.

    Added columns:
        year, week (ISO week number), day_of_week (Monday=0), hour, minute,
        hour_sin / hour_cos, day_of_week_sin / day_of_week_cos,
        week_sin / week_cos (cyclic encodings with periods 24, 7 and 52),
        is_holiday (1 when the calendar day is listed by ``holidays.Italy``
        for the years present in the index, else 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps.

    Returns
    -------
    pandas.DataFrame
        New frame holding the original columns plus the features above.
    """
    # Work on a copy: assigning columns on `df` itself would silently widen
    # the frame the caller still holds.
    featured = df.copy()
    stamps = featured.index

    featured['year'] = stamps.year
    # isocalendar() reports the week as a nullable UInt32 column; convert it to
    # plain int64 so the derived sin/cos columns are ordinary float64.
    featured['week'] = stamps.isocalendar().week.to_numpy(dtype='int64')
    featured['day_of_week'] = stamps.dayofweek
    featured['hour'] = stamps.hour
    featured['minute'] = stamps.minute

    # Cyclic encodings so that e.g. hour 23 and hour 0 end up close together.
    featured['hour_sin'] = np.sin(2 * np.pi * featured['hour'] / 24)
    featured['hour_cos'] = np.cos(2 * np.pi * featured['hour'] / 24)
    featured['day_of_week_sin'] = np.sin(2 * np.pi * featured['day_of_week'] / 7)
    featured['day_of_week_cos'] = np.cos(2 * np.pi * featured['day_of_week'] / 7)
    # With a period of 52, ISO week 53 lands on the same angle as week 1.
    featured['week_sin'] = np.sin(2 * np.pi * featured['week'] / 52)
    featured['week_cos'] = np.cos(2 * np.pi * featured['week'] / 52)

    # Italian public holidays for every year present in the index.
    it_holidays = holidays.Italy(years=featured['year'].unique().tolist())
    # The keys are datetime.date objects; turn them into a DatetimeIndex so the
    # membership test compares like with like.
    holiday_days = pd.to_datetime(list(it_holidays.keys()))

    # Compare calendar days only. A timezone-aware index is reduced to its
    # local wall-clock date first, otherwise it would never match the naive
    # holiday dates.
    days = stamps.normalize()
    if days.tz is not None:
        days = days.tz_localize(None)
    featured['is_holiday'] = days.isin(holiday_days).astype(int)

    return featured

def fill_gaps_with_knn(df, n_neighbors=3, target_column='load_consumption'):
    """Fill missing values of ``target_column`` with KNN imputation.

    Calendar/holiday features are added via ``add_additional_features`` and
    fed, together with every other column of ``df`` (except the helper columns
    'P_l_interval' and 'is_droped'), into ``KNNImputer``. Only rows whose
    target value was missing are overwritten; observed values are returned
    unchanged. The caller's frame is not modified.

    NOTE(review): the features are handed to the imputer unscaled, so
    wide-range columns (e.g. 'year', or a row-counter column present in ``df``)
    weigh far more in the neighbour distance than the sin/cos features, which
    lie in [-1, 1]. Consider scaling if the seasonal features are meant to
    drive the neighbour search.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex containing ``target_column``.
    n_neighbors : int, default 3
        Number of neighbours used by ``KNNImputer``.
    target_column : str, default 'load_consumption'
        Column whose gaps are filled.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``target_column``) with the same index as ``df``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    ValueError
        If the imputer returns a different number of columns than it was
        given, so imputed values cannot be mapped back to ``target_column``.
    """
    # Build the seasonal features on a private copy so the caller's frame
    # never gains helper columns.
    prefilled_df = add_additional_features(df.copy())

    # Positional mask of the rows to fill. A boolean mask (rather than index
    # labels) stays correct when the same timestamp occurs more than once.
    missing_mask = prefilled_df[target_column].isna().to_numpy()
    if not missing_mask.any():
        # Nothing to impute: skip the neighbour search altogether.
        return prefilled_df[[target_column]]

    # Helper columns are excluded from the feature matrix; each one is dropped
    # independently when present.
    features_to_impute = prefilled_df.drop(
        columns=['P_l_interval', 'is_droped'], errors='ignore'
    )

    imputer = KNNImputer(n_neighbors=n_neighbors)
    features_filled = imputer.fit_transform(features_to_impute)

    # Guard the positional mapping below: if the imputer returned fewer columns
    # (scikit-learn documents that features consisting only of missing values
    # are discarded by default), the column positions no longer line up.
    if features_filled.shape[1] != features_to_impute.shape[1]:
        raise ValueError(
            "KNNImputer returned %d columns for %d input columns; a column "
            "without any observed value cannot be imputed"
            % (features_filled.shape[1], features_to_impute.shape[1])
        )

    # Write back only the values that were missing in the target column.
    target_pos = features_to_impute.columns.get_loc(target_column)
    prefilled_df.loc[missing_mask, target_column] = features_filled[missing_mask, target_pos]

    return prefilled_df[[target_column]]


In [8]:
# Load the load-consumption series; the 'time' column is parsed as datetimes
# and used as the index.
# NOTE(review): the printed column list further down shows an 'Unnamed: 0'
# column coming from this CSV. fill_gaps_with_knn does not drop it, so it takes
# part in the KNN feature matrix - confirm that this is intended.
df_with_season = pd.read_csv(
    '/Users/dmitrii/Desktop/PhD/Python/PhD_code_project/Machine-Learning-Techniques-for-Ensuring-the-Health-of-Citizens/notebooks/results/load_consumption_per_Marco1.csv',
    index_col='time',
    parse_dates=['time'],
)
# Keep the index labelled 'time'.
df_with_season = df_with_season.rename_axis(index='time')


In [9]:
# Coerce the index to datetime. read_csv was already asked to parse 'time', so
# this only acts as a safeguard.
df_with_season.index = pd.to_datetime(df_with_season.index)



In [10]:
# List the columns that were read from the CSV.
print(df_with_season.columns)


Index(['Unnamed: 0', 'load_consumption'], dtype='object')


In [11]:
# Fill the gaps in 'load_consumption' with KNN imputation (default
# n_neighbors=3); the returned frame contains only that column.
filled_df = fill_gaps_with_knn(df_with_season)



In [12]:
# Inspect the imputed series (pandas abbreviates long frames when printing).
print(filled_df)

                     load_consumption
time                                 
2017-05-12 00:03:21           24399.6
2017-05-12 00:08:21           24181.4
2017-05-12 00:13:21           23567.1
2017-05-12 00:18:21           23130.0
2017-05-12 00:23:21           22605.6
...                               ...
2023-12-18 13:56:07           31193.0
2023-12-18 14:01:07           31774.8
2023-12-18 14:06:07           33777.5
2023-12-18 14:11:07           33673.7
2023-12-18 14:16:07           33245.4

[556692 rows x 1 columns]


In [13]:
# Persist the imputed series as CSV next to the input file.
filled_df.to_csv("/Users/dmitrii/Desktop/PhD/Python/PhD_code_project/Machine-Learning-Techniques-for-Ensuring-the-Health-of-Citizens/notebooks/results/load_consumption_per_Marco_filled1.csv")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Ensure that the DataFrame's index is of datetime type (no-op when it already is).
filled_df.index = pd.to_datetime(filled_df.index)

# Select the trailing month: every row newer than "latest timestamp minus one
# calendar month". This replaces DataFrame.last('1m'), which is deprecated in
# recent pandas and, where the alias resolves to a month-end offset, only
# reaches back to the end of the previous month rather than a full month.
window_start = filled_df.index.max() - pd.DateOffset(months=1)
last_month = filled_df.loc[filled_df.index > window_start]

# Plotting
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(last_month.index, last_month['load_consumption'], label='Load Consumption')
ax.set_title('Load Consumption Over the Last Month')
ax.set_xlabel('Date')
ax.set_ylabel('Load Consumption')
ax.legend()
ax.grid(True)  # Optional: adds grid lines for better readability
plt.show()
