In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import plotly.graph_objs as go
from plotly.offline import iplot

device = "cpu"

In [2]:

def plot_dataset(df, title):
    data = []
    
    value = go.Scatter(
        x=df.index,
        y=df.value,
        mode="lines",
        name="values",
        marker=dict(),
        text=df.index,
        line=dict(color="rgba(0,0,0, 0.3)"),
    )
    data.append(value)

    layout = dict(
        title=title,
        xaxis=dict(title="Date", ticklen=5, zeroline=False),
        yaxis=dict(title="Value", ticklen=5, zeroline=False),
    )

    fig = dict(data=data, layout=layout)
    iplot(fig)
    

In [3]:

df = pd.read_csv('C:\PYTHON\PythonLab5\dataset.csv')
df.columns = ['data', 'temp_day', 'wind', 'pressure_day', 'temp_even', 'pressure_even']  # 2

nan_value = float("NaN")                             # 3  ( null / None / Nan ) 
df.replace(" ", nan_value, inplace=True)
df = df.dropna()
df[['temp_day', "temp_even"]] = df[['temp_day', "temp_even"]].astype(int)  # 4


In [4]:

df = df.set_index(['data'])
df = df.rename(columns={'temp_day': 'value'})

df.index = pd.to_datetime(df.index)
if not df.index.is_monotonic_increasing:
    df = df.sort_index()
    
plot_dataset(df, title='PJM East (PJME) Region: estimated energy consumption in Megawatts (MW)')
    

In [5]:
def generate_time_lags(df, n_lags):
    df_n = df.copy()
    for n in range(1, n_lags + 1):
        df_n[f"lag{n}"] = df_n["value"].shift(n)
    df_n = df_n.iloc[n_lags:]
    return df_n

input_dim = 100

df_timelags = generate_time_lags(df, input_dim)
df_timelags


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Unnamed: 0_level_0,value,wind,pressure_day,temp_even,pressure_even,lag1,lag2,lag3,lag4,lag5,...,lag91,lag92,lag93,lag94,lag95,lag96,lag97,lag98,lag99,lag100
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-04-14,9,Ю 1м/с,751,6,750,9.0,7.0,6.0,2.0,0.0,...,-16.0,-13.0,-15.0,-16.0,-15.0,-12.0,-16.0,-14.0,-8.0,-10.0
2009-04-15,12,Ю 3м/с,750,8,749,9.0,9.0,7.0,6.0,2.0,...,-20.0,-16.0,-13.0,-15.0,-16.0,-15.0,-12.0,-16.0,-14.0,-8.0
2009-04-16,14,ЮЗ 4м/с,748,10,747,12.0,9.0,9.0,7.0,6.0,...,-18.0,-20.0,-16.0,-13.0,-15.0,-16.0,-15.0,-12.0,-16.0,-14.0
2009-04-17,3,СВ 3м/с,750,0,750,14.0,12.0,9.0,9.0,7.0,...,-16.0,-18.0,-20.0,-16.0,-13.0,-15.0,-16.0,-15.0,-12.0,-16.0
2009-04-18,16,Ю 7м/с,742,14,741,3.0,14.0,12.0,9.0,9.0,...,-4.0,-16.0,-18.0,-20.0,-16.0,-13.0,-15.0,-16.0,-15.0,-12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-25,11,Ш,742,10,742,17.0,24.0,26.0,17.0,16.0,...,28.0,22.0,21.0,23.0,25.0,23.0,25.0,28.0,26.0,21.0
2022-09-26,11,З 2м/с,746,9,748,11.0,17.0,24.0,26.0,17.0,...,24.0,28.0,22.0,21.0,23.0,25.0,23.0,25.0,28.0,26.0
2022-09-27,13,З 1м/с,752,11,752,11.0,11.0,17.0,24.0,26.0,...,23.0,24.0,28.0,22.0,21.0,23.0,25.0,23.0,25.0,28.0
2022-09-28,10,В 4м/с,748,10,746,13.0,11.0,11.0,17.0,24.0,...,27.0,23.0,24.0,28.0,22.0,21.0,23.0,25.0,23.0,25.0


In [8]:
import sklearn
from sklearn.model_selection import train_test_split

def feature_label_split(df, target_col):
    y = df[[target_col]]
    X = df.drop(columns=[target_col])
    return X, y

def train_val_test_split(df, target_col, test_ratio):
    val_ratio = test_ratio / (1 - test_ratio)
    X, y = feature_label_split(df, target_col)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, shuffle=False)
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(df_timelags, 'value', 0.2)