In [18]:
import pandas as pd
from torch import tensor
from torch.utils.data import Dataset

# Load the raw consumption/temperature table and report missing values per column
df = pd.read_csv('./consumption_and_temperatures.csv')
print(df.isna().sum())


# Keep the timestamp and the NO1 series (the first three columns of the file).
# Selecting by name is robust to column reordering, and .copy() makes the
# subset an independent frame so the column assignments below cannot trigger
# SettingWithCopyWarning.
df = df[['timestamp', 'NO1_consumption', 'NO1_temperature']].copy()

# Parse the timestamp column once and derive all calendar features from it
timestamps = pd.to_datetime(df['timestamp'])
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day
df['hour'] = timestamps.dt.hour

# Consumption of the previous row, used as an autoregressive feature
df["consumption_t-1"] = df["NO1_consumption"].shift(1)

# Reorder columns: features first, target (NO1_consumption) last
new_df = df[['year', 'month', 'day', 'hour', 'NO1_temperature',
             'consumption_t-1', 'NO1_consumption']]

# Remove the first row: shift(1) leaves it without a previous consumption (NaN)
new_df = new_df.iloc[1:]
new_df

timestamp          0
NO1_consumption    0
NO1_temperature    0
NO2_consumption    0
NO2_temperature    0
NO3_consumption    0
NO3_temperature    0
NO4_consumption    0
NO4_temperature    0
NO5_consumption    0
NO5_temperature    0
dtype: int64


Unnamed: 0,year,month,day,hour,NO1_temperature,consumption_t-1,NO1_consumption
1,2017,5,1,1,1.8,3325.431995,3344.690998
2,2017,5,1,2,3.2,3344.690998,3398.359002
3,2017,5,1,3,3.6,3398.359002,3430.220001
4,2017,5,1,4,3.4,3430.220001,3606.750000
5,2017,5,1,5,3.1,3606.750000,3739.876998
...,...,...,...,...,...,...,...
58459,2023,12,31,19,-4.7,5791.351612,5608.067736
58460,2023,12,31,20,-4.6,5608.067736,5469.722792
58461,2023,12,31,21,-4.5,5469.722792,5321.221040
58462,2023,12,31,22,-4.5,5321.221040,5222.770756


In [12]:
from sklearn.model_selection import train_test_split

# Convert DataFrame to NumPy array
data = new_df.to_numpy()

# Split the features (all columns but the last) from the target (last column)
X = data[:, :-1]
y = data[:, -1]

# Add a trailing singleton axis: X -> (rows, features, 1), y -> (rows, 1)
# NOTE(review): a sliding window over X then has shape
# (sequence_length, features, 1) -- confirm the model expects that layout.
X = X.reshape(X.shape[0], X.shape[1], 1)
y = y.reshape(y.shape[0], 1)

# Perform a chronological 60/20/20 train-validation-test split.
# shuffle=False keeps the rows in their original (time) order: the default
# shuffling would scramble the series before it is cut into windows and would
# mix future observations into the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)

# Print the shapes of the splits
print("Train set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (35077, 6, 1) (35077, 1)
Validation set shape: (11693, 6, 1) (11693, 1)
Test set shape: (11693, 6, 1) (11693, 1)


In [16]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over row-aligned features ``X`` and targets ``y``.

    Sample ``i`` is the pair ``(X[i:i + sequence_length], y[i + sequence_length - 1])``:
    a window of ``sequence_length`` consecutive rows together with the target
    of the window's last row. Both are returned as float32 tensors.

    Args:
        X: Sliceable sequence of feature rows (e.g. a NumPy array).
        y: Sequence of targets with one entry per row of ``X``.
        sequence_length: Number of consecutive rows per window (>= 1).

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1 or ``X`` and ``y``
            differ in length.
    """

    def __init__(self, X, y,sequence_length):
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of rows")
        self.sequence_length = sequence_length
        # "+ 1" keeps the final full window (rows len(X) - sequence_length .. len(X) - 1);
        # max() yields an empty dataset when X is shorter than one window.
        n_windows = max(len(X) - sequence_length + 1, 0)
        self.X = [X[start:start + sequence_length] for start in range(n_windows)]
        self.y = y

    def __len__(self):
        """Return the number of full windows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for window ``idx`` as float32 tensors."""
        # Indexing the window list first raises IndexError for out-of-range idx
        window = self.X[idx]
        if idx < 0:
            idx += len(self.X)
        # Use the target of the LAST row in the window; the target of the
        # first row would pair the window with a value from its own start.
        label = self.y[idx + self.sequence_length - 1]
        return tensor(window).float(), tensor(label).float()

In [14]:

class DataFrameDataset(Dataset):
    """Sliding-window dataset built directly from a pandas DataFrame.

    Sample ``i`` is a window of ``sequence_length`` consecutive rows of the
    feature columns, paired with the label of the window's last row. Both are
    returned as float32 tensors.

    Args:
        dataframe: Source DataFrame, rows in sequence order.
        feature_columns: List of column names used as model inputs.
        label_column: Name of the target column.
        sequence_length: Number of consecutive rows per window (>= 1).

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """

    def __init__(self, dataframe, feature_columns, label_column, sequence_length):
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")
        self.num_features = len(feature_columns)
        self.sequence_length = sequence_length
        # Convert to NumPy once: torch.tensor() cannot consume DataFrame slices
        features = dataframe[feature_columns].to_numpy()
        # convert data to sequences; "+ 1" keeps the final full window
        n_windows = max(len(features) - sequence_length + 1, 0)
        self.data = [
            features[start: start + sequence_length] for start in range(n_windows)
        ]
        # pandas Series.view() takes a dtype, not a shape; reshape the
        # underlying array into a column vector instead
        self.labels = dataframe[label_column].to_numpy().reshape(-1, 1)

    def __len__(self):
        """Return the number of full windows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(window, label)`` for window ``idx`` as float32 tensors."""
        # Indexing the window list first raises IndexError for out-of-range idx
        window = self.data[idx]
        if idx < 0:
            idx += len(self.data)
        # Label of the last row in the window
        label = self.labels[idx + self.sequence_length - 1]
        return tensor(window).float(), tensor(label).float()

In [17]:
# Number of rows in each input window handed to the dataset
sequence_length = 24
dataset = TimeSeriesDataset(X=X_train, y=y_train, sequence_length=sequence_length)
