### Notatnik sprawdzający skuteczność działania architektury opierającej się na klastrach
Poniżej przeprowadzone eksperymenty w oparciu o przetworzone i sklastrowane dane geograficzne

In [1]:
## importy 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from models.architectures.temperature_prediction.simple_model import ModelSimple
from models.architectures.temperature_prediction.model_v2 import Modelv2
from models.architectures.temperature_prediction.model_v3 import Modelv3
from models.architectures.temperature_prediction.model_v4 import Modelv4
from sklearn.preprocessing import MinMaxScaler

from torch.utils.data import Dataset, DataLoader

from pathlib import Path
from aux_classes.WeatherDataset import WeatherDataset

In [64]:
CLUSTER_DATA_PATH = Path("Data/data_clustered_nontransformed")

In [75]:
df = pd.read_csv(CLUSTER_DATA_PATH / "cluster_1_preprocessed.csv")
df.head()

Unnamed: 0,datetime,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
0,2012-10-01 12:00:00,,,,,,
1,2012-10-01 13:00:00,23.0,1013.0,296.6,sky is clear,10.0,2.0
2,2012-10-01 14:00:00,23.0,1013.0,296.608509,sky is clear,9.0,2.0
3,2012-10-01 15:00:00,23.0,1013.0,296.631487,sky is clear,9.0,2.0
4,2012-10-01 16:00:00,23.0,1013.0,296.654466,sky is clear,9.0,2.0


In [76]:
df.describe

<bound method NDFrame.describe of                   datetime  humidity  pressure  temperature  \
0      2012-10-01 12:00:00       NaN       NaN          NaN   
1      2012-10-01 13:00:00      23.0    1013.0   296.600000   
2      2012-10-01 14:00:00      23.0    1013.0   296.608509   
3      2012-10-01 15:00:00      23.0    1013.0   296.631487   
4      2012-10-01 16:00:00      23.0    1013.0   296.654466   
...                    ...       ...       ...          ...   
45248  2017-11-29 20:00:00      68.0    1018.0   294.710000   
45249  2017-11-29 21:00:00      73.0    1018.0   295.590000   
45250  2017-11-29 22:00:00      60.0    1017.0   296.250000   
45251  2017-11-29 23:00:00      33.0    1016.0   297.150000   
45252  2017-11-30 00:00:00      23.0    1016.0   297.150000   

      weather_description  wind_direction  wind_speed  
0                     NaN             NaN         NaN  
1            sky is clear            10.0         2.0  
2            sky is clear             9.0

In [77]:
df["datetime"] = pd.to_datetime(df["datetime"])
df["hour"] = df["datetime"].dt.hour
df["date"] = pd.to_datetime(df["datetime"].dt.date)
df = df.drop(columns = ["weather_description"])
df = df.groupby(by="date").mean().drop(columns=["datetime"]).reset_index()
df["day_of_year"] = df["date"].dt.day_of_year
df["day_of_year_radians"] = 2 * np.pi * df["day_of_year"] / 365.25
df['day_of_year_sin'] = np.sin(df["day_of_year_radians"])
df['day_of_year_cos'] = np.cos(df["day_of_year_radians"])
df.drop(columns = ['day_of_year_radians', 'wind_direction', 'date'], inplace=True)
df['target_temperature'] = df['temperature'].shift(-4)

features_to_expand = ['humidity', 'pressure', 'temperature']

for feature in features_to_expand:
    df[f'{feature}_1'] = df[feature]
    df[f'{feature}_2'] = df[feature].shift(-1)
    df[f'{feature}_3'] = df[feature].shift(-2)

# Drop the original columns that were expanded
df = df.drop(columns=features_to_expand)

df = df[:-4]

df

Unnamed: 0,wind_speed,hour,day_of_year,day_of_year_sin,day_of_year_cos,target_temperature,humidity_1,humidity_2,humidity_3,pressure_1,pressure_2,pressure_3,temperature_1,temperature_2,temperature_3
0,1.636364,17.5,275,-0.999833,0.018277,301.811250,24.181818,24.476190,18.380952,1013.000000,1012.761905,1011.000000,296.701739,301.211968,302.867083
1,1.000000,11.5,276,-0.999371,0.035473,299.810417,24.476190,18.380952,20.238095,1012.761905,1011.000000,1011.863636,301.211968,302.867083,302.232917
2,1.250000,11.5,277,-0.998613,0.052658,299.248333,18.380952,20.238095,27.777778,1011.000000,1011.863636,1014.555556,302.867083,302.232917,301.811250
3,0.916667,11.5,278,-0.997559,0.069828,299.532083,20.238095,27.777778,27.428571,1011.863636,1014.555556,1015.222222,302.232917,301.811250,299.810417
4,1.291667,11.5,279,-0.996210,0.086977,299.025625,27.777778,27.428571,23.416667,1014.555556,1015.222222,1012.583333,301.811250,299.810417,299.248333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878,2.416667,11.5,326,-0.625050,0.780585,292.880833,43.875000,38.666667,52.458333,1019.333333,1018.333333,1013.958333,292.630833,293.941667,293.033750
1879,2.166667,11.5,327,-0.611530,0.791221,292.214583,38.666667,52.458333,38.083333,1018.333333,1013.958333,1015.458333,293.941667,293.033750,293.020000
1880,2.125000,11.5,328,-0.597829,0.801624,290.284167,52.458333,38.083333,43.208333,1013.958333,1015.458333,1018.000000,293.033750,293.020000,292.880833
1881,1.958333,11.5,329,-0.583951,0.811789,290.006667,38.083333,43.208333,34.041667,1015.458333,1018.000000,1012.791667,293.020000,292.880833,292.214583


In [83]:
#normalize all the values
scaler = MinMaxScaler()

normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
normalized_df = normalized_df.dropna()

In [85]:
data = normalized_df.drop(columns=['target_temperature']).to_numpy()
targets = normalized_df['target_temperature'].to_numpy()

# Split data into training and testing
train_data, test_data = data[:1400], data[1400:]
train_targets, test_targets = targets[:1400], targets[1400:]

# Create dataset and data loaders
train_dataset = WeatherDataset(train_data, train_targets)
test_dataset = WeatherDataset(test_data, test_targets)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [98]:
# Number of features
input_size = data.shape[1] 

# Initialize the model
model = Modelv2(input_size=input_size, learning_rate=0.0005)

# Train the model
model.train_model(train_loader, epochs=80)

# Evaluate the model on the test set
model.test_model(test_loader)

absolute_differences = []
max_value = df['target_temperature'].max()
min_value = df['target_temperature'].min()

for row_idx in range(len(test_data)):
    prediction = model.predict(torch.tensor(test_data[row_idx], dtype=torch.float32).unsqueeze(0))

    unnormalized_prediction = prediction * (max_value - min_value) + min_value
    unnormalized_target = test_targets[row_idx] * (max_value - min_value) + min_value
    
    absolute_difference = abs(unnormalized_prediction - unnormalized_target)
    absolute_differences.append(absolute_difference)

# Optionally, convert the differences into a NumPy array or Tensor
absolute_differences = torch.tensor(absolute_differences)

# Print summary
print(f"Mean Absolute Difference: {torch.mean(absolute_differences)}")

Epoch 1/80, Loss: 0.0778
Epoch 10/80, Loss: 0.0038
Epoch 20/80, Loss: 0.0036
Epoch 30/80, Loss: 0.0035
Epoch 40/80, Loss: 0.0033
Epoch 50/80, Loss: 0.0033
Epoch 60/80, Loss: 0.0033
Epoch 70/80, Loss: 0.0031
Epoch 80/80, Loss: 0.0030
Test Loss: 0.0034
Mean Absolute Difference: 1.9717345391493117


In [142]:
## second type of pre-processing
df = pd.read_csv(CLUSTER_DATA_PATH / "cluster_1_preprocessed.csv")
df.head()

Unnamed: 0,datetime,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
0,2012-10-01 12:00:00,,,,,,
1,2012-10-01 13:00:00,23.0,1013.0,296.6,sky is clear,10.0,2.0
2,2012-10-01 14:00:00,23.0,1013.0,296.608509,sky is clear,9.0,2.0
3,2012-10-01 15:00:00,23.0,1013.0,296.631487,sky is clear,9.0,2.0
4,2012-10-01 16:00:00,23.0,1013.0,296.654466,sky is clear,9.0,2.0


In [147]:
fft_values = np.fft.fft(df['temperature'].dropna())
frequencies = np.fft.fftfreq(len(df['temperature'].dropna()))

In [150]:
len(fft_values)

45250

In [None]:
df['lag1'] = df['value'].shift(1)
df['lag2'] = df['value'].shift(2)

In [None]:
df['rolling_mean'] = df['value'].rolling(window=3).mean()
df['rolling_std'] = df['value'].rolling(window=3).std()

In [None]:
fft_values = np.fft.fft(df['value'])
frequencies = np.fft.fftfreq(len(df['value']))

# Convert to DataFrame for easier manipulation
fft_df = pd.DataFrame({'frequency': frequencies, 'amplitude': np.abs(fft_values)})

print(fft_df)

In [137]:
df["datetime"] = pd.to_datetime(df["datetime"])
df["hour"] = df["datetime"].dt.hour
df["date"] = pd.to_datetime(df["datetime"].dt.date)
df = df.drop(columns = ["weather_description"])
df = df.groupby(by="date").mean().drop(columns=["datetime"]).reset_index()
df["day_of_year"] = df["date"].dt.day_of_year
df["day_of_year_radians"] = 2 * np.pi * df["day_of_year"] / 365.25
df['day_of_year_sin'] = np.sin(df["day_of_year_radians"])
df['day_of_year_cos'] = np.cos(df["day_of_year_radians"])
df.drop(columns = ['day_of_year_radians', 'wind_direction', 'date', "day_of_year", "wind_speed", "hour"], inplace=True)
df['target_temperature'] = df['temperature'].shift(-4)

features_to_expand = ['humidity', 'pressure', 'temperature']

for feature in features_to_expand:
    df[f'{feature}_1'] = df[feature]
    df[f'{feature}_2'] = df[feature].rolling(window=5).mean()
    df[f'{feature}_3'] = df[feature].rolling(window=5).std()

# Drop the original columns that were expanded
df = df.drop(columns=features_to_expand)

df = df[:-4]

df

Unnamed: 0,day_of_year_sin,day_of_year_cos,target_temperature,humidity_1,humidity_2,humidity_3,pressure_1,pressure_2,pressure_3,temperature_1,temperature_2,temperature_3
0,-0.999833,0.018277,301.811250,24.181818,,,1013.000000,,,296.701739,,
1,-0.999371,0.035473,299.810417,24.476190,,,1012.761905,,,301.211968,,
2,-0.998613,0.052658,299.248333,18.380952,,,1011.000000,,,302.867083,,
3,-0.997559,0.069828,299.532083,20.238095,,,1011.863636,,,302.232917,,
4,-0.996210,0.086977,299.025625,27.777778,,,1014.555556,,,301.811250,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1878,-0.625050,0.780585,292.880833,43.875000,39.305556,9.307650,1019.333333,1016.063889,1.727022,292.630833,292.864141,1.547355
1879,-0.611530,0.791221,292.214583,38.666667,38.697222,9.006728,1018.333333,1016.158333,1.813557,293.941667,292.829811,1.514872
1880,-0.597829,0.801624,290.284167,52.458333,39.427778,9.670290,1013.958333,1016.088889,1.879813,293.033750,292.702934,1.401223
1881,-0.583951,0.811789,290.006667,38.083333,39.272222,9.672013,1015.458333,1016.227778,1.736475,293.020000,292.653184,1.373752


In [138]:
#normalize all the values
scaler = MinMaxScaler()

normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
normalized_df = normalized_df.dropna()

In [139]:
data = normalized_df.drop(columns=['target_temperature']).to_numpy()
targets = normalized_df['target_temperature'].to_numpy()

# Split data into training and testing
train_data, test_data = data[:1400], data[1400:]
train_targets, test_targets = targets[:1400], targets[1400:]

# Create dataset and data loaders
train_dataset = WeatherDataset(train_data, train_targets)
test_dataset = WeatherDataset(test_data, test_targets)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [141]:
# Number of features
input_size = data.shape[1] 

# Initialize the model
model = Modelv2(input_size=input_size, learning_rate=0.0005)

# Train the model
model.train_model(train_loader, epochs=80)

# Evaluate the model on the test set
model.test_model(test_loader)

absolute_differences = []
max_value = df['target_temperature'].max()
min_value = df['target_temperature'].min()

for row_idx in range(len(test_data)):
    prediction = model.predict(torch.tensor(test_data[row_idx], dtype=torch.float32).unsqueeze(0))

    unnormalized_prediction = prediction * (max_value - min_value) + min_value
    unnormalized_target = test_targets[row_idx] * (max_value - min_value) + min_value
    
    absolute_difference = abs(unnormalized_prediction - unnormalized_target)
    absolute_differences.append(absolute_difference)

# Optionally, convert the differences into a NumPy array or Tensor
absolute_differences = torch.tensor(absolute_differences)

# Print summary
print(f"Mean Absolute Difference: {torch.mean(absolute_differences)}")

Epoch 1/80, Loss: 0.0906
Epoch 10/80, Loss: 0.0045
Epoch 20/80, Loss: 0.0042
Epoch 30/80, Loss: 0.0041
Epoch 40/80, Loss: 0.0039
Epoch 50/80, Loss: 0.0038
Epoch 60/80, Loss: 0.0037
Epoch 70/80, Loss: 0.0035
Epoch 80/80, Loss: 0.0037
Test Loss: 0.0075
Mean Absolute Difference: 3.1160040570099476


In [None]:
def build_ensemble(model_config, input_size, learning_rate=0.0005):
    ensemble = []
    name_to_class = {
        'Modelv2': Modelv2,
        'Modelv3': Modelv3,
        'ModelSimple': ModelSimple,
        'Modelv4': Modelv4,
    }
    
    for model_name, count in model_config.items():
        for _ in range(count):
            model_class = name_to_class.get(model_name)
            if not model_class:
                raise ValueError(f"No class found for model name '{model_name}'!")
            ensemble.append(model_class(input_size=input_size, learning_rate=learning_rate))
    
    return ensemble

def train_ensemble(ensemble, train_loader, epochs=50):
    for idx, model in enumerate(ensemble):
        print(f"Training model {idx+1}/{len(ensemble)}...")
        model.train_model(train_loader, epochs=epochs)

def evaluate_ensemble(ensemble, test_data, test_targets, df):
    max_value = df['target_temperature'].max()
    min_value = df['target_temperature'].min()
    
    absolute_differences = []
    for model in ensemble:
        model.eval()
    
    for i in range(len(test_data)):
        x = torch.tensor(test_data[i], dtype=torch.float32).unsqueeze(0)
        
        predictions = []
        for model in ensemble:
            pred = model.predict(x) 
            predictions.append(pred.item())
        
        avg_prediction = sum(predictions) / len(predictions)
        
        unnormalized_prediction = avg_prediction * (max_value - min_value) + min_value
        unnormalized_target = test_targets[i] * (max_value - min_value) + min_value
        
        absolute_difference = abs(unnormalized_prediction - unnormalized_target)
        absolute_differences.append(absolute_difference)
    
    absolute_differences = torch.tensor(absolute_differences)
    
    mean_abs_diff = torch.mean(absolute_differences).item()
    print(f"Ensemble Mean Absolute Difference: {mean_abs_diff}")
    
    return mean_abs_diff

def run_ensemble_experiment(
    model_config, 
    train_data, train_targets, 
    test_data, test_targets, 
    df,
    epochs=80, 
    learning_rate=0.0005,
    batch_size=32
):

    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(train_data, dtype=torch.float32),
        torch.tensor(train_targets, dtype=torch.float32).unsqueeze(1)
    )
    test_dataset = torch.utils.data.TensorDataset(
        torch.tensor(test_data, dtype=torch.float32),
        torch.tensor(test_targets, dtype=torch.float32).unsqueeze(1)
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    input_size = train_data.shape[1]
    ensemble = build_ensemble(model_config, input_size, learning_rate)
    train_ensemble(ensemble, train_loader, epochs=epochs)
    
    print("\nIndividual Model Evaluation:")
    for idx, model in enumerate(ensemble):
        print(f"Model {idx+1}:")
        model.test_model(test_loader)
    
    print("\nEnsemble Evaluation:")
    evaluate_ensemble(ensemble, test_data, test_targets, df)

In [None]:
my_model_config = {
        'Modelv2': 2,
        'Modelv3': 1
    }
    
run_ensemble_experiment(
        model_config=my_model_config,
        train_data=train_data,
        train_targets=train_targets,
        test_data=test_data,
        test_targets=test_targets,
        df=df,
        epochs=80,
        learning_rate=0.0005,
        batch_size=32
    )