In [50]:
import pandas as pd

In [51]:
data = pd.read_csv('train.csv')

In [52]:
# Checking for missing values in the train dataset
missing_values = data.isnull().sum()
missing_values_percentage = (missing_values / len(data)) * 100

missing_values_summary = pd.DataFrame({
    'Missing Values': missing_values, 
    'Percentage': missing_values_percentage
})

missing_values_summary[missing_values_summary['Missing Values'] > 0]


Unnamed: 0,Missing Values,Percentage


In [53]:
from sklearn.preprocessing import MinMaxScaler

# Selecting the columns to be scaled (all except date and time columns)
columns_to_scale = data.columns.difference(['Year', 'Month', 'Day', 'Hour', 'Minute'])

# Applying Min-Max Scaling
scaler = MinMaxScaler()
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Displaying the first few rows of the scaled data
scaled_data_head = data.head()
scaled_data_head

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
0,2009,1,1,0,0,0.0,0.0,0.0,0.0,0.387244,0.285714,0.740741,0.734296,0.598773,0.049749,0.961389,0.24031,0.0
1,2009,1,1,0,30,0.0,0.0,0.0,0.0,0.410023,0.285714,0.740741,0.793233,0.63638,0.048562,0.961389,0.24031,0.0
2,2009,1,1,1,0,0.0,0.0,0.0,0.4,0.387244,0.285714,0.740741,0.765866,0.67454,0.047506,0.966389,0.248062,0.0
3,2009,1,1,1,30,0.0,0.0,0.0,0.4,0.387244,0.265306,0.740741,0.765866,0.713067,0.046978,0.966389,0.24031,0.0
4,2009,1,1,2,0,0.0,0.0,0.0,0.4,0.387244,0.265306,0.740741,0.746256,0.751963,0.046582,0.972222,0.232558,0.0


In [54]:
# Checking for categorical columns in the dataset
categorical_columns = data.select_dtypes(include=['object']).columns

# Listing categorical columns, if any
categorical_columns_list = categorical_columns.tolist()
categorical_columns_list


[]

In [55]:
import pandas as pd

# Converting the date and time columns to datetime format
# Get DateTime column and index it
data['DateTime'] = pd.to_datetime(data[['Year', 'Month', 'Day', 'Hour', 'Minute']])
data = data.set_index('DateTime')

# Drop the original columns if they are no longer needed
data.drop(['Year', 'Month', 'Day', 'Hour', 'Minute'], axis=1, inplace=True)

data

Unnamed: 0_level_0,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2009-01-01 00:00:00,0.000000,0.000000,0.000000,0.0,0.387244,0.285714,0.740741,0.734296,0.598773,0.049749,0.961389,0.240310,0.00
2009-01-01 00:30:00,0.000000,0.000000,0.000000,0.0,0.410023,0.285714,0.740741,0.793233,0.636380,0.048562,0.961389,0.240310,0.00
2009-01-01 01:00:00,0.000000,0.000000,0.000000,0.4,0.387244,0.285714,0.740741,0.765866,0.674540,0.047506,0.966389,0.248062,0.00
2009-01-01 01:30:00,0.000000,0.000000,0.000000,0.4,0.387244,0.265306,0.740741,0.765866,0.713067,0.046978,0.966389,0.240310,0.00
2009-01-01 02:00:00,0.000000,0.000000,0.000000,0.4,0.387244,0.265306,0.740741,0.746256,0.751963,0.046582,0.972222,0.232558,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 21:30:00,0.090265,0.533654,0.158043,0.4,0.829157,0.608163,0.703704,0.911324,0.425215,0.472156,0.566667,0.271318,1.00
2018-12-31 22:00:00,0.065487,0.373077,0.079022,0.4,0.822323,0.593878,0.703704,0.934059,0.456933,0.485352,0.580556,0.248062,1.00
2018-12-31 22:30:00,0.026549,0.110577,0.016933,0.7,0.822323,0.583673,0.703704,0.965844,0.489387,0.485352,0.577778,0.201550,0.57
2018-12-31 23:00:00,0.000000,0.000000,0.000000,0.7,0.813212,0.575510,0.722222,0.965952,0.526319,0.472156,0.572222,0.162791,0.00


In [None]:
import numpy as np

# Define a function to create sequences from the dataset
def create_sequences(data, window_size, target_columns):
    sequences = []
    targets = []
    
    for i in range(len(data) - window_size):
        sequence = data.iloc[i:i + window_size].to_numpy()
        target = data.iloc[i + window_size][target_columns].to_numpy()
        
        sequences.append(sequence)
        targets.append(target)
    
    return np.array(sequences), np.array(targets)

# Choosing a window size (24 for demonstration)
window_size = 24

# Target columns for prediction
target_columns = ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']

# Creating sequences
X, y = create_sequences(data, window_size, target_columns)

# Displaying the shape of the created sequences and targets
X.shape, y.shape

In [58]:
# 80% for training
split_ratio = 0.9
split_index = int(X.shape[0] * split_ratio)

# Split the data into training and validation sets
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

# Verify the shape of the datasets
X_train.shape, X_val.shape, y_train.shape, y_val.shape


((157744, 24, 13), (17528, 24, 13), (157744, 3), (17528, 3))

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Define the CNN model
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=24, out_channels=64, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(128, 100)  # Adjusted input size
        self.fc2 = nn.Linear(100, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Create the model instance
model = CNN1D()

# Check if GPU is available and move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


CNN1D(
  (conv1): Conv1d(24, 64, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=128, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=3, bias=True)
)

In [79]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train).float().to(device)
y_train_tensor = torch.tensor(y_train).float().to(device)
X_val_tensor = torch.tensor(X_val).float().to(device)
y_val_tensor = torch.tensor(y_val).float().to(device)

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [80]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Number of epochs
    model.train()
    for inputs, targets in train_loader:
        # Move data to the device
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss / len(val_loader)}')


Epoch 1, Loss: 0.00011851948511321098, Validation Loss: 0.0002730683871740552
Epoch 2, Loss: 0.00011859635560540482, Validation Loss: 0.0002010127658530667
Epoch 3, Loss: 9.321977267973125e-05, Validation Loss: 0.00014686010773356343
Epoch 4, Loss: 0.0005989854107610881, Validation Loss: 0.0001674097465130768
Epoch 5, Loss: 6.126724474597722e-05, Validation Loss: 0.0001696820033768984
Epoch 6, Loss: 0.0005031580803915858, Validation Loss: 0.00020393149211519354
Epoch 7, Loss: 6.157470488687977e-05, Validation Loss: 0.00011295573611371028
Epoch 8, Loss: 4.770743180415593e-05, Validation Loss: 0.00011578606664230777
Epoch 9, Loss: 0.00012749289453495294, Validation Loss: 0.00011072453967360831
Epoch 10, Loss: 0.00011072985944338143, Validation Loss: 0.00012490717155915065
