In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection

In [2]:
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch import nn

In [3]:
# Load the data
df_test = pd.read_csv('prepped_data/test_data.csv')
df_train = pd.read_csv('prepped_data/train_data.csv')

#split the training data into training and validation
df_val = df_train[df_train['date'] > '2022-03-01']
df_train = df_train[df_train['date'] <= '2022-03-01']

In [4]:
# #Split train into X and Y
Xtrain = df_train.iloc[:, 8:].copy()
ytrain = df_train["PM25_ugm3"].copy()

# #Split test into X and Y
Xtest = df_test.iloc[:, 8:].copy()
ytest = df_test["PM25_ugm3"].copy()

# #Split val into X and Y
Xval = df_val.iloc[:, 8:].copy()
yval = df_val["PM25_ugm3"].copy()

In [6]:
#running a simple imputer and scaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

#imputer = SimpleImputer(strategy = "mean")
imputer = SimpleImputer()
Xtrain = imputer.fit_transform(Xtrain)
Xtest = imputer.transform(Xtest)
Xval = imputer.transform(Xval)

# Convert arrays back to DataFrame for easier manipulation
Xtrain = pd.DataFrame(Xtrain, columns=df_train.columns[8:])
Xtest = pd.DataFrame(Xtest, columns=df_test.columns[8:])
Xval = pd.DataFrame(Xval, columns=df_val.columns[8:])

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Assuming you know the names of the categorical columns
categorical_columns = ['SiteID', 'day_of_week']  # List of categorical column names

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit the encoder on the training data
encoder.fit(Xtrain[categorical_columns])

# Transform both training and test data
Xtrain_encoded = encoder.transform(Xtrain[categorical_columns])
Xtest_encoded = encoder.transform(Xtest[categorical_columns])
Xval_encoded = encoder.transform(Xval[categorical_columns])

# Convert the encoded data to a DataFrame
Xtrain_encoded = pd.DataFrame(Xtrain_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))
Xtest_encoded = pd.DataFrame(Xtest_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))
Xval_encoded = pd.DataFrame(Xval_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns from the training and test data
Xtrain = Xtrain.drop(columns=categorical_columns)
Xtest = Xtest.drop(columns=categorical_columns)
Xval = Xval.drop(columns=categorical_columns)

# Concatenate the one-hot encoded columns to the training and test data
Xtrain = pd.concat([Xtrain, Xtrain_encoded], axis=1)
Xtest = pd.concat([Xtest, Xtest_encoded], axis=1)
Xval = pd.concat([Xval, Xval_encoded], axis=1)

Scaling

In [8]:
# Standardize the data
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)
Xval_scaled = scaler.transform(Xval)

# Convert arrays back to DataFrame for easier manipulation
Xtrain = pd.DataFrame(Xtrain_scaled, columns=Xtrain.columns)
Xtest = pd.DataFrame(Xtest_scaled, columns=Xtest.columns)
Xval = pd.DataFrame(Xval_scaled, columns=Xval.columns)

In [9]:
train_bs = 64
val_bs = 64
test_bs = 64
#Batchsize is a Hyperparameter, it is a tradeoff between speed and accuracy. We use 32 here to start as this was the value from the tutorials.

# Create tensor datasets
train_dataset = TensorDataset(torch.tensor(Xtrain.values).float(), torch.tensor(ytrain.values).unsqueeze(1).float())
val_dataset = TensorDataset(torch.tensor(Xval.values).float(), torch.tensor(yval.values).unsqueeze(1).float())
test_dataset = TensorDataset(torch.tensor(Xtest.values).float(), torch.tensor(ytest.values).unsqueeze(1).float())

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=train_bs, shuffle=True, drop_last=True) #dataset
val_dataloader = DataLoader(val_dataset, batch_size=val_bs, shuffle=False, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=test_bs, shuffle=False, drop_last=True)

Start of Model fitting

In [10]:
def train_loop(dataloader, model, loss_fn, optimiser):
    avg_loss = 0 # to store running loss
    model.train() # Set model in training mode
    for batch_idx, (X, y) in enumerate(dataloader):
        pred = model(X) # make prediction on current batch
        loss = loss_fn(pred,y) # calculate loss
        loss.backward() # calculates gradients
        optimiser.step() # update weights
        optimiser.zero_grad() # set gradients to zero for next batch
        avg_loss += loss.item() # add loss to running loss
    print(f'Average training Loss: {avg_loss/len(dataloader):.5f}')
    train_loss = avg_loss/len(dataloader)
    return train_loss

In [11]:
def val_loop(dataloader, model, loss_fn):
    loss = 0
    model.eval()
    bs = dataloader.batch_size
    for batch_idx, (X, y) in enumerate(dataloader):
        pred = model(X) # make prediction on current batch
        loss += loss_fn(pred,y).item() # calculate loss

    loss /= len(dataloader)
    print(f'Avg val loss: {loss:.5f} \n')

    return loss

In [12]:
def nonlinearity_constructor(name):
    if (name == "relu"):
        return nn.ReLU()
    if (name == "sigmoid"):
        return nn.Sigmoid()
    if (name == "leakyrelu"):
        return nn.LeakyReLU()
    raise ValueError("Unknown nonlinearity!")

In [13]:
class Network(nn.Module):
    def __init__(self,hid_layer=[100,100,100,100,100],nonlinearity = 'relu'): #Start with a default of 10, can change this in the call
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hid_layer[0]),
            nonlinearity_constructor(nonlinearity),
            nn.Linear(hid_layer[0], hid_layer[1]),
            nonlinearity_constructor(nonlinearity),
            nn.Linear(hid_layer[1], hid_layer[2]),
            nonlinearity_constructor(nonlinearity),
            nn.Linear(hid_layer[2], hid_layer[3]),
            nonlinearity_constructor(nonlinearity),
            nn.Linear(hid_layer[3], hid_layer[4]),
            nonlinearity_constructor(nonlinearity),
            nn.Linear(hid_layer[4], 1)
        )
    def forward(self,x):
        return self.net(x)

input_dim = Xtrain.shape[1]
model = Network(nonlinearity='leakyrelu')

learning_rate = 0.005
loss_fn = torch.nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-6)

train_loss_nn1 = []
val_loss_nn1 = []
epochs = 200
for epoch in range(epochs):
    print(f'Epoch {epoch+1}')
    train_loss_nn1.append(train_loop(train_dataloader, model, loss_fn, optimiser))
    val_loss_nn1.append(val_loop(val_dataloader, model, loss_fn))


#plot the training and validation loss
plt.plot(train_loss_nn1)
plt.plot(val_loss_nn1)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


Epoch 1
Average training Loss: 60.27385
Avg val loss: 25.63456 

Epoch 2
Average training Loss: 53.67719
Avg val loss: 35.32888 

Epoch 3
Average training Loss: 52.87491
Avg val loss: 26.45234 

Epoch 4
Average training Loss: 49.59874
Avg val loss: 27.55585 

Epoch 5
Average training Loss: 47.64116
Avg val loss: 59.69711 

Epoch 6
Average training Loss: 43.92708
Avg val loss: 46.08252 

Epoch 7
Average training Loss: 43.40008
Avg val loss: 48.38584 

Epoch 8
Average training Loss: 40.76896
Avg val loss: 50.30310 

Epoch 9
Average training Loss: 38.60134
Avg val loss: 54.62297 

Epoch 10
Average training Loss: 37.41964
Avg val loss: 69.22416 

Epoch 11
Average training Loss: 32.51389
Avg val loss: 281.64350 

Epoch 12
Average training Loss: 31.82360
Avg val loss: 68.98284 

Epoch 13
Average training Loss: 35.83362
Avg val loss: 133.47155 

Epoch 14
Average training Loss: 31.62163
Avg val loss: 75.73818 

Epoch 15
Average training Loss: 29.85430
Avg val loss: 82.41375 

Epoch 16
Average 

KeyboardInterrupt: 

In [13]:
#Performing a time series split on the training data to split it into 5 folds for cross validation
from sklearn.model_selection import TimeSeriesSplit
# Set the number of splits
n_splits = 5  # You can adjust this as needed

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=n_splits)

#make a 
def Cross_validation(model, Xtrain, Ytrain, tscv):
    train_loss_nn1 = []
    val_loss_nn1 = []
    epochs = 40
    for train_index, val_index in tscv.split(Xtrain):
        X_train_fold, X_val_fold = Xtrain[train_index], Xtrain[val_index]
        y_train_fold, y_val_fold = Ytrain[train_index], Ytrain[val_index]

        train_dataset = TensorDataset(torch.tensor(X_train_fold).float(), torch.tensor(y_train_fold).float())
        val_dataset= TensorDataset(torch.tensor(X_val_fold).float(), torch.tensor(y_val_fold).float())

        ## Dataloader

        train_bs = 32
        val_bs = 32
        #Batchsize is a Hyperparameter, it is a tradeoff between speed and accuracy. We use 32 here to start as this was the value from the tutorials.

        train_dataloader = DataLoader(train_dataset, batch_size=train_bs, shuffle=True, drop_last=True) #dataset
        val_dataloader = DataLoader(val_dataset, batch_size=val_bs, shuffle=False, drop_last=True)


        # Now you have X_train_fold, y_train_fold as your training data for this fold
        # And X_val_fold, y_val_fold as your validation data for this fold

        # You can train your neural network model on X_train_fold, y_train_fold
        # And evaluate it on X_val_fold, y_val_fold
        # Repeat this process for each fold

        for epoch in range(epochs):
            print(f'Epoch {epoch+1}')
            train_loss_nn1.append(train_loop(train_dataloader, model, loss_fn, optimiser))
            val_loss_nn1.append(val_test_loop(val_dataloader, model, loss_fn))
    return train_loss_nn1, val_loss_nn1
    
    
    




In [14]:
from torch import nn

class Network(nn.Module):
    def __init__(self,hid_layer=[100]): #Start with a default of 10, can change this in the call
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hid_layer[0]),
            nn.ReLU(),
            nn.Linear(hid_layer[0], 1)
        )
    def forward(self,x):
        return self.net(x)

input_dim = Xtrain.shape[1]
model = Network([10])

learning_rate = 0.01
loss_fn = torch.nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)

#train the model
train_loss_nn1, val_loss_nn1 = Cross_validation(model, Xtrain, Ytrain, tscv)

#plot the training and validation loss
plt.plot(train_loss_nn1)
plt.plot(val_loss_nn1)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()



KeyError: "None of [Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,\n       ...\n       5812, 5813, 5814, 5815, 5816, 5817, 5818, 5819, 5820, 5821],\n      dtype='int32', length=5822)] are in the [columns]"