# In [1]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data
import numpy as np
import pandas as pd

# Append a one-hot encoding of each row's identity (column 0)
# to the data as additional feature columns
def Add_One_Hot_Vectors(data):
    """Append a one-hot encoding of column 0 to ``data``.

    Each distinct value in the first column gets its own indicator column;
    indicator columns are ordered by the sorted unique values (the order
    returned by ``np.unique``).

    Args:
        data: 2-D NumPy array whose first column holds the identity values.

    Returns:
        A new array equal to ``data`` with one extra integer 0/1 column per
        distinct identity appended on the right.  ``data`` itself is not
        modified.  An input with zero rows is returned with no extra columns.
    """
    ID = data[:, 0]
    # ``codes[i]`` is the position of ID[i] within the sorted unique values.
    unique, codes = np.unique(ID, return_inverse=True)
    codes = codes.ravel()
    one_hot = np.zeros((ID.size, unique.size), dtype=int)
    # Set exactly one entry per row, without a Python-level loop.
    one_hot[np.arange(ID.size), codes] = 1
    return np.concatenate((data, one_hot), 1)

# Normalize Features in columns [l,r)
def Normalize_Features(data,l,r):
    """Min-max scale columns ``[l, r)`` of ``data`` to the range [0, 1].

    Args:
        data: 2-D NumPy array with numeric contents.
        l: Index of the first column to scale (inclusive).
        r: Index one past the last column to scale (exclusive).

    Returns:
        A float copy of ``data`` in which every column in ``[l, r)`` has been
        replaced by ``(x - min) / (max - min)`` computed per column.  Columns
        outside the range are only converted to float.  A column whose values
        are all equal is mapped to 0 instead of NaN.  The input array is not
        modified.
    """
    data = data.astype(float)
    cols = data[:, l:r]
    if cols.size == 0:
        # Nothing to scale (no rows, or an empty column range).
        return data
    low = cols.min(axis=0)
    span = cols.max(axis=0) - low
    # Avoid 0/0 for constant columns: (x - min) is already 0 there.
    span[span == 0] = 1.0
    data[:, l:r] = (cols - low) / span
    return data

# use RMSE as loss function for neural network
class RMSELoss(torch.nn.Module):
    """Loss module that returns the square root of the mean squared error.

    The mean squared error is computed by a ``torch.nn.MSELoss`` instance
    (created with its default arguments) stored as ``self.mse``.
    """

    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = torch.nn.MSELoss()

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y))`` as a tensor."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()
    
# function to compute RMSE from given numpy arrays
def RMSE(predictions, targets):
    """Return the root-mean-square error between two array-likes.

    Both inputs are converted to float arrays before subtracting, so plain
    Python sequences are accepted and narrow integer dtypes (e.g. ``uint8``)
    cannot wrap around when the difference is taken or squared.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of reference values, broadcastable against
            ``predictions``.

    Returns:
        ``sqrt(mean((predictions - targets) ** 2))`` as a NumPy float.
    """
    diff = np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)
    return np.sqrt(np.mean(diff ** 2))

# Create Neural Network with 3 hidden layers
# D: number of features
# H1: nodes in first hidden layer
# H2: nodes in second hidden layer
# H3: nodes in third hidden layer
# OP: output nodes
def Get_NN(D,H1,H2,H3,OP):
    """Build a fully connected network with three hidden layers.

    The layer widths are ``D -> H1 -> H2 -> H3 -> OP``.  A ReLU follows each
    of the first three linear layers; the final linear layer has no
    activation.

    Returns:
        A ``torch.nn.Sequential`` holding the seven modules in order.
    """
    widths = (D, H1, H2, H3, OP)
    layers = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        layers.append(torch.nn.Linear(fan_in, fan_out))
        layers.append(torch.nn.ReLU())
    # The output layer is not followed by an activation.
    layers.pop()
    return torch.nn.Sequential(*layers)

def main():
    """Train the network on ``training_data.csv`` and predict ``test_data.csv``.

    Steps, in order:
      1. Load both CSV files and shuffle the training rows.
      2. Append one-hot columns for column 0 and min-max scale columns 1-5
         of the training data.  The first ``N`` rows are used for training
         and the remaining rows for validation; column 6 is the target.
      3. Preprocess the test data (one-hot columns plus per-column scaling).
      4. Train with Adam on the RMSE loss for ``EPOCH`` epochs.
      5. Print the validation RMSE and write the test predictions to
         ``test_predictions.csv``.
    """
    # read data from files
    Data_Set = pd.read_csv("training_data.csv")
    Test_Set = pd.read_csv("test_data.csv")

    # switch data to numpy arrays
    data = Data_Set.values
    test = Test_Set.values

    # shuffle training data (row order decides the train/validation split)
    np.random.shuffle(data)

    # define constants
    TARGET_COL = 6
    N = 40000
    H1 = 100
    H2 = 200
    H3 = 100
    OP = 1
    LR = 0.001
    BATCH_SIZE = 32
    EPOCH = 200

    # Add one-hot-vectors of Identity to training data
    data = Add_One_Hot_Vectors(data)
    # normalize features
    data = Normalize_Features(data,1,6)

    # boolean mask selecting the feature columns: everything except the
    # identity column (0) and the target column.  The width is taken from
    # the encoded array instead of being hard-coded.
    idx = np.ones(data.shape[1], dtype=bool)
    idx[0] = False
    idx[TARGET_COL] = False
    D = int(idx.sum())

    # transform data into tensors
    X = torch.from_numpy(data[:N,idx]).float()
    Y = torch.from_numpy(data[:N,TARGET_COL]).float()
    # transform validation data into tensors
    Validate_X = torch.from_numpy(data[N:,idx]).float()
    Validate_Y = torch.from_numpy(data[N:,TARGET_COL]).float()

    # Add one-hot-vectors to testing data
    test = Add_One_Hot_Vectors(test)
    # normalize features
    # NOTE(review): the training data min-max scales columns 1-5, but here
    # columns 3 and 4 are instead divided by their value in row 1.  Kept
    # as-is; confirm that this difference is intentional.
    test = Normalize_Features(test,1,3)
    test = Normalize_Features(test,5,6)
    test[:,3] = test[:,3] / test[1,3]
    test[:,4] = test[:,4] / test[1,4]

    # mask selecting every test column except the identity column
    Test_idx = np.ones(test.shape[1], dtype=bool)
    Test_idx[0] = False

    # transform testing data to tensor
    Test_X = torch.from_numpy(test[:,Test_idx]).float()

    # define neural network and setup
    # NOTE(review): the hidden sizes passed here are (H1, H1, H2) and H3 is
    # never used.  Left unchanged because switching to (H1, H2, H3) would
    # alter the trained model; confirm which architecture is intended.
    Net = Get_NN(D,H1,H1,H2,OP)
    Optimizer = torch.optim.Adam(Net.parameters(), lr = LR)
    Loss_Func = RMSELoss()

    # define data loader
    Torch_Dataset = Data.TensorDataset(X,Y)
    Loader = Data.DataLoader(
        dataset = Torch_Dataset,
        batch_size = BATCH_SIZE,
        shuffle = True,
        num_workers = 2,
    )

    # main training loop
    for _ in range(EPOCH):
        for Batch_X, Batch_Y in Loader:
            Optimizer.zero_grad()
            Prediction = Net(Batch_X)
            # squeeze only the output dimension so that a batch holding a
            # single row keeps the same shape as its target
            Loss = Loss_Func(Prediction.squeeze(1),Batch_Y)
            Loss.backward()
            Optimizer.step()

    # inference needs no gradients; skip building the autograd graph
    with torch.no_grad():
        # predict for validation data and print RMSE
        Validate_P = Net(Validate_X)
        print(Loss_Func(Validate_P.squeeze(1),Validate_Y))

        # predict for testing data
        Test_Y = Net(Test_X).squeeze(1)

    # print test predictions to csv file
    Test_DataFrame = pd.DataFrame(Test_Y.numpy())
    Test_DataFrame.to_csv('test_predictions.csv')

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


# Recorded notebook output: tensor(19.2593, grad_fn=<SqrtBackward>)
