In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tensorboardX import SummaryWriter
import os

In [2]:

class CustomDataSet(Dataset):
    def __init__(self, x,y):
        self.data = x
        self.label = y
        
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]

        return data, label

In [3]:
class AE_Encoder(nn.Module):
  def __init__(self):
    super(AE_Encoder,self).__init__()
    self.fc1 = nn.Linear(23,20)
    self.fc2 = nn.Linear(20,10)
    self.fc3 = nn.Linear(10,5)
    self.fc4 = nn.Linear(5,3)
  def forward(self,x):
    x = torch.relu(self.fc1(x))
    x = torch.relu(self.fc2(x))
    x = torch.relu(self.fc3(x))
    x = torch.relu(self.fc4(x))
    return x
class AE_Decoder(nn.Module):
  def __init__(self):
    super(AE_Decoder,self).__init__()
    self.fc4 = nn.Linear(20,23)
    self.fc3 = nn.Linear(10,20)
    self.fc2 = nn.Linear(5,10)
    self.fc1 = nn.Linear(3,5)
  def forward(self,x):
    x = torch.relu(self.fc1(x))
    x = torch.relu(self.fc2(x))
    x = torch.relu(self.fc3(x))
    x = torch.relu(self.fc4(x))
    x = torch.sigmoid(x)
    return x

In [4]:
loss_fn = torch.nn.MSELoss()
lr= 0.001
encoder = AE_Encoder()
decoder = AE_Decoder()
params_to_optimize = [
	{'params': encoder.parameters()},
	{'params': decoder.parameters()}
]
optim = torch.optim.Adam(params_to_optimize, lr=lr, weight_decay=1e-05)
# device = torch.device("mps")
# encoder.to(device)
# decoder.to(device)


In [5]:
csv_file_path = "./dataset/train_preprocessed_clean.csv"
df = pd.read_csv(csv_file_path)
y = df["Transported"]
df = df.drop("Transported", axis=1)
x = df.values
print(x.shape)
noise_csv_file_path = "./dataset/train_noise.csv"
ndf = pd.read_csv(noise_csv_file_path)
ny = ndf["Transported"]
ndf = ndf.drop("Transported", axis=1)
nx = ndf.values
print(nx.shape)
train_data_object = CustomDataSet(x,y)
noise_data_object = CustomDataSet(nx,ny)
batch_size = 100
train_loader = torch.utils.data.DataLoader(
        train_data_object,
        batch_size=batch_size,
        shuffle=False
    )
noise_train_loader = torch.utils.data.DataLoader(
        noise_data_object,
        batch_size=batch_size,
        shuffle=False
    )
for (data,_),(ndata,_) in zip(train_loader,noise_train_loader):
    print(data[0],ndata[0])
    break

(7736, 23)
(7736, 23)
tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1])


In [6]:
def train(epochs,encoder,decoder,dataLoader,noise_dataLoader,loss_fn,optimizer,model_save_path="./model_"):
    encoder.train()
    decoder.train()
    for epoch in range(1, epochs + 1):
        for (data,_),(noise_data,_) in zip(dataLoader,noise_dataLoader):
            data = data.to(torch.float32)
            noise_data = noise_data.to(torch.float32)
            optimizer.zero_grad()
            encoded = encoder(noise_data)
            decoded = decoder(encoded)
            loss = loss_fn(decoded, data)
            loss.backward()
            optimizer.step()
            # print('\t partial train loss (single batch): %f' % (loss.data))
        print('epoch: %d, train loss: %f' % (epoch, loss.data))
    print('Saving NN to %s' % model_save_path)
    torch.save(encoder.state_dict(), model_save_path + "Encoder.pth")
    torch.save(decoder.state_dict(), model_save_path + "Decoder.pth")

In [7]:
def test(encoder,decoder,df,model_path="./model_"):
    x = df.values

    encoder.load_state_dict(torch.load(model_path + "Encoder.pth"))
    decoder.load_state_dict(torch.load(model_path + "Decoder.pth"))
    # put model into test mode
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
    
        encoded = encoder(torch.tensor(x,dtype=torch.float32))
        decoded = decoder(encoded)
        
        print(decoded.shape)
        denoised = pd.DataFrame(decoded.numpy().round())
        return denoised
        

In [8]:
train(1000,encoder,decoder,train_loader,noise_train_loader,loss_fn,optim)


epoch: 1, train loss: 0.250175
epoch: 2, train loss: 0.248338
epoch: 3, train loss: 0.246113
epoch: 4, train loss: 0.243614
epoch: 5, train loss: 0.241902
epoch: 6, train loss: 0.241698
epoch: 7, train loss: 0.241550
epoch: 8, train loss: 0.241415
epoch: 9, train loss: 0.241269
epoch: 10, train loss: 0.241131
epoch: 11, train loss: 0.240982
epoch: 12, train loss: 0.240657
epoch: 13, train loss: 0.240527
epoch: 14, train loss: 0.239707
epoch: 15, train loss: 0.238748
epoch: 16, train loss: 0.237462
epoch: 17, train loss: 0.237005
epoch: 18, train loss: 0.236662
epoch: 19, train loss: 0.236228
epoch: 20, train loss: 0.235859
epoch: 21, train loss: 0.235709
epoch: 22, train loss: 0.235667
epoch: 23, train loss: 0.235625
epoch: 24, train loss: 0.235589
epoch: 25, train loss: 0.235590
epoch: 26, train loss: 0.235561
epoch: 27, train loss: 0.235553
epoch: 28, train loss: 0.235552
epoch: 29, train loss: 0.235544
epoch: 30, train loss: 0.235536
epoch: 31, train loss: 0.235534
epoch: 32, train 

In [9]:
trainDataPath = "./dataset/train_raw_preprocessed.csv"
tdf = pd.read_csv(trainDataPath)
ty = tdf["Transported"]
tdf = tdf.drop("Transported",axis=1)

train_denoised = test(encoder,decoder,df=tdf)
train_denoised["Transported"] = ty
train_denoised.to_csv("./dataset/train_denoised.csv",index=False)
testDataPath = "./dataset/test_preprocessed.csv"

df = pd.read_csv(testDataPath)
y = df["PassengerId"]
df = df.drop("PassengerId",axis=1)
denoised = test(encoder,decoder,df=df)
denoised["PassengerId"] = y
denoised.to_csv("./dataset/test_denoised.csv",index=False)

torch.Size([8693, 23])
torch.Size([4277, 23])
