In [1]:
%pip install torch tensorboardX tensorboard pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader, SubsetRandomSampler
from tensorboardX import SummaryWriter
import os
from time import perf_counter

In [None]:

class CustomDataSet(Dataset):
    def __init__(self, x : np.ndarray,y : np.ndarray):
        # x: numpy array
        # y: numpy array
        self.data = x
        self.label = y
        
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index : int):
        data = self.data[index]
        label = self.label[index]
        
        return data, label

In [None]:
class Net(nn.Module):
  def __init__(self,input_shape,layerWidth=512,dropoutRate=0.3):
    """
    Extend of nn.Module
    Wide Neural Network 
    layerWidth: width of the hidden layers
    input_shape: shape of the input
    Binary Classification
    """
    super(Net,self).__init__()
    self.dropoutRate = dropoutRate
    self.fc1 = nn.Linear(input_shape,layerWidth)
    self.fc2 = nn.Linear(layerWidth,layerWidth)
    self.fc3 = nn.Linear(layerWidth,1)
  def forward(self,x):
    # self.training is with nn.Module
    x = torch.relu(self.fc1(x))
    x = torch.dropout(x,self.dropoutRate,train=self.training)
    x = torch.relu(self.fc2(x))
    x = torch.dropout(x,self.dropoutRate,train=self.training)
    x = torch.sigmoid(self.fc3(x))
    return x


In [None]:
# path of CSV file
processedDataDirectoryPath = "processedData"
trainCSVPath = os.path.join(processedDataDirectoryPath,'processed_train_data.csv')
testCSVPath = os.path.join(processedDataDirectoryPath,'processed_test_data.csv')
# read CSV file
# split data and label
label = "smoking"
df = pd.read_csv(trainCSVPath)
y = df[label]
df = df.drop(label, axis=1)
# convert to numpy array
x = df.values
y = y.values
print(x.shape)


(159256, 10)


In [None]:
# Hyperparameters
layerWidth = 512
lr = 0.05
batchSize = 2048
dropout = 0.3
model = Net(x.shape[1],dropoutRate=dropout)
# May try otu different optimizers
optimizer = optim.SGD(model.parameters(), lr=lr)
loss_fn = nn.BCELoss()
validation_split = .2 # 20% of the data is used for validation
dataset = CustomDataSet(x,y)
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
train_indices, val_indices = indices[split:], indices[:split]
# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batchSize,sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batchSize,sampler=valid_sampler)
train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
val_features, val_labels = next(iter(validation_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")


Feature batch shape: torch.Size([2048, 10])
Labels batch shape: torch.Size([2048])
Feature batch shape: torch.Size([2048, 10])
Labels batch shape: torch.Size([2048])


In [None]:
def validationTest(model, val_loader):
    acc = 0
    loss = 0
    model.eval()
    with torch.no_grad():
        for (data, target) in val_loader:
            data = data.to(torch.float32)
            target = target.to(torch.float32)
            target = target.unsqueeze(1)
            output = model(data)
            loss = loss_fn(output,target)
            acc += (output.reshape(-1).detach().numpy().round() == target.reshape(-1).detach().numpy()).mean()
        loss = loss.item()
    return acc/len(val_loader), loss
    

In [None]:
def train(model,epochs = 10, model_path = "./model.pth",log_path = "log_pytorch"):
   
    tb = SummaryWriter(f"./train_{log_path}/")  # training tensorboard
    vtb = SummaryWriter(f"./val_{log_path}/")  # validation tensorboard

    # put the model into training mode
    model.train()
    outputFormat = "{:^10}|{:^10}|{:^10}|{:^10}|{:^10}|{:10}"
    print(outputFormat.format("Mode","Epoch","Loss","Accuracy","Time Used","Progress"))
    outputFormat = "{:^10}|{:^10.0f}|{:^10.4f}|{:^10.4f}|{:^10.2f}|{:^10}"
    
    for epoch in range(1, epochs + 1):
        startTime = perf_counter()
        length = len(train_loader)
        acc = 0
        loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(torch.float32)
            target = target.to(torch.float32)
            target = target.unsqueeze(1)
            optimizer.zero_grad()
            #calculate output
            output = model(data)
            loss = loss_fn(output,target)
            acc += (output.reshape(-1).detach().numpy().round() == target.reshape(-1).detach().numpy()).mean()
            #backprop
            loss.backward()
            optimizer.step()
            print(outputFormat.format("Train",epoch,loss.item(),acc/(batch_idx+1),perf_counter() - startTime,f"{batch_idx+1}/{length}"),end="\r")
        trainAcc = acc/len(train_loader)
        tb.add_scalar("epoch loss", loss.item(), epoch)
        tb.add_scalar("epoch accuracy", trainAcc, epoch)
        print(outputFormat.format("Train",epoch,loss.item(),trainAcc,perf_counter() - startTime,f"{batch_idx+1}/{length}"))

        val_Acc , val_Loss = validationTest(model,validation_loader)
        print(outputFormat.format("Validation",epoch,val_Loss,val_Acc,perf_counter() - startTime,"Done"))
        vtb.add_scalar("epoch loss", val_Loss, epoch)
        vtb.add_scalar("epoch accuracy", val_Acc, epoch)
        for name, weight in model.named_parameters():
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram(f'{name}.grad',weight.grad, epoch)
    # save the model to a .pth file
    print('Saving NN to %s' % model_path)
    torch.save(model.state_dict(), model_path)
    # add graph to tensorboard
    tb.add_graph(model, (data,))

In [None]:
def modelPred(model,test_data,model_path="./model.pth"):

    model.load_state_dict(torch.load(model_path))
    
    # put model into test mode
    model.eval()
    with torch.no_grad():
    
        output = model(torch.tensor(test_data,dtype=torch.float32))
        pred = output.reshape(-1).detach().numpy()
        
        return pred

In [None]:
strLR =  str(lr).replace(".","-")
epoch = 3000
modelStr = f"model_{batchSize}b_{strLR}lr_{layerWidth}w_{epoch}e"
modelPath = f"./{modelStr}.pth"
train(model,epochs=epoch,model_path=modelPath,log_path=modelStr)


   Mode   |  Epoch   |   Loss   | Accuracy |Time Used |Progress  
  Train   |    1     |  0.6706  |  0.5603  |   1.91   |  63/63   
Validation|    1     |  0.6729  |  0.5662  |   2.06   |   Done   
  Train   |    2     |  0.6382  |  0.5958  |   1.27   |  63/63   
Validation|    2     |  0.6379  |  0.6154  |   1.43   |   Done   
  Train   |    3     |  0.6103  |  0.6600  |   1.13   |  63/63   
Validation|    3     |  0.6017  |  0.6852  |   1.32   |   Done   
  Train   |    4     |  0.5863  |  0.6869  |   1.07   |  63/63   
Validation|    4     |  0.5809  |  0.7008  |   1.28   |   Done   
  Train   |    5     |  0.5986  |  0.6979  |   1.26   |  63/63   
Validation|    5     |  0.5684  |  0.7040  |   1.41   |   Done   
  Train   |    6     |  0.5623  |  0.7060  |   1.02   |  63/63   
Validation|    6     |  0.5449  |  0.7035  |   1.16   |   Done   
  Train   |    7     |  0.5439  |  0.7114  |   0.99   |  63/63   
Validation|    7     |  0.5810  |  0.7021  |   1.13   |   Done   
  Train   

In [None]:
testModel = Net(x.shape[1])
test_data = pd.read_csv(testCSVPath)
Id = "id"
y = test_data[Id]
test_data = test_data.drop(Id,axis=1)
predict = modelPred(testModel,test_data=test_data.values,model_path=modelPath)
print(predict)
mapped = {"id":y.values,"smoking" : predict}

df = pd.DataFrame.from_dict(mapped)
df.to_csv("result.csv",index=False)

[0.6939863  0.35345128 0.5978445  ... 0.5620101  0.1402953  0.02095972]
