# Baseline Code for HW1

This is just the baseline code to set up the basic function you need. You need to modify the code yourself to achieve a better result.

## Import packages you need

In [1]:
# import package 
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import csv

In [2]:
# Setting seeds
myseed=666
torch.manual_seed(myseed)
# if you use numpy
np.random.seed(myseed)

## Basic Function
Do not modify this part

In [3]:
class EarlyStopper(object):
    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = 1000000.0
        self.save_path = save_path

    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model.state_dict(), self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False

def cal_loss(loader):
    pres = []
    labels = []
    for field, label in loader:
        field, label = field.float(), label.float()
        prediction = model(field)
        pres.append(prediction)
        labels.append(label)
    pres = torch.cat(pres, dim=0)
    labels = torch.cat(labels, dim=0)
    loss = criterion(pres, labels)
    return loss

def predict(test_loader, model):
    pres = []
    for field in test_loader:
        field = field.float()
        prediction = model(field)
        pres.append(prediction)
    pres = torch.cat(pres, dim=0)
    return pres.detach().numpy()


In [4]:
# Loading dataset
class ReadDataset(Dataset):
    def __init__(self, path, is_test=False):
        super().__init__()
        self.is_test = is_test
        self.field = pd.read_csv(path, index_col=0)
        self.field.dropna(axis=0, how='any', inplace=True)
        assert not self.field.isnull().values.any()
        self.field = self.field.values
        if not is_test:
            self.label = self.field[:,-1]
            self.field = self.field[:,:-1]


    def __len__(self):
        return len(self.field)

    def __getitem__(self, item):
        field = self.field[item]
        if not self.is_test:
            label = self.label[item]
            return field, label
        return field

## Define DNN by pytorch

In [5]:
class Net(torch.nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = self.linear(x)
        x = self.sig(x).squeeze()
        return x

# Selecting Appropriate Hyperparameters

In [6]:
# hyper-parameters
# your batch size
batch_size = 100
# your learning rate
lr = 0.0001
# a parameter used for splitting train and validation set
split_ratio = 0.7
# maximum training epochs
epochs = 1000
# early stop step in training
num_trials = 100
# path for your trained model
save_path = "./model.pt"
# path for your predictions
test_path = "predictions.txt"

In [7]:
# Loading dataset
train_dataset = ReadDataset("./data/train.csv")
test_data = ReadDataset("./data/test.csv", is_test=True)
len_train = len(train_dataset)

In [8]:
split_num = [int(len_train*split_ratio), len_train-int(len_train*split_ratio)]
train_data, val_data = random_split(
                            dataset=train_dataset,
                            lengths=split_num,
                            generator=torch.Generator().manual_seed(myseed)
                        )
print("Num of Samples: Train: {}, Validation: {}, Test: {}".format(len(train_data), len(val_data), len(test_data)))

Num of Samples: Train: 10479, Validation: 4492, Test: 9982


In [9]:
class_weights = [1, 3]
sample_weights = [0] * len(train_data)

In [10]:
for idx, (data, label) in enumerate(train_data):
    class_weight = class_weights[int(label)]  # Convert label to integer
    sample_weights[idx] = class_weight

sampler = torch.utils.data.WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

In [11]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, sampler=sampler)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [12]:
# Loss
criterion = nn.BCELoss()

In [13]:
input_dim = train_data[0][0].shape[0] 
model = Net(input_dim)
# your optimizer
optimizer = torch.optim.RMSprop(
        params=model.parameters(), lr=lr)

In [14]:
# Training

model.train()
early_stopper = EarlyStopper(num_trials, save_path)


for epoch in range(epochs):
    train_loss = 0
    train_correct = 0
    train_total = 0

    for field, label in train_loader:
        field, label = field.float(), label.float()
        prediction = model(field)
        loss = criterion(prediction, label)
        model.zero_grad()
        loss.backward()
        optimizer.step()

        predicted_labels = (prediction >= 0.5).float()
        train_correct += (predicted_labels == label).sum().item()
        train_total += len(label)

    train_accuracy = train_correct / train_total

    # calculate validation
    train_loss = cal_loss(train_loader)
    val_loss = cal_loss(val_loader)
    val_correct = 0
    val_total = 0

    for field, label in val_loader:
        field, label = field.float(), label.float()
        prediction = model(field)
        predicted_labels = (prediction >= 0.5).float()
        val_correct += (predicted_labels == label).sum().item()
        val_total += len(label)

    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    if not early_stopper.is_continuable(model, loss):
        break

# predicting on the test set
model.eval()
test_predict = predict(test_loader, model) >= 0.5

Epoch 1: Train Loss: 36.6351, Train Accuracy: 0.4966, Val Loss: 39.9958, Val Accuracy: 0.4873
Epoch 2: Train Loss: 36.2468, Train Accuracy: 0.5199, Val Loss: 39.0881, Val Accuracy: 0.4982
Epoch 3: Train Loss: 35.5345, Train Accuracy: 0.5239, Val Loss: 38.2800, Val Accuracy: 0.5020
Epoch 4: Train Loss: 34.6294, Train Accuracy: 0.5309, Val Loss: 37.6971, Val Accuracy: 0.5089
Epoch 5: Train Loss: 34.5530, Train Accuracy: 0.5390, Val Loss: 36.9670, Val Accuracy: 0.5131
Epoch 6: Train Loss: 33.1262, Train Accuracy: 0.5394, Val Loss: 36.1023, Val Accuracy: 0.5207
Epoch 7: Train Loss: 32.4275, Train Accuracy: 0.5453, Val Loss: 35.2016, Val Accuracy: 0.5278
Epoch 8: Train Loss: 31.8707, Train Accuracy: 0.5432, Val Loss: 34.1061, Val Accuracy: 0.5298
Epoch 9: Train Loss: 30.1096, Train Accuracy: 0.5605, Val Loss: 32.8471, Val Accuracy: 0.5341
Epoch 10: Train Loss: 29.2162, Train Accuracy: 0.5611, Val Loss: 31.3766, Val Accuracy: 0.5405
Epoch 11: Train Loss: 27.5906, Train Accuracy: 0.5588, Val 

In [15]:
test_predict.sum()

310

In [16]:
def save_pred(preds, file):
    print('Saving results to {}'.format(file))
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])
save_pred([int(x) for x in test_predict], 'prediction.csv')         # save prediction file to pred.csv

Saving results to prediction.csv


# Hints:

Utilize a New, Powerful Optimizer 

Improve Model Structure

Employ Proper Hyper-Paremeter

Feature Selection

# Rules:

Ensemble models are not allowed.

You may use NumPy or Torch to implement other models such as SVM, but importing other packages is prohibited.
