## ECE 285 - Final Project
## Training a neural network to predict image category using functional magnetic resonance imaging (fMRI) signals.
### Instructor: Xiaolong WANG
*****************************************************************************************************************
Shayne Wang, A15776202, June 2023
******************************************************************************************************************

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.utils.data import TensorDataset
import numpy as np
import pandas as pd

#from torch.utils.data import Dataset
#import torchvision
#import torchvision.transforms as transforms
#import torchvision.transforms as T

# Option to use GPU
USE_GPU = True  

dtype = torch.float32 

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('using device:', device)

Tensor = torch.cuda.FloatTensor if USE_GPU else torch.FloatTensor
np.random.seed(0)

using device: cuda


In [3]:
###################################################################################
#  Section 1: DATA IMPORT
#  We import the dataset in .csv format and convert each into a data frame
#  The dataset has been divided into 4 files. Note that each file has different
#  number of columns.
#  Generate X and Y in numpy from data frame
###################################################################################

def importData(file_name):
    """
    Convert data in a csv file to numpy
    
    Inputs:
    - one of the csv file name for dataset
    
    Returns: 
    - X: 2-D tensor, float32 
    - Y: 1-D tensor, int
    - class_dict: a dictionary for the classes in this dataset
    """
    # read in csv file and convert to data frame
    pd_fMRI = pd.read_csv(file_name)

    # display the 5 lines of the data frame
    print('*'*100)
    print(pd_fMRI.head())

    # generate X and Y in numpy
    X = pd_fMRI.values[:,2:-1].astype(float)
    Y = pd_fMRI.values[:,-1]
    print('shape of data: ', X.shape, Y.shape)

    # filter out the name of categories
    Y_class = set(Y.tolist())
    # create a dictionary for category vs index
    class_dict = dict(zip(Y_class,range(len(Y_class))))
    print('class in this dataset:\n',class_dict)
    print('*'*100)
    # convert category Y into integer since PyTorch doesn't have dtype for string
    Y = np.array([class_dict.get(a_class) for a_class in Y.tolist()])
    return Tensor(X),Tensor(Y),class_dict

# read out 4 csv files
X_data_1, Y_data_1, class_dict_1 = importData('df_merged_subject1.csv')
X_data_2, Y_data_2, class_dict_2 = importData('df_merged_subject2.csv')
X_data_3, Y_data_3, class_dict_3 = importData('df_merged_subject3.csv')
X_data_4, Y_data_4, class_dict_4 = importData('df_merged_subject4.csv')

****************************************************************************************************
   subject                         image_id  LHEarlyVis#0  LHEarlyVis#1  \
0        1             n01930112_19568.JPEG      0.167271      1.173377   
1        1             n03733281_29214.JPEG      1.258984      0.744704   
2        1              n07695742_5848.JPEG      0.279666     -0.164892   
3        1  COCO_train2014_000000420713.jpg      0.473376     -0.299339   
4        1  COCO_train2014_000000488558.jpg      0.224416      1.852141   

   LHEarlyVis#2  LHEarlyVis#3  LHEarlyVis#4  LHEarlyVis#5  LHEarlyVis#6  \
0     -0.482830      0.561836      0.487629     -1.366299      0.526726   
1      0.264117     -0.199035      0.221795     -1.114712      0.549931   
2     -0.550474      0.587374      0.319142     -0.022280      1.146169   
3      0.365422      0.443424      0.986940      0.916352      0.656573   
4      1.087473     -0.393302      0.129446      0.858346     -0.352569  

In [4]:
###################################################################################
#  Section 2: DATA PREPROCESSING AND SPLIT
#  Preprocess the data in this section
#  Split the dataset into training:validation:test = 80%:10%:10%. 
###################################################################################

def createLoader(X_data,Y_data,batch_size):
    """
    Generate 3 dataloader for training, val and test sets.
    
    Inputs:
    - X_data,Y_data: tensor, dataset ready for split
    - batch_size: int, number of batch size fed to the model at a time
    
    Returns: 
    - 3 loader respectively for training, val and test sets 
    """
    # Encapsulate as a dataset (X,Y) ready to split
    dataset = TensorDataset(X_data, Y_data)
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_dataloader,valid_dataloader,test_dataloader


In [5]:
###################################################################################
#  Section 3: A TRAINING LOOP AND CHECK ACCURACY FUNCTIONS
#  Used to train any SELF-DEFINED MODEL and return accuracy on training set
#  check_accuracy_part34 return accuracy of dataloadfer.
###################################################################################
# use dataloader to feed the model
def train_part34(model, train_dataloader, valid_dataloader, optimizer, epochs=1):
    """
    A training loop
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - train_dataloader, valid_dataloader: the data loaders for training and val sets.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: The accuracy of the model
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for t, (x, y) in enumerate(train_dataloader):
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if (t + 1) % print_every == 0:
                print('Epoch %d, Iteration %d, loss = %.4f' % (e, t + 1, loss.item()))
                check_accuracy_part34(valid_dataloader, model, isTestSet=False)
                print()
    return check_accuracy_part34(valid_dataloader, model, isTestSet=False)


def check_accuracy_part34(loader, model, isTestSet=True):
    
    if isTestSet:
        print('Checking accuracy on test set')
    else:
        print('Checking accuracy on validation set') 
    
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc

In [6]:
###################################################################################
#  Section 4: Model 1
#  Define a 2-layer MLP. 
#  This is a baseline
###################################################################################

learning_rate = 1e-3
hidden_layer_size = 256
epochs = 100
batch_size = 512

# Constant to control how frequently we print train loss
print_every = 100

def testModel1(X_data, Y_data, class_dict):
    """
    Define, train and test a MLP model 
    
    Inputs:
    - Tensors and the dictionary for a csv file
    
    Returns: NO return. But it prints out the loss during training process
             and accuracy of val and test set for the model
    """
    # the 4 csv files contain different features and classes
    input_dim = X_data.shape[1]
    num_class = len(class_dict_1)
    
    train_dataloader, valid_dataloader, test_dataloader = createLoader(X_data,Y_data,batch_size)
    
    model1 = None
    # define and modify the architecture
    model1 = nn.Sequential(
        #nn.BatchNorm1d(input_dim),
        nn.Linear(input_dim, hidden_layer_size),
        nn.ReLU(),
        nn.Linear(hidden_layer_size, num_class),
        nn.ReLU()
    )

    optimizer = torch.optim.SGD(model1.parameters(), lr=learning_rate,
                     momentum=0.9, nesterov=True)

    train_part34(model1, train_dataloader, valid_dataloader, optimizer, epochs=epochs)
    check_accuracy_part34(test_dataloader, model1)
    print('*'*60)

#test Model1 upon 4 csv files
print('*'*60)
testModel1(X_data_1, Y_data_1, class_dict_1)
testModel1(X_data_2, Y_data_2, class_dict_2)
testModel1(X_data_3, Y_data_3, class_dict_3)
testModel1(X_data_4, Y_data_4, class_dict_4)

************************************************************
Checking accuracy on validation set
Got 289 / 502 correct (57.57)
Checking accuracy on test set
Got 290 / 504 correct (57.54)
************************************************************
Checking accuracy on validation set
Got 254 / 502 correct (50.60)
Checking accuracy on test set
Got 273 / 504 correct (54.17)
************************************************************
Checking accuracy on validation set
Got 271 / 502 correct (53.98)
Checking accuracy on test set
Got 269 / 504 correct (53.37)
************************************************************
Checking accuracy on validation set
Got 163 / 300 correct (54.33)
Checking accuracy on test set
Got 164 / 302 correct (54.30)
************************************************************
