<a href="https://colab.research.google.com/github/HanhengHe/BigDataCourseProj/blob/main/bigdataproj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# GET FILE

from google.colab import drive
import zipfile

drive.mount('/content/drive/', force_remount=False)

with zipfile.ZipFile("/content/drive/MyDrive/Dataset_small/data1.zip","r") as zip_ref:
    zip_ref.extractall("targetdir")

Mounted at /content/drive/


In [34]:
# LOADING TRAINING DATA

lsTrainLabel = []
lsTrainData = []

with open("/content/targetdir/data1/train_label.txt") as f:
  for line in f.readlines():
    expression = line.split(' ')
    lsTrainLabel.append(int(expression[1]))
    dataRGB = np.load("/content/targetdir/data1/train/" + expression[0]).transpose((2,0,1))
    lsTrainData.append(dataRGB)
    # turn training data into grayscale data
    # lsData.append((dataRGB[:, :, 0] + dataRGB[:, :, 1] + dataRGB[:, :, 2]) / 3)

print("Number of traning data: {0}".format(len(lsTrainLabel)), end='\n')
print("Shape of traning data: {0}".format(lsTrainData[0].shape), end='\n')

Number of traning data: 50000
Shape of traning data: (3, 32, 32)


In [31]:
# LOAD TEST DATA

lsTestLabel = []
lsTestData = []

with open("/content/targetdir/data1/test_label.txt") as f:
  for line in f.readlines():
    expression = line.split(' ')
    lsTestLabel.append(int(expression[1]))
    dataRGB = np.load("/content/targetdir/data1/test/" + expression[0]).transpose((2,0,1))
    lsTestData.append(dataRGB)
    # turn training data into grayscale data
    # lsData.append((dataRGB[:, :, 0] + dataRGB[:, :, 1] + dataRGB[:, :, 2]) / 3)

print("Number of testing data: {0}".format(len(lsTestLabel)), end='\n')
print("Shape of testing data: {0}".format(lsTestData[0].shape), end='\n')

Number of testing data: 10000
Shape of testing data: (3, 32, 32)


In [89]:
# NETWORK DEFINE MODULE LET-NET

import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
import torch.nn.functional as F

torch.cuda._initialized = True

class LeNet(nn.Module):
    def __init__(self, nClasses):
        super().__init__()

        self.cLayer1 = nn.Sequential(
          nn.Conv2d(3, 64, 5, stride=1, padding=2),
          nn.BatchNorm2d(64),
          nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.cLayer2 = nn.Sequential(
          nn.Conv2d(64, 128, 5, stride=1, padding=2),
          nn.BatchNorm2d(128),
          nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.cLayer3 = nn.Sequential(
          nn.Conv2d(128, 256, 5, stride=1, padding=2),
          nn.BatchNorm2d(256),
          nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.fc = nn.Sequential(
            nn.Linear(4096, 2400),
            nn.ReLU(),
            nn.Dropout(),

            nn.Linear(2400, 740),
            nn.ReLU(),
            nn.Dropout(),

            nn.Linear(740, 74),
            nn.ReLU(),
            nn.Dropout(),

            nn.Linear(74, nClasses)
        )

      
    # forward propagation
    def forward(self, X):

        # convolutional layers
        X = self.cLayer1(X)
        X = self.cLayer2(X)
        X = self.cLayer3(X)

        # unfold
        X = torch.flatten(X, start_dim=1)

        # full connect layers
        out = self.fc(X)

        # output
        out = F.log_softmax(out, dim=-1)

        return out

In [102]:
# TRAINING MODULE

import math

NUMCLASS= 10
CPU = torch.device("cpu")
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else CPU
print("DEVICE = %s" % (DEVICE))
torch.cuda.empty_cache()

trainLabels = np.array(lsTrainLabel)
trainSet = np.array(lsTrainData)

trainSet = torch.from_numpy(trainSet).float()
trainLabels = torch.from_numpy(trainLabels)
# trainLabels = torch.nn.functional.one_hot(torch.from_numpy(trainLabels), num_classes=NUMCLASS)

MINIBATCH_SIZE = 512    # mini batch size

# first transform the data to dataset can be processed by torch
torch_dataset = Data.TensorDataset(trainSet, trainLabels.long())
# put the dataset into DataLoader
loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=MINIBATCH_SIZE,
    shuffle=True,
    num_workers=0
)

model = LeNet(NUMCLASS).to(DEVICE)

Epochs = 100
nSteps = math.ceil(trainLabels.shape[0] / MINIBATCH_SIZE)
MINLOSS = 0.01
outputFrequence = 10
LEARNING_RATE = 0.01

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
# optimizer = optim.Adam(model.parameters())

model.train()
for epoch in range(Epochs):

    nCurLoss = 0

    for step, (batch_x, batch_y) in enumerate(loader):
        optimizer.zero_grad()        
        output = model(batch_x.to(DEVICE))
        loss = F.nll_loss(output, batch_y.to(DEVICE))
        loss.backward()
        optimizer.step()
        nCurLoss += loss.item() / nSteps
    if (epoch + 1) % outputFrequence == 0:
        LEARNING_RATE = LEARNING_RATE / math.pow((1 + 10 * (epoch - 1) / Epochs), 0.75)
        optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)

        print("Epoch {0}, general loss: {1}, lr = {2:.5}".format(epoch+1, nCurLoss, LEARNING_RATE))
    if float(nCurLoss) <= MINLOSS:
        break

DEVICE = cuda:0
Epoch 10, general loss: 1.0042232950122987, lr = 0.006435
Epoch 20, general loss: 0.5061895926387945, lr = 0.0029729
Epoch 30, general loss: 0.15264221501289585, lr = 0.0010923
Epoch 40, general loss: 0.044254567531146566, lr = 0.00033683
Epoch 50, general loss: 0.02888574664081847, lr = 9.0123e-05
Epoch 60, general loss: 0.024998682902707726, lr = 2.1402e-05
Epoch 70, general loss: 0.025208370978658905, lr = 4.5855e-06
Epoch 80, general loss: 0.025168614861156253, lr = 8.9748e-07
Epoch 90, general loss: 0.02459499852883877, lr = 1.6203e-07
Epoch 100, general loss: 0.025427948075289624, lr = 2.7198e-08


In [103]:
# TEST MODULE

testLabels = np.array(lsTestLabel)
testSet = np.array(lsTestData)

testSet = torch.from_numpy(testSet).float()
testLabels = torch.from_numpy(testLabels).to(DEVICE)

model.eval()
test_loss = 0
correct = 0

# first transform the data to dataset can be processed by torch
torch_dataset = Data.TensorDataset(testSet, testLabels.long())
# put the dataset into DataLoader
loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=MINIBATCH_SIZE,
    shuffle=True,
    num_workers=0
)

with torch.no_grad():
    for step, (batch_x, batch_y) in enumerate(loader):
        output = model(batch_x.to(DEVICE))
        test_loss += F.nll_loss(output, batch_y, reduction='sum').item()  # sum a batch of loss
        predict = output.max(1, keepdim=True)[1]  # find the prediction
        correct += predict.eq(batch_y.view_as(predict)).sum().item()

testSize = testLabels.shape[0]
test_loss /= testSize
print("Test: Average loss:%s, Accuracy: %s/%s (%s)"
      % (test_loss, correct, testSize, correct / testSize))

Test: Average loss:1.8070598052978515, Accuracy: 7023/10000 (0.7023)
