李宏毅.作业一数据集处理部分

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import csv
import pandas as pd
import numpy as np

In [2]:
# Locations of the training and test CSV files, relative to the working directory.
tr_path = './covid.train.csv'
tt_path = './covid.test.csv'

In [3]:
# Read every CSV row as a list of strings. newline='' is what the csv module
# expects so that it can handle line endings (including ones embedded in
# quoted fields) on its own.
with open(tr_path, 'r', newline='') as fp:
    raw_data = list(csv.reader(fp))

# Drop the header row and the first column, then convert the remaining cells
# to floats. This runs after the file has been closed because it no longer
# needs the handle.
raw_data = np.array(raw_data[1:])[:, 1:].astype(float)

In [4]:
# Column indices (into raw_data, i.e. after the first CSV column was dropped)
# of the feature subset used for the experiments below.
feats = [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77]
# Display those feature values for the first data row.
raw_data[0][feats]

array([20.1518381, 19.586492 , 25.6489069, 25.6791006, 26.0605436,
       21.2420632, 21.2802696, 21.5038315,  0.81461  ,  0.8389952,
        0.8978015,  0.7713562,  0.8077665,  0.8878931])

In [5]:
# Same lookup written as a single (row, columns) index; the printed values
# match those of raw_data[0][feats].
raw_data[0,feats]

array([20.1518381, 19.586492 , 25.6489069, 25.6791006, 26.0605436,
       21.2420632, 21.2802696, 21.5038315,  0.81461  ,  0.8389952,
        0.8978015,  0.7713562,  0.8077665,  0.8878931])

In [6]:
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset.

    The CSV at ``path`` is read with its header row and first column
    discarded; every remaining cell is parsed as a float.

    Args:
        path: Path of the CSV file to load.
        mode: One of ``'train'``, ``'dev'`` or ``'test'``. In ``'train'`` and
            ``'dev'`` the last column is taken as the regression target and
            samples are ``(features, target)`` pairs; in ``'test'`` there is
            no target and samples are feature tensors only.
        target_only: If True, keep only the 14 selected feature columns.
            If False, keep the first 93 columns and standardize columns
            40 onward.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    '''
    def __init__(self,
                 path,
                 mode='train',
                 target_only=True):
        # Reject unknown modes up front; otherwise the failure would surface
        # later as an unrelated-looking unbound-variable error.
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into a numpy array (header row and first column dropped).
        # newline='' is the csv module's documented requirement for file objects.
        with open(path, 'r', newline='') as fp:
            rows = list(csv.reader(fp))
        data = np.array(rows[1:])[:, 1:].astype(float)

        if not target_only:
            # Use the first 93 columns as features.
            feats = list(range(93))
        else:
            # Feature subset chosen by the earlier feature-selection step.
            feats = [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77]

        if mode == 'test':
            # Testing data: no target column is extracted.
            self.data = torch.FloatTensor(data[:, feats])
        else:
            # Training data (train/dev sets): the last column is the target.
            target = data[:, -1]
            data = data[:, feats]

            # Once the hyper-parameters are fixed, all rows are used for
            # training, so 'train' and 'dev' deliberately see the same data.
            # (To hold out a dev set, keep rows with i % 10 != 0 for 'train'
            # and rows with i % 10 == 0 for 'dev'.)
            self.data = torch.FloatTensor(data)
            self.target = torch.FloatTensor(target)

        # Normalize features (you may remove this part to see what will happen).
        # Only done when target_only is False: with the 14-column subset there
        # are no columns at index 40 or beyond, so there is nothing to scale.
        # The first 40 columns are left untouched (in the sample output they
        # hold 0/1 indicator values).
        if not target_only:
            scaled_part = self.data[:, 40:]
            self.data[:, 40:] = \
                (scaled_part - scaled_part.mean(dim=0, keepdim=True)) \
                / scaled_part.std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        ''' Return one sample: (features, target) for train/dev, features only for test. '''
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        ''' Return the number of samples in the dataset. '''
        return len(self.data)

In [7]:
# Build the training dataset restricted to the selected feature subset (target_only=True).
tr_dataset = COVID19Dataset(tr_path, mode='train', target_only=True)

Finished reading the train set of COVID19 Dataset (2700 samples found, each dim = 14)


In [8]:
# Inspect the first sample; in 'train' mode this is a (features, target) pair.
tr_dataset[0]

(tensor([20.1518, 19.5865, 25.6489, 25.6791, 26.0605, 21.2421, 21.2803, 21.5038,
          0.8146,  0.8390,  0.8978,  0.7714,  0.8078,  0.8879]),
 tensor(20.7049))

In [9]:
# Build the training dataset with all 93 feature columns (target_only=False).
tr_dataset_all = COVID19Dataset(tr_path, mode='train', target_only=False)

Finished reading the train set of COVID19 Dataset (2700 samples found, each dim = 93)


In [10]:
# Inspect the first sample of the full-feature dataset: a (features, target) pair.
tr_dataset_all[0]

(tensor([ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         -0.4211, -0.5778, -0.4172, -0.3648, -0.9364,  1.3419,  0.9772,  1.7371,
          1.2597,  1.2648,  1.4035, -0.6428, -1.1475, -0.6367, -0.8175, -1.7962,
         -0.2355,  0.4302, -0.3703, -0.4991, -0.4239, -0.3698, -1.0681,  1.3591,
          1.0116,  1.7946,  1.2462,  1.2317,  1.3125, -0.6879, -1.2323, -0.6249,
         -0.9886, -1.8433, -0.1755,  0.4962, -0.2384, -0.3176, -0.3914, -0.3525,
         -1.0000,  1.2458,  0.9802,  1.6167,  1.2800,  1.1052,  1.2577, -0.7436,
         -1.1814, -0.5957, -0.9383, -1.7432, -0.1843]),
 tensor(20.7049))

In [11]:
# Number of samples in the full-feature training dataset.
len(tr_dataset_all)

2700

## dataloader

In [14]:
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False, drop_last=False):
    ''' Build a COVID19Dataset from a CSV file and wrap it in a DataLoader.

    Args:
        path: CSV file handed to COVID19Dataset.
        mode: Dataset mode, forwarded to COVID19Dataset; batches are shuffled
            only when it equals 'train'.
        batch_size: Number of samples per batch.
        n_jobs: Number of DataLoader worker processes.
        target_only: Forwarded to COVID19Dataset.
        drop_last: Whether to discard a final batch smaller than batch_size.

    Returns:
        The configured DataLoader.
    '''
    covid_data = COVID19Dataset(path, mode=mode, target_only=target_only)

    # Only the training split is reshuffled between epochs.
    is_training = (mode == 'train')
    loader_options = dict(
        shuffle=is_training,
        drop_last=drop_last,
        num_workers=n_jobs,
        pin_memory=True,
    )
    return DataLoader(covid_data, batch_size, **loader_options)

In [13]:
# Iterate once over the training loader (final partial batch kept), printing
# each batch's shapes and tallying how many samples were seen in total.
tr_set = prep_dataloader(path=tr_path, mode='train', batch_size=256,
                         target_only=True)
l = 0
for x, y in tr_set:
    print(x.shape, y.shape)
    l = l + x.size(0)
print(l)

Finished reading the train set of COVID19 Dataset (2700 samples found, each dim = 14)
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([140, 14]) torch.Size([140])
2700


In [15]:
# Same walk over the training loader, but with drop_last=True so that a final
# batch smaller than 256 is discarded; the printed total reflects that.
tr_set = prep_dataloader(path=tr_path, mode='train', batch_size=256,
                         target_only=True, drop_last=True)
l = 0
for x, y in tr_set:
    print(x.shape, y.shape)
    l = l + x.size(0)
print(l)

Finished reading the train set of COVID19 Dataset (2700 samples found, each dim = 14)
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
torch.Size([256, 14]) torch.Size([256])
2560
