# Excel/Csv文件数据转成PyTorch张量导入模型代码逐行讲解

来自b站up主deep_thoughts 合集【PyTorch源码教程与前沿人工智能算法复现讲解】

P_52_Excel/Csv文件数据转成PyTorch张量导入模型代码逐行讲解：

https://www.bilibili.com/video/BV1nR4y1N7Vj/?spm_id_from=333.788&vd_source=18e91d849da09d846f771c89a366ed40

In [1]:
# 安装以下库
# pip install openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip install torch torchvision torchaudio -i https://pypi.tuna.tsinghua.edu.cn/simple

In [4]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas

## 编写ExcelDataset

In [5]:
class ExcelDataset(Dataset):
    
    def __init__(self, filepath="data.xlsx", sheet_name=0):
        
        print(f"reading {filepath}, sheet={sheet_name}")
        
        df = pandas.read_excel(
            filepath, header=0, index_col=0,
            names=['feat1', 'feat2', 'label'],
            sheet_name=sheet_name,
            dtype={"feat1": np.float32, "feat2": np.float32, "label": np.int32}
        )
        
        print(f"the shape of dataframe is {df.shape}")
        
        feat = df.iloc[:, :2].values
        label = df.iloc[:, 2].values
        
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [None]:
# Smoke test

print("Test for ExcelDataset")
excel_dataset = ExcelDataset(sheet_name="corpus1")
# excel_dataset = ExcelDataset(sheet_name="corpus2")
# excel_dataset = ExcelDataset(sheet_name=None)  # read the whole workbook; with several sheets pandas.read_excel returns a dict
excel_dataloader = DataLoader(dataset=excel_dataset, shuffle=True, batch_size=8)
for idx, batch in enumerate(excel_dataloader):
    batch_x, batch_y = batch
    print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
    print(batch_x, batch_y)

    # The lines below are pseudocode for a training step
    # output = model(batch_x)
    # loss = criterion(output, batch_y)
    # optimizer.zero_grad()
    # loss.backward()
    # optimizer.step()

## 编写CsvDataset 方法一

In [6]:
class CsvDataset(Dataset):
    """Map-style dataset backed by a CSV file, parsed with pandas.

    The header row is replaced by the column names ``feat1``, ``feat2`` and
    ``label``; ``index_col=0`` tells pandas to use the first column as the
    row index. Features are read as float32 and labels as int32.
    """

    def __init__(self, filepath="data.csv"):
        """Parse ``filepath`` (UTF-8) and keep its contents as tensors."""
        # Unlike Excel workbooks, CSV files have no sheets to choose from.
        print(f"reading {filepath}")

        # One mapping provides both the column names and their dtypes.
        column_dtypes = {"feat1": np.float32, "feat2": np.float32, "label": np.int32}
        frame = pandas.read_csv(
            filepath,
            encoding='utf-8',
            header=0,
            index_col=0,
            names=list(column_dtypes),
            dtype=column_dtypes,
            skip_blank_lines=True,
        )
        print(f"the shape of dataframe is {frame.shape}")

        # First two columns are the features, the third is the label.
        self.x = torch.from_numpy(frame.iloc[:, :2].values)
        self.y = torch.from_numpy(frame.iloc[:, 2].values)

    def __len__(self):
        """Return the number of samples (rows)."""
        return self.y.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = (self.x[index], self.y[index])
        return sample

In [None]:
# Smoke test

print("Test for CsvDataset")
csv_dataset = CsvDataset()
csv_dataloader = DataLoader(dataset=csv_dataset, shuffle=True, batch_size=8)
for idx, batch in enumerate(csv_dataloader):
    batch_x, batch_y = batch
    print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
    print(batch_x, batch_y)

## 编写CsvDataset 方法二

In [7]:
class Csv2Dataset(Dataset):
    """Map-style dataset backed by a CSV file, parsed by hand (no pandas).

    Each data row is split on commas: field 0 is ignored, fields 1 and 2 are
    the two float features and field 3 is the integer label. The first line
    of the file is treated as a header and skipped.
    """

    def __init__(self, filepath="data.csv"):
        """Parse ``filepath`` (UTF-8) and keep its contents as tensors.

        Blank lines are skipped, matching ``skip_blank_lines=True`` in the
        pandas-based ``CsvDataset``.

        Raises:
            ValueError: If a feature or label field is not numeric.
            IndexError: If a non-blank data row has fewer than four fields.
        """
        # CSV files have no sheets, so only the path is needed.
        print(f"reading {filepath}")

        feat = []
        label = []
        with open(filepath, encoding='utf-8') as f:
            next(f, None)  # skip the header line; tolerate an empty file
            for line in f:
                line = line.strip()
                if not line:
                    # A trailing or stray empty line is not a sample.
                    continue
                values = line.split(',')
                feat.append([float(v) for v in values[1:3]])
                label.append(int(values[3]))

        # reshape keeps the (N, 2) layout even when there are no data rows.
        feat = np.array(feat, dtype=np.float32).reshape(-1, 2)
        # NOTE(review): labels are stored as float32 here, while ExcelDataset
        # and CsvDataset store them as int32 -- confirm which dtype is wanted.
        label = np.array(label, dtype=np.float32)

        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

In [None]:
# Smoke test

# The banner names the class actually exercised by this cell (Csv2Dataset).
print("Test for Csv2Dataset")
csv2_dataset = Csv2Dataset()
csv2_dataloader = DataLoader(csv2_dataset, batch_size=8, shuffle=True)
for idx, (batch_x, batch_y) in enumerate(csv2_dataloader):
    print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
    print(batch_x, batch_y)