生成人工数据

In [13]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic dataset (and everything derived from it) is
# reproducible after a kernel restart.
SEED = 42
rng = np.random.default_rng(SEED)

# Imbalanced binary labels: 900 negatives (0) and 100 positives (1).
df_y_neg = pd.DataFrame(np.zeros(900, dtype=int))
df_y_pos = pd.DataFrame(np.ones(100, dtype=int))

# Stack both label groups, shuffle the rows, and renumber the index to
# obtain the label column y.
df_y = (
    pd.concat([df_y_neg, df_y_pos], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# 1000 samples x 10 features drawn from a normal distribution with
# mean 1 and standard deviation 0.1 (generated independently of the labels).
df_X = pd.DataFrame(rng.normal(1, 0.1, size=(1000, 10)))

# Put features and label side by side and name the columns x0..x9, y.
df = pd.concat([df_X, df_y], axis=1)
df.columns = ['x' + str(n) for n in range(10)] + ['y']


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y
0,1.072914,1.099837,1.031139,0.984588,1.188883,0.971645,1.149173,0.933135,0.978319,1.015989,0
1,0.828991,1.046530,0.964619,1.128966,0.942044,1.089252,1.128058,0.860577,1.024709,1.137343,1
2,0.871800,1.031360,1.133053,0.879880,0.849899,1.018831,0.924027,0.894970,0.984896,1.106476,0
3,0.924564,0.689192,1.037686,1.131724,0.816166,1.100463,0.981523,1.005619,0.969602,1.079917,0
4,1.015973,1.060915,1.119027,0.917860,1.018992,1.156030,1.104015,1.033545,0.811283,0.833920,0
...,...,...,...,...,...,...,...,...,...,...,...
995,1.122848,1.160404,0.917935,0.977286,0.845419,1.148230,1.032497,1.132972,1.033164,1.039677,0
996,1.004475,1.041388,0.956927,1.144904,1.202620,1.082196,1.011851,0.957847,1.012935,1.130519,0
997,0.921991,0.863880,1.149279,0.970762,1.071775,0.809688,0.982295,0.877689,1.070065,0.966137,1
998,1.066763,0.894887,0.978457,1.038874,1.159893,0.909546,1.006461,1.101370,0.916218,1.088156,0


计算权重

In [16]:
# Count the rows of each class by summing a boolean mask over the label column.
num_pos = int((df['y'] == 1).sum())
num_neg = int((df['y'] == 0).sum())

# Per-class weight = (count of both classes together) / (count of that class),
# so the class with fewer rows receives the larger weight.
num_total = num_pos + num_neg
pos_weight = num_total / num_pos
neg_weight = num_total / num_neg

print(pos_weight, neg_weight)


10.0 1.1111111111111112


添加权重列

In [17]:
# Add a column whose value depends on a condition: rows with y == 1 get
# pos_weight, every other row gets neg_weight.
df['y_weight'] = np.where(df['y'] == 1, pos_weight, neg_weight)
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,y_weight
0,1.072914,1.099837,1.031139,0.984588,1.188883,0.971645,1.149173,0.933135,0.978319,1.015989,0,1.111111
1,0.828991,1.046530,0.964619,1.128966,0.942044,1.089252,1.128058,0.860577,1.024709,1.137343,1,10.000000
2,0.871800,1.031360,1.133053,0.879880,0.849899,1.018831,0.924027,0.894970,0.984896,1.106476,0,1.111111
3,0.924564,0.689192,1.037686,1.131724,0.816166,1.100463,0.981523,1.005619,0.969602,1.079917,0,1.111111
4,1.015973,1.060915,1.119027,0.917860,1.018992,1.156030,1.104015,1.033545,0.811283,0.833920,0,1.111111
...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.122848,1.160404,0.917935,0.977286,0.845419,1.148230,1.032497,1.132972,1.033164,1.039677,0,1.111111
996,1.004475,1.041388,0.956927,1.144904,1.202620,1.082196,1.011851,0.957847,1.012935,1.130519,0,1.111111
997,0.921991,0.863880,1.149279,0.970762,1.071775,0.809688,0.982295,0.877689,1.070065,0.966137,1,10.000000
998,1.066763,0.894887,0.978457,1.038874,1.159893,0.909546,1.006461,1.101370,0.916218,1.088156,0,1.111111


定义WeightedRandomSampler抽样器

In [21]:
import torch
# Take the per-row weights out of the dataframe as a NumPy array, then turn
# them into a float tensor.
weight_array = df['y_weight'].to_numpy()
data_y_w = torch.tensor(weight_array, dtype=torch.float)

# How many samples to draw in total; here it equals the number of rows in df.
num_samples = len(df)

# Build the sampler from: the weight tensor, the total number of draws, and
# whether to sample with replacement.
sampler = torch.utils.data.sampler.WeightedRandomSampler(
    weights=data_y_w, num_samples=num_samples, replacement=True
)


构建dataset，建立DataLoader

In [27]:
# Select columns by name rather than by position. Slicing with iloc[:, :-1]
# only works while 'y_weight' happens to be the last column; if it is missing
# or the columns are reordered, the label column would be dropped silently and
# a feature column would be used as the label instead.
label_col = 'y'
weight_col = 'y_weight'
feature_cols = [c for c in df.columns if c not in (label_col, weight_col)]

# Features followed by the label (the weight column is excluded), as a float tensor.
data_features = torch.tensor(
    df[feature_cols + [label_col]].to_numpy(), dtype=torch.float
)

# Feature part: every column of data_features except the trailing label column.
datas = data_features[:, :-1]
# Label part: built straight from the label column as long integers, without
# a round-trip through float.
labels = torch.tensor(df[label_col].to_numpy(), dtype=torch.long)

# TensorDataset packs tensors together (similar to zip): features + labels.
dataset = torch.utils.data.TensorDataset(datas, labels)

# Build the DataLoader; the order in which rows are drawn comes from `sampler`.
batch_size = 64
data_iter = torch.utils.data.DataLoader(
    dataset=dataset, sampler=sampler, batch_size=batch_size
)


查看小批次数据

In [30]:
# Pull a single mini-batch from the loader and look at the feature tensor's shape.
first_batch = next(iter(data_iter))
X, y = first_batch
X.shape

torch.Size([64, 10])

使用如下代码查看数据
num_epochs = 1  # number of passes over data_iter; adjust as needed

for epoch in range(num_epochs):
    for X, y in data_iter:
        # X and y are one mini-batch yielded by data_iter.
        ...  # placeholder: inspect or process the batch here