# <center> 作业四 AFQMC </center>
by Hyr1sky_He

_为了提高任务效率，本ipynb中不再细化每个步骤的操作过程，仅记录重要思路及核心方法，Task题目解答将在实验报告中给出_

In [42]:
import math
import os
import pandas as pd
import torch
import json
import jieba
from torch import nn
from d2l import torch as d2l
from torch.utils.data import Dataset, DataLoader

In [6]:
# read json & vocab
def check_data(path):
    for dir_name, _, file_names in os.walk(path):
        for file_name in file_names:
            print(os.path.join(dir_name, file_name))

check_data('../Assignment4_dataset/data/')

../Assignment4_dataset/data/vocab.txt
../Assignment4_dataset/data/AFQMC数据集/test.json
../Assignment4_dataset/data/AFQMC数据集/dev.json
../Assignment4_dataset/data/AFQMC数据集/train.json


In [12]:
# data processing
def read_data(path):
    sentence_1 = []
    sentence_2 = []
    label = []
    with open (path, 'r', encoding = 'utf-8') as f:
        for line in f.readlines():
            line = json.loads(line)
            sentence_1.append(line['sentence1'])
            sentence_2.append(line['sentence2'])
            label.append(line['label'])
        df = pd.DataFrame({'sentence1': sentence_1, 'sentence2': sentence_2, 'label': label})
    return df

In [19]:
def write_df(path, type):
    df = read_data(path)
    df.to_csv('../Assignment4_dataset/data/AFQMC数据集/' + type + '.csv', index = False)
    return "Generate csv file successfully!"

train_df = read_data('../Assignment4_dataset/data/AFQMC数据集/train.json')
train_df.head()

Unnamed: 0,sentence1,sentence2,label
0,蚂蚁借呗等额还款可以换成先息后本吗,借呗有先息到期还本吗,0
1,蚂蚁花呗说我违约一次,蚂蚁花呗违约行为是什么,0
2,帮我看一下本月花呗账单有没有结清,下月花呗账单,0
3,蚂蚁借呗多长时间综合评估一次,借呗得评估多久,0
4,我的花呗账单是***，还款怎么是***,我的花呗，月结出来说让我还***元，我自己算了一下详细名单我应该还***元,1


In [20]:
write_df('../Assignment4_dataset/data/AFQMC数据集/train.json', 'train')

'Generate csv file successfully!'

In [14]:
dev_df = read_data('../Assignment4_dataset/data/AFQMC数据集/dev.json')
dev_df.head()

Unnamed: 0,sentence1,sentence2,label
0,双十一花呗提额在哪,里可以提花呗额度,0
1,花呗支持高铁票支付吗,为什么友付宝不支持花呗付款,0
2,我的蚂蚁花呗支付金额怎么会有限制,我到支付宝实体店消费用花呗支付受金额限制,1
3,为什么有花呗额度不能分期付款,花呗分期额度不足,0
4,赠品不能设置用花呗付款,怎么不能花呗分期付款,0


In [30]:
write_df('../Assignment4_dataset/data/AFQMC数据集/dev.json', 'dev')

'Generate csv file successfully!'

In [55]:
(train_df.sentence1.str.len() + train_df.sentence2.str.len()).describe()

count    34334.000000
mean        26.732597
std         10.405410
min         10.000000
25%         20.000000
50%         25.000000
75%         30.000000
max        157.000000
dtype: float64

In [63]:
with open('../Assignment4_dataset/data/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = [line.strip() for line in f]

# mapping to id
char_to_id = {char: idx for idx, char in enumerate(vocab)}

# tokenize_sample
sentence_11 = "蚂蚁借呗等额还款可以换成先息后本吗"
sentence_22 = "借呗有先息到期还本吗"
sentence11_ids = [char_to_id.get(char, char_to_id['[MASK]']) for char in sentence_11]
print(sentence11_ids)
sentence22_ids = [char_to_id.get(char, char_to_id['[MASK]']) for char in sentence_22]
print(sentence22_ids)
sentence_ids = [char_to_id['[CLS]']] + sentence11_ids + [char_to_id['[SEP]']] + sentence22_ids + [char_to_id['[SEP]']]
print(sentence_ids)

[3802, 2975, 1051, 4947, 43, 852, 201, 699, 48, 22, 806, 33, 254, 399, 49, 89, 1114]
[1051, 4947, 9, 254, 399, 45, 195, 201, 89, 1114]
[1, 3802, 2975, 1051, 4947, 43, 852, 201, 699, 48, 22, 806, 33, 254, 399, 49, 89, 1114, 2, 1051, 4947, 9, 254, 399, 45, 195, 201, 89, 1114, 2]


In [73]:
BATCH_SIZE = 64
num_hiddens = 256
num_layers = 2
num_heads = 4
dropout = 0.1
max_len = 160
num_epochs = 20
lr = 0.001

In [85]:
class TextDataset(Dataset):
    def __init__(self, dataframe, char_to_id, max_length=160):
        self.data = dataframe
        self.char_to_id = char_to_id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence1 = self.data.iloc[index]['sentence1']
        sentence2 = self.data.iloc[index]['sentence2']
        label = self.data.iloc[index]['label']

        combined_tokens = ['[CLS]'] + [char for char in sentence1] + ['[SEP]'] + [char for char in sentence2] + ['[SEP]']
        segment_ids = [0] * (len(sentence1) + 2) + [1] * (len(sentence2) + 1)

        combined_ids = [self.char_to_id.get(char, self.char_to_id['[MASK]']) for char in combined_tokens]
        combined_ids = torch.nn.functional.pad(torch.tensor(combined_ids), (0, self.max_length - len(combined_ids)))
        segment_ids = torch.nn.functional.pad(torch.tensor(segment_ids), (0, self.max_length - len(segment_ids)))

        return combined_ids, segment_ids, label

train_dataset = TextDataset(train_df, char_to_id)
dev_dataset = TextDataset(dev_df, char_to_id)

for combined_ids, segment_ids, label in train_dataset:
    print(combined_ids)
    print(segment_ids)
    print(label)
    break

tensor([   1, 3802, 2975, 1051, 4947,   43,  852,  201,  699,   48,   22,  806,
          33,  254,  399,   49,   89, 1114,    2, 1051, 4947,    9,  254,  399,
          45,  195,  201,   89, 1114,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [75]:
"""
class TextDataset(Dataset):
    def __init__(self, dataframe, char_to_id, max_len = 160):
        self.data = dataframe
        self.char_to_id = char_to_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence1 = self.data.iloc[index]['sentence1']
        sentence2 = self.data.iloc[index]['sentence2']
        label = self.data.iloc[index]['label']

        combined_tokens = ['[CLS]'] + [char for char in sentence1] + ['[SEP]'] + [char for char in sentence2] + ['[SEP]']
        segment_ids = [0] * (len(sentence1) + 2) + [1] * (len(sentence2) + 1)
        combined_ids = [self.char_to_id.get(char, self.char_to_id['[MASK]']) for char in combined_tokens]

        return combined_ids, segment_ids, label

def custom_collate_fn(batch):
    combined_ids, segment_ids, label = zip(*batch)
    return combined_ids, segment_ids, label

train_dataset = TextDataset(train_df, char_to_id)
dev_dataset = TextDataset(dev_df, char_to_id)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn, drop_last=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn, drop_last=True)

for combined_ids, segment_ids, label in train_dataloader:
    print(combined_ids)
    print(segment_ids)
    print(label)
    break
"""

"\nclass TextDataset(Dataset):\n    def __init__(self, dataframe, char_to_id， max_len = 160):\n        self.data = dataframe\n        self.char_to_id = char_to_id\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, index):\n        sentence1 = self.data.iloc[index]['sentence1']\n        sentence2 = self.data.iloc[index]['sentence2']\n        label = self.data.iloc[index]['label']\n\n        combined_tokens = ['[CLS]'] + [char for char in sentence1] + ['[SEP]'] + [char for char in sentence2] + ['[SEP]']\n        segment_ids = [0] * (len(sentence1) + 2) + [1] * (len(sentence2) + 1)\n        combined_ids = [self.char_to_id.get(char, self.char_to_id['[MASK]']) for char in combined_tokens]\n\n        return combined_ids, segment_ids, label\n\ndef custom_collate_fn(batch):\n    combined_ids, segment_ids, label = zip(*batch)\n    return combined_ids, segment_ids, label\n\ntrain_dataset = TextDataset(train_df, char_to_id)\ndev_dataset = TextDataset(dev_df, cha

In [78]:
class PositionalEncoding(nn.Module):
    """Positional Encoding"""
    def __init__(self, num_hiddens, dropout, max_len=160):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # create a long P
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [86]:
embeddings = nn.Embedding(len(vocab), num_hiddens)
PE = PositionalEncoding(num_hiddens, dropout, max_len)
X = embeddings(torch.tensor(sentence_ids))
Y = PE(X)
print(X.shape, Y.shape)

RuntimeError: The size of tensor a (30) must match the size of tensor b (160) at non-singleton dimension 1