In [1]:
import re

def parse_chat_record(record):
    """Parse one chat-log line into ``(speaker, time, content)``.

    A line is expected to look like::

        玖尔巴奇 (2021-05-27 22:37:05):我来啦！

    i.e. ``<speaker> (<YYYY-MM-DD HH:MM:SS>):<content>``.

    The whole line is matched with a single pattern anchored at its start, so
    a speaker name containing spaces and a message containing ':' (URLs,
    clock times, ...) are both returned intact instead of being cut at the
    first space / last colon.

    Args:
        record: One line of the chat log, without the trailing newline.

    Returns:
        A ``(speaker, time, content)`` tuple of strings, where ``time`` is the
        timestamp text exactly as it appears in the line; or ``None`` when the
        line does not have the header shape above or its content is empty.
    """
    header = re.match(
        # speaker (non-greedy, stops at the first timestamp header),
        # the parenthesised timestamp, a colon, then the rest of the line.
        r'(.+?)\s*\((\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\):(.*)',
        record,
        re.DOTALL,
    )
    if header is None:
        return None
    speaker, time, content = header.groups()
    # A header with no message text is not a usable record.
    if not content:
        return None
    return speaker, time, content

def parse_chat_records(filename, encoding='utf-8'):
    """Read a chat log file and return the records parsed from its lines.

    Each line is stripped of surrounding whitespace and passed to
    ``parse_chat_record``; lines for which it returns a falsy value are
    skipped.

    Args:
        filename: Path of the chat log to read.
        encoding: Text encoding of the file. Given explicitly so that
            decoding does not depend on the platform's locale default.

    Returns:
        A list of the values returned by ``parse_chat_record``, in file order.
    """
    with open(filename, 'r', encoding=encoding) as file:
        candidates = (parse_chat_record(line.strip()) for line in file)
        return [parsed for parsed in candidates if parsed]

# Load the chat log; `records` holds whatever parse_chat_records returns for it.
filename = 'chat.txt'
records = parse_chat_records(filename)

In [10]:
# Group consecutive records from the same speaker into chunks.
# Each record is indexed as record[0] = speaker, record[2] = content.

# Messages consisting solely of one of these placeholders carry no text
# (sticker, video/voice call, picture) and are left out of the chunks.
SKIPPED_CONTENTS = {'[表情]', '[视频/语音通话]', '[图片]'}

chunks = []
chunk = []
for record in records:
    # A change of speaker closes the current chunk. This check runs before
    # the placeholder filter, so a placeholder-only message from the other
    # speaker still ends the current chunk.
    if chunk and record[0] != chunk[-1][0]:
        chunks.append(chunk)
        chunk = []
    if record[2] in SKIPPED_CONTENTS:
        continue
    chunk.append(record)
# Flush the chunk still being built, but never emit an empty one (happens
# when there are no records, or only skipped records after the last flush).
if chunk:
    chunks.append(chunk)

In [19]:
from datetime import datetime
# Within each chunk, start a new chunk whenever a record is more than five
# hours later than the record before it (e.g. 2021-05-27 22:37:05 following
# 2021-05-27 17:37:04). A long silence in the middle of a chunk means the
# later messages should not be joined with the earlier ones into one
# training sample.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
MAX_GAP_SECONDS = 5 * 60 * 60

new_chunks = []
for chunk in chunks:
    new_chunk = []
    previous_time = None
    for record in chunk:
        # record[1] is the timestamp text; parse it once per record.
        current_time = datetime.strptime(record[1], TIME_FORMAT)
        # total_seconds() measures the whole gap. timedelta.seconds would
        # discard the days part, making a 25-hour silence look like 1 hour.
        gap_too_long = (
            previous_time is not None
            and (current_time - previous_time).total_seconds() > MAX_GAP_SECONDS
        )
        if gap_too_long:
            new_chunks.append(new_chunk)
            new_chunk = []
        # The record after a gap opens the next chunk instead of being lost.
        new_chunk.append(record)
        previous_time = current_time
    new_chunks.append(new_chunk)
# Drop empty chunks.
new_chunks = [chunk for chunk in new_chunks if chunk]

In [22]:
# Build the samples: every chunk whose first record is by 玖尔巴奇 and whose
# immediate successor starts with a record by 及🐔 becomes one pair. The
# message texts (record[2]) of each chunk are joined with '。'.
dataset = []
for prompt_chunk, reply_chunk in zip(new_chunks, new_chunks[1:]):
    if prompt_chunk[0][0] != '玖尔巴奇' or reply_chunk[0][0] != '及🐔':
        continue
    prompt_text = '。'.join([message[2] for message in prompt_chunk])
    reply_text = '。'.join([message[2] for message in reply_chunk])
    dataset.append({"content": prompt_text, "summary": reply_text})

In [24]:
import json
import random

# Fixed seed so that re-running this cell reproduces the same split.
SPLIT_SEED = 42
# Share of the samples that goes to the dev set (dev:train = 1:99).
DEV_FRACTION = 0.01

# Save the full dataset. The JSON is written with ensure_ascii=False, so the
# files must be opened as UTF-8 explicitly; the locale default encoding may
# not be able to represent the CJK text and emoji.
with open('dataset.json', 'w', encoding='utf-8') as file:
    json.dump(dataset, file, ensure_ascii=False, indent=4)

# Shuffle a copy so `dataset` keeps the order that was just saved and the
# cell gives the same result every time it is run.
shuffled = list(dataset)
random.Random(SPLIT_SEED).shuffle(shuffled)

# NOTE: int() truncates, so with fewer than 100 samples the dev set is empty.
dev_size = int(len(shuffled) * DEV_FRACTION)
testset = shuffled[:dev_size]
trainset = shuffled[dev_size:]

# Save the train and dev splits.
with open('train.json', 'w', encoding='utf-8') as file:
    json.dump(trainset, file, ensure_ascii=False, indent=4)
with open('dev.json', 'w', encoding='utf-8') as file:
    json.dump(testset, file, ensure_ascii=False, indent=4)