In [None]:
import re

def check_srt_format(filename):
    """Check that a file follows a strict four-lines-per-cue SRT layout.

    Each cue must occupy exactly four consecutive lines:

    1. a sequence number, counting up from 1;
    2. a timecode line ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` whose start is
       strictly earlier than its end;
    3. a single, non-empty line of subtitle text;
    4. a blank separator line.

    Cues with multi-line text do not fit this layout and are reported as
    errors. The verdict (and every problem found) is printed to stdout;
    the function returns nothing.

    Args:
        filename: Path of the SRT file to check.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise make the first sequence number look non-numeric.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    # Timecode line; the groups capture the start and end timestamps.
    timecode_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})'
    )
    error_messages = []

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        slot = i % 4  # position of this line inside its four-line cue

        if slot == 0:
            # Sequence number. isdecimal() only accepts characters that
            # int() can parse (isdigit() also accepts e.g. superscripts).
            expected = i // 4 + 1
            if not line.isdecimal() or int(line) != expected:
                error_messages.append(f"Line {i+1}: Expected sequence number {expected}, found '{line}'.")

        elif slot == 1:
            # Timecode format, then start/end ordering.
            timecode = timecode_pattern.match(line)
            if timecode is None:
                error_messages.append(f"Line {i+1}: Timecode format is incorrect.")
            else:
                # Timestamps are fixed-width and zero-padded, so comparing
                # them as strings orders them chronologically.
                start, end = timecode.groups()
                if start >= end:
                    error_messages.append(f"Line {i+1}: Start time is not earlier than end time.")

        elif slot == 2:
            # Subtitle text must be present.
            if line == '':
                error_messages.append(f"Line {i+1}: Missing subtitle text.")

        elif line != '':
            # Fourth line of the cue must be the blank separator.
            error_messages.append(f"Line {i+1}: Expected blank line.")

    # The file must end with a blank line; an empty file has no last line.
    if not lines:
        error_messages.append("The file is empty.")
    elif lines[-1].strip() != '':
        error_messages.append("The last line should be a blank line.")

    if not error_messages:
        print("The file is in the correct SRT format.")
    else:
        print("Errors found in SRT format:")
        for message in error_messages:
            print(message)

# Call the checker with the subtitle file name
check_srt_format("./IPZZ-218_cn.srt")


In [39]:
import re

def fix_srt_format(filename):
    """Rewrite an SRT file into a clean number/timecode/text/blank layout.

    The input is processed line by line:

    * a line made only of digits is treated as a cue number and replaced
      by a fresh running number starting at 1;
    * a line starting with ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` is kept;
    * any other non-empty line is kept as subtitle text and followed by
      exactly one blank line;
    * a blank line directly after a timecode is kept as (empty) subtitle
      text; every other blank line is dropped.

    Note that each text line is closed with its own blank line, so cues
    with multi-line text are split, and a text line consisting only of
    digits is taken for a cue number.

    The result is written next to the input file, with ``fixed_``
    prepended to the file name, and the output path is printed.

    Args:
        filename: Path of the SRT file to fix.

    Returns:
        The path of the fixed file that was written.
    """
    import os  # local import: only needed to build the output path

    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise hide the first cue number from isdigit().
    with open(filename, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    # Matches a timecode at the start of a line.
    timecode_pattern = re.compile(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')
    fixed_lines = []
    sequence_number = 1
    # Tracked explicitly so the first line never "looks back" at the last
    # line of the file through a negative index.
    previous_was_timecode = False

    for raw_line in lines:
        line = raw_line.rstrip()  # drop the newline and trailing whitespace

        if line.isdigit():
            # Cue number: renumber consecutively.
            fixed_lines.append(f"{sequence_number}\n")
            sequence_number += 1
        elif timecode_pattern.match(line):
            fixed_lines.append(f"{line}\n")
        elif line != '' or previous_was_timecode:
            # Subtitle text (possibly empty right after a timecode),
            # followed by the blank line that closes the cue.
            fixed_lines.append(f"{line}\n")
            fixed_lines.append("\n")
        # Any other blank line is redundant and is dropped.

        previous_was_timecode = timecode_pattern.match(raw_line.strip()) is not None

    # Make sure the output ends with a blank line (nothing to do when the
    # input produced no lines at all).
    if fixed_lines and fixed_lines[-1].strip() != '':
        fixed_lines.append("\n")

    # Prefix the file name, not the whole path, so inputs that include a
    # directory (e.g. "./movie.srt") still yield a valid output path.
    directory, name = os.path.split(filename)
    output_filename = os.path.join(directory, f"fixed_{name}")

    with open(output_filename, 'w', encoding='UTF-8') as file:
        file.writelines(fixed_lines)

    print(f"Fixed file saved as {output_filename}")
    return output_filename

# Input file name; the fixed copy is written with a "fixed_" prefix
input_filename = "IPZZ-218_cn_1.srt"

# Fix the file, save the corrected copy, then validate that copy
output_filename = fix_srt_format(input_filename)
check_srt_format(f'fixed_{input_filename}')

1: 1
2: 00:00:00,000 --> 00:00:11,500
3: 音乐
4: 
5: 2
6: 00:00:11,500 --> 00:00:14,500
7: 音乐渐渐消失
8: 
9: 3
10: 00:00:14,500 --> 00:00:16,500
11: 早上好！
12: 
13: 4
14: 00:00:16,500 --> 00:00:17,500
15: 早上好！
16: 
17: 5
18: 00:00:17,500 --> 00:00:20,500
19: 你不应该在这潮湿的地方！
20: 
21: 6
22: 00:00:20,500 --> 00:00:22,500
23: 没关系。
24: 
25: 7
26: 00:00:22,500 --> 00:00:23,500
27: 你是蓝娜吗？
28: 
29: 8
30: 00:00:23,500 --> 00:00:24,500
31: 是的。
32: 
33: 9
34: 00:00:24,500 --> 00:00:25,500
35: 很高兴见到你。
36: 
37: 10
38: 00:00:25,500 --> 00:00:26,500
39: 很高兴见到你。
40: 
41: 11
42: 00:00:26,500 --> 00:00:29,500
43: 我是你的导演，伊娜·巴特。
44: 
45: 12
46: 00:00:29,500 --> 00:00:31,500
47: 很高兴见到你。
48: 
49: 13
50: 00:00:31,500 --> 00:00:33,500
51: 摄像机在这里。
52: 
53: 14
54: 00:00:33,500 --> 00:00:35,500
55: 我们要开始拍摄了。
56: 
57: 15
58: 00:00:35,500 --> 00:00:36,500
59: 好的。
60: 
61: 16
62: 00:00:36,500 --> 00:00:37,500
63: 你紧张吗？
64: 
65: 17
66: 00:00:37,500 --> 00:00:38,500
67: 有点。
68: 
69: 18
70: 00:00:38,500 --> 00:00:39,500
71: 有点？
