### 参考文献结构信息格式化

根据《GB/T 7714-2015 信息与文献 参考文献著录规则》的要求实现，可能存在与标准不完全一致的地方，可以根据具体要求进行更改。

将结构化信息格式化为参考文献字符串的方法，主要解决部分信息空缺时格式组成随之变化的问题。

`formats.yaml` 中包含了各文献类型的格式模板和一些示例。

In [40]:
import re
from collections import UserDict


class DefaultDict(UserDict):
    """Mapping that echoes an unknown key back as a ``{key}`` placeholder.

    When used with ``str.format_map``, a field whose key is absent is
    rendered as its own placeholder text instead of raising ``KeyError``.
    Missing keys are not inserted into the mapping.
    """

    def __missing__(self, key):
        # Rebuild the replacement field text for the key that was not found.
        return "{" + format(key) + "}"


def wrap(fmt, data, return_str=""):
    """Format ``fmt`` with ``data``, returning a fallback instead of raising.

    Args:
        fmt: Template string in ``str.format`` syntax.
        data: Mapping passed to ``str.format_map``.
        return_str: Value returned when formatting fails. Pass ``None`` to
            get ``fmt`` back unchanged instead.

    Returns:
        The formatted string on success; otherwise ``return_str``, or
        ``fmt`` itself when ``return_str`` is ``None``.
    """
    try:
        return fmt.format_map(data)
    except Exception:
        # Deliberately best-effort: a missing key or malformed field means
        # the fragment cannot be rendered. Catching Exception (rather than a
        # bare ``except``) lets KeyboardInterrupt and SystemExit propagate.
        return fmt if return_str is None else return_str


class ReplaceFunc:
    """Callable replacement for ``re.sub`` that renders one captured fragment.

    ``flag`` starts as True and is set to True again on every call, so a
    caller can clear it before a substitution pass and read it afterwards to
    learn whether any match was replaced during that pass.
    """

    def __init__(self, data):
        self.flag = True
        self.data = data

    def __call__(self, match):
        # Record that this pass performed at least one replacement.
        self.flag = True

        fragment = match.group(1)
        # Turn the entity placeholders back into literal angle brackets
        # before the fragment is formatted.
        # NOTE(review): the result is inserted back into the template, which
        # wrapFormat scans again; the restored "<"/">" may then be treated as
        # group delimiters. Confirm this is intended.
        for entity, bracket in (("&lt;", "<"), ("&gt;", ">")):
            fragment = fragment.replace(entity, bracket)

        return wrap(fragment, self.data)


def wrapFormat(fmt, data):
    """Render a reference template, dropping groups whose fields are missing.

    Template syntax, as implemented below:
      * ``{key}`` is a ``str.format`` field looked up in ``data``.
      * ``<...>`` marks an optional group. It is rendered through
        ``ReplaceFunc`` and collapses to an empty string when formatting it
        fails (for example because a field inside it has no value). Groups
        may nest; they are resolved innermost-first.
      * A backslash in front of ``<`` or ``>`` marks a literal angle bracket.

    NOTE: this relies on the three-argument ``wrap(fmt, data, return_str)``.
    The "Old Version" cell later in the notebook rebinds ``wrap`` to a
    two-argument function, so re-run the defining cell after executing it.

    Args:
        fmt: Template string.
        data: Mapping of field name to value. Entries with a falsy value
            (empty string, None, 0, ...) are treated as missing.

    Returns:
        The rendered string. A field outside any group that has no value is
        left in place as its ``{key}`` placeholder.
    """
    # Hide escaped brackets behind entities so the group regex skips them.
    # The backslashes are written explicitly: "\<" is an invalid escape
    # sequence in a normal string literal and triggers a warning.
    fmt = fmt.replace("\\<", "&lt;").replace("\\>", "&gt;")
    data = {k: v for k, v in data.items() if v}
    fun = ReplaceFunc(data)

    # Each pass replaces the groups that contain no further brackets; keep
    # going until a pass performs no replacement (ReplaceFunc sets the flag
    # whenever it is called).
    while fun.flag:
        fun.flag = False
        fmt = re.sub(r"<([^<>]*)>", fun, fmt)

    # Fill the remaining fields; DefaultDict keeps unknown ones as "{key}",
    # and return_str=None keeps the template as-is if formatting fails.
    fmt = wrap(fmt, DefaultDict(data), None)

    # Restore the literal angle brackets hidden at the start.
    return fmt.replace("&lt;", "<").replace("&gt;", ">")


In [42]:
import yaml


def parseYAML(file):
    """Load the YAML document stored at path ``file`` (read as UTF-8).

    Returns whatever object ``yaml.load`` builds from the document.
    """
    with open(file, encoding="utf-8") as stream:
        # NOTE(review): FullLoader is meant for trusted input; consider
        # yaml.safe_load if this is ever pointed at files from outside.
        return yaml.load(stream, Loader=yaml.FullLoader)


# Load the templates together with the bundled examples.
formats = parseYAML("./formats.yaml")

# Document type to try out.
fmtType = "连续出版物"

# Render the last example record of that type, and fetch the reference
# string stored with it under the "原例" key.
out = wrapFormat(formats[fmtType], formats["example"][fmtType][-1])
ans = formats["example"][fmtType][-1]["原例"]

# Print both, one per line, so they can be compared by eye.
print(out, ans, sep="\n")



# formats["example"][fmtType][-1]

中华医学会湖北分会.临床内科杂志[J].1984,1(1)-.武汉:中华医学会湖北分会,1984-.
中华医学会湖北分会.临床内科杂志[J].1984,1(1)-.武汉:中华医学会湖北分会,1984-.


In [28]:
# Re-read formats.yaml so `formats` reflects the file's current content.
formats = parseYAML("./formats.yaml")



In [43]:
import json

# Read the YAML file, then split it: everything under "example" goes to its
# own object, and the rest (the templates) stays in `formats`.
formats = parseYAML("./formats.yaml")
example = formats.pop("example")

# Write the templates; ensure_ascii=False keeps non-ASCII text readable.
with open("./fmt7714.json", "w", encoding="utf8") as f:
    json.dump(formats, f, ensure_ascii=False)

# Write the examples the same way.
with open("./example.json", "w", encoding="utf8") as f:
    json.dump(example, f, ensure_ascii=False)

    # json.dump(formats, f, ensure_ascii=False)


# formats["example"]["连续出版物析出文献"]

# formats


### 杂项

In [None]:
# Punctuation format: maps full-width punctuation marks to ASCII text.
# NOTE(review): not referenced by the code visible in this notebook;
# presumably a lookup table for normalizing punctuation - confirm.
marks = {
    "．": ". ",  # full-width full stop -> period followed by a space
    "，": ", ",  # full-width comma -> comma followed by a space
    "［": "[",  # full-width left square bracket
    "］": "]",  # full-width right square bracket
    "—": "-",  # long dash -> hyphen
}
# http://wjk.usst.edu.cn/

### Old Version: 值和格式的二元组

In [None]:
# (content, fmt) -> fmt.format(content) if fmt else ""

def wrap(content: str, wrapper: str):
    """Insert ``content`` into ``wrapper``, or return ``""`` for no content.

    Args:
        content: Text to insert; any falsy value counts as "no content".
        wrapper: ``str.format`` template with one positional field.

    Returns:
        ``wrapper.format(content)``, or an empty string when ``content`` is
        falsy.
    """
    if not content:
        return ""
    return wrapper.format(content)


def wrap_with_fmt(fmt: list, data=None):
    """Recursively render a nested format description.

    A two-item ``fmt`` is a leaf ``[key_or_content, wrapper]``: when ``data``
    is truthy the first item is looked up in ``data`` (missing keys give
    ``""``), otherwise the first item itself is used as the content. Any
    other length is a group: every item except the last is rendered
    recursively, the results are concatenated, and the last item is the
    wrapper applied to that concatenation.

    Args:
        fmt: Nested format description as outlined above.
        data: Optional mapping supplying the leaf contents.

    Returns:
        The rendered string; ``wrap`` turns empty content into ``""``.
    """
    if len(fmt) != 2:
        joined = "".join(wrap_with_fmt(child, data) for child in fmt[:-1])
        return wrap(joined, fmt[-1])

    head, wrapper = fmt[0], fmt[1]
    content = data.get(head, "") if data else head
    return wrap(content, wrapper)


In [None]:
# format_map accepts a dict, so the whole dict can be used directly as the content
# Sample template: a tuple of (content template, list of wrapper pieces).

p = ("{a}", ["[", "{}", "/{b}", "]"])


def wrapfmt(s, data):
    """Render a nested template built from strings, lists and tuples.

    * ``str``: formatted with ``data`` as keyword fields. A positional
      ``{}`` field is replaced by the literal text ``{}`` so it survives for
      a later ``format`` call. Any formatting failure (for example a missing
      key) gives ``""``.
    * ``list``: every item is rendered and the results are concatenated.
    * ``tuple``: ``(content, wrapper)``; both parts are rendered and then
      combined with ``wrap``.

    Args:
        s: Template node (``str``, ``list`` or ``tuple``).
        data: Mapping with string keys supplying the named fields.

    Returns:
        The rendered string, or ``None`` when ``s`` is of another type.
    """
    if isinstance(s, tuple):
        return wrap(wrapfmt(s[0], data), wrapfmt(s[1], data))
    if isinstance(s, list):
        return "".join(wrapfmt(item, data) for item in s)
    if isinstance(s, str):
        try:
            return s.format("{}", **data)
        except Exception:
            # Best-effort: an unrenderable piece is dropped. Exception (not
            # a bare ``except``) lets KeyboardInterrupt/SystemExit through.
            return ""
    return None


# wrapfmt(p, {"a": "1", "c": "23"})


### Test: 正则表达式分析

In [10]:
import re

# Pattern for one numbered reference line:
#   group 1      - the bracketed index, e.g. "[12]"
#   groups 2-5   - up to four optional dot-terminated fields that do not
#                  start with "http"
#   then either  - an optional URL (group 6) followed by a "DOI..." field
#                  (group 7), or a URL (group 8) ending at the final dot.
# Written as a raw string: sequences such as "\[" and "\d" are invalid
# escapes in a normal string literal and trigger a warning. The pattern text
# itself is unchanged.
regex = r"(\[\d+\])\s+(?:(?!http)(.+?)\.)?(?:(?!http)(.+?)\.)?(?:(?!http)(.+?)\.)?(?:(?!http)(.+?)\.)?(?:(?:(?:(https?.*)\.(?=DOI))?(DOI.*)\.)|(?:(https?.*)\.$))"

# NOTE(review): this needs `formats` to still contain the "example" key; the
# JSON export cell deletes it, so reload formats.yaml first if that cell ran.
a = re.match(regex, formats["example"]["专著"][0]["原例"])

# Display the match object (notebook cell output).
a
# a.groups()

