/
token_filters.py
49 lines (38 loc) · 1.57 KB
/
token_filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import re
from typing import Any
from hojichar.core.filter_interface import TokenFilter
from hojichar.core.models import Token
class TokenAddDebagTag(TokenFilter):
    """Append a debug marker to the end of each token.

    NOTE(review): the class name preserves a historical misspelling of
    "debug"; it is public API, so it is kept for backward compatibility.
    """

    def apply(self, token: Token) -> Token:
        """Return *token* with the literal ``<sep>`` tag appended to its text.

        >>> TokenAddDebagTag()("hello")
        'hello<sep>'
        """
        marker = "<sep>"
        token.text = token.text + marker
        return token
class SEOTokenRemover(TokenFilter):
    """Remove one SEO-spam-looking pattern from a token's text.

    Migrated from legacy code. The original author noted the intent of the
    regex was unclear because the pattern is too complex, so the pattern is
    reproduced verbatim here and deliberately left unchanged.
    """

    def __init__(self, min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any) -> None:
        """
        Args:
            min_average_seo_char_length: tokens whose average word length is
                at or below this value are returned unmodified.
        """
        super().__init__(*args, **kwargs)
        # Word separators: space, hyphen, katakana middle dot, comma.
        self.token_split_pat = re.compile(r"\ |-|・|,")
        self.min_average_seo_char_length = min_average_seo_char_length
        # Legacy pattern, kept byte-for-byte; semantics unverified.
        self.replace_pat = re.compile(
            r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"  # noqa
        )

    def apply(self, token: Token) -> Token:
        """Strip the first match of the legacy SEO pattern, if any.

        Short-word tokens (average word length <= threshold) pass through
        untouched; otherwise at most one occurrence is removed.
        """
        words = self.token_split_pat.split(token.text.strip())
        if not words:
            return token
        average_length = len(token.text) / len(words)
        if average_length <= self.min_average_seo_char_length:
            return token
        match = self.replace_pat.search(token.text)
        if match is None:
            return token
        token.text = token.text.replace(match.group(0), "", 1)
        return token