Skip to content

Commit

Permalink
Merge pull request #125 from aquatiko/fix-punctuation
Browse files: browse the repository at this point in the history
Fixed strip_punctuation regex
  • Loading branch information
aviks committed Jan 21, 2019
2 parents 18aa868 + 76d922d commit f33d1d0
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/preprocessing.jl
Expand Up @@ -340,7 +340,7 @@ function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set
if (flags & strip_non_letters) > 0
push!(patterns, "[^a-zA-Z\\s]")
else
((flags & strip_punctuation) > 0) && push!(patterns, "[,;:.!?()-\\\\]+")
((flags & strip_punctuation) > 0) && push!(patterns, "[-.,:;,!?'\"\\[\\]\\(\\)\\{\\}|\\`#\$%@^&*_+<>]+")
((flags & strip_numbers) > 0) && push!(patterns, "\\d+")
end
if (flags & strip_articles) > 0
Expand Down
11 changes: 11 additions & 0 deletions test/preprocessing.jl
Expand Up @@ -90,4 +90,15 @@
crps = Corpus(StringDocument.(sample_texts))
@test isempty(setdiff(frequent_terms(crps),["string","is"]))
@test isempty(setdiff(sparse_terms(crps,0.3),["!"]))

# Test that strip_punctuation removes every punctuation character matched by its regex
str = Document("These punctuations should be removed [-.,:;,!?'\"[](){}|\`#\$%@^&*_+<>")
answer = Document("These punctuations should be removed ")
prepare!(str, strip_punctuation)
@test isequal(str.text, answer.text)

str = Document("Intel(tm) Core i5-3300k, is a geat CPU! ")
answer = Document("Intel tm Core i5 3300k is a geat CPU ") #tests old implementation
prepare!(str, strip_punctuation)
@test isequal(str.text, answer.text)
end

0 comments on commit f33d1d0

Please sign in to comment.