Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions TelegramSearchBot.Search.Test/PhraseQueryProcessorTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using System.Linq;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using TelegramSearchBot.Search.Tool;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Test {
public class PhraseQueryProcessorTests {
[Fact]
public void ExtractPhraseQueries_PreservesDuplicateTermsForPhraseQueries() {
var processor = new PhraseQueryProcessor(new RepeatedTokenizer(), new ExtFieldQueryOptimizer());
using var directory = new RAMDirectory();
var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, new SmartChineseAnalyzer(LuceneVersion.LUCENE_48));
using (var writer = new IndexWriter(directory, config)) {
writer.Commit();
}

using var reader = DirectoryReader.Open(directory);

var (phraseQueries, _) = processor.ExtractPhraseQueries("\"北京 北京\"", reader, 1);

var combinedPhraseQuery = Assert.Single(phraseQueries);
var contentPhrase = Assert.Single(combinedPhraseQuery.Clauses
.Where(static clause => clause.Query is Lucene.Net.Search.PhraseQuery)
.Select(static clause => ( Lucene.Net.Search.PhraseQuery ) clause.Query));

Assert.Equal(2, Regex.Matches(contentPhrase.ToString(), "北京").Count);
}

private sealed class RepeatedTokenizer : ITokenizer {
public TokenizerMetadata Metadata { get; } = new("Repeated", "Test", false);

public IReadOnlyList<string> Tokenize(string text) {
return new[] { "北京" };
}

public IReadOnlyList<string> SafeTokenize(string text) {
return new[] { "北京" };
}

public IReadOnlyList<TokenWithOffset> TokenizeWithOffsets(string text) {
return new[] {
new TokenWithOffset(0, 2, "北京"),
new TokenWithOffset(3, 5, "北京")
};
}
}
}
}
26 changes: 26 additions & 0 deletions TelegramSearchBot.Search.Test/SearchHelperTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using TelegramSearchBot.Search.Exception;
using TelegramSearchBot.Search.Tool;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Test {
public class SearchHelperTests {
Expand Down Expand Up @@ -43,5 +44,30 @@ public void FindBestSnippet_Throws_ForInvalidInput() {
Assert.Throws<InvalidSearchInputException>(() => SearchHelper.FindBestSnippet("北京今天天气不错", " ", 10));
Assert.Throws<InvalidSearchInputException>(() => SearchHelper.FindBestSnippet("北京今天天气不错", "北京", 0));
}

[Fact]
public void FindBestSnippet_UsesProvidedTokenizer() {
var tokenizer = new StubTokenizer();

var snippet = SearchHelper.FindBestSnippet("abcXYZdef", "unused", 3, tokenizer);

Assert.Equal("XYZ", snippet);
}

private sealed class StubTokenizer : ITokenizer {
public TokenizerMetadata Metadata { get; } = new("Stub", "Test", false);

public IReadOnlyList<string> Tokenize(string text) {
return new[] { "XYZ" };
}

public IReadOnlyList<string> SafeTokenize(string text) {
return new[] { "XYZ" };
}

public IReadOnlyList<TokenWithOffset> TokenizeWithOffsets(string text) {
return new[] { new TokenWithOffset(3, 6, "XYZ") };
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,13 @@
<PackageReference Include="xunit.runner.visualstudio" Version="2.8.2" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\TelegramSearchBot.Search\TelegramSearchBot.Search.csproj" />
</ItemGroup>

<ItemGroup>
<Using Include="Xunit" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\TelegramSearchBot.Search\TelegramSearchBot.Search.csproj" />
<ProjectReference Include="..\TelegramSearchBot.Tokenizer\TelegramSearchBot.Tokenizer.csproj" />
</ItemGroup>

</Project>
11 changes: 4 additions & 7 deletions TelegramSearchBot.Search/Service/SimpleSearchService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,30 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Util;
using TelegramSearchBot.Search.Model;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Search.Tool;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Service {
public class SimpleSearchService {
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizer _tokenizer;
private readonly ExtFieldQueryOptimizer _extOptimizer;
private readonly Func<string, Task>? _log;

public SimpleSearchService(UnifiedTokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Func<string, Task>? log) {
        /// <summary>
        /// Creates the service with its tokenizer, Ext-field query optimizer and
        /// an optional asynchronous log sink.
        /// </summary>
        /// <param name="tokenizer">Tokenizer used to split raw queries into keywords.</param>
        /// <param name="extOptimizer">Optimizer applied when building Ext-field queries.</param>
        /// <param name="log">Optional async log callback; null disables logging.</param>
        public SimpleSearchService(ITokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Func<string, Task>? log) {
            _tokenizer = tokenizer;
            _extOptimizer = extOptimizer;
            _log = log;
        }

private List<string> GetKeyWords(string query) {
return _tokenizer.SafeTokenize(query);
return _tokenizer.SafeTokenize(query).ToList();
}

private (Query Query, string[] Terms) ParseSimpleQuery(string query, IndexReader reader) {
_ = reader; // 保留参数以兼容未来扩展
_ = new SmartChineseAnalyzer(LuceneVersion.LUCENE_48); // 与原实现保持一致,虽然当前未使用

var booleanQuery = new BooleanQuery();
var terms = GetKeyWords(query).ToArray();
Expand Down
8 changes: 4 additions & 4 deletions TelegramSearchBot.Search/Service/SyntaxSearchService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,23 @@
using Lucene.Net.Search;
using Lucene.Net.Util;
using TelegramSearchBot.Search.Model;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Search.Tool;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Service {
public class SyntaxSearchService {
private static readonly Regex ExcludeRegex = new Regex(@"-([^\s]+)", RegexOptions.Compiled);

private readonly PhraseQueryProcessor _phraseProcessor;
private readonly FieldSpecificationParser _fieldParser;
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizer _tokenizer;
private readonly ExtFieldQueryOptimizer _extOptimizer;
private readonly Func<string, Task>? _log;

public SyntaxSearchService(
PhraseQueryProcessor phraseProcessor,
FieldSpecificationParser fieldParser,
UnifiedTokenizer tokenizer,
ITokenizer tokenizer,
ExtFieldQueryOptimizer extOptimizer,
Func<string, Task>? log) {
_phraseProcessor = phraseProcessor;
Expand All @@ -34,7 +34,7 @@ public SyntaxSearchService(
}

private List<string> GetKeyWords(string query) {
return _tokenizer.SafeTokenize(query);
return _tokenizer.SafeTokenize(query).ToList();
}

private (BooleanQuery Query, string[] Terms) ParseQuery(string query, IndexReader reader, long groupId) {
Expand Down
1 change: 1 addition & 0 deletions TelegramSearchBot.Search/TelegramSearchBot.Search.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

<ItemGroup>
<ProjectReference Include="..\TelegramSearchBot.Common\TelegramSearchBot.Common.csproj" />
<ProjectReference Include="..\TelegramSearchBot.Tokenizer\TelegramSearchBot.Tokenizer.csproj" />
</ItemGroup>

</Project>
59 changes: 0 additions & 59 deletions TelegramSearchBot.Search/Tokenizer/UnifiedTokenizer.cs

This file was deleted.

8 changes: 4 additions & 4 deletions TelegramSearchBot.Search/Tool/ContentQueryBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
using Lucene.Net.Index;
using Lucene.Net.Search;
using TelegramSearchBot.Search.Interface;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Tool {
internal class ContentQueryBuilder : IQueryBuilder {
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizer _tokenizer;
private readonly Action<string>? _logAction;

public ContentQueryBuilder(UnifiedTokenizer tokenizer, Action<string>? logAction = null) {
        /// <summary>
        /// Creates the builder with its tokenizer and an optional log callback.
        /// </summary>
        /// <param name="tokenizer">Tokenizer used to split queries into terms.</param>
        /// <param name="logAction">Optional synchronous log callback; null disables logging.</param>
        public ContentQueryBuilder(ITokenizer tokenizer, Action<string>? logAction = null) {
            _tokenizer = tokenizer;
            _logAction = logAction;
        }
Expand All @@ -30,7 +30,7 @@ public BooleanQuery BuildQuery(string query, long groupId, IndexReader reader) {
}

public List<string> TokenizeQuery(string query) {
var tokens = _tokenizer.SafeTokenize(query);
var tokens = _tokenizer.SafeTokenize(query).ToList();
if (tokens.Count == 0) {
_logAction?.Invoke($"ContentQueryBuilder: 查询 \"{query}\" 未能生成有效关键词");
}
Expand Down
8 changes: 4 additions & 4 deletions TelegramSearchBot.Search/Tool/ExtQueryBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
using Lucene.Net.Index;
using Lucene.Net.Search;
using TelegramSearchBot.Search.Interface;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Tool {
internal class ExtQueryBuilder : IQueryBuilder {
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizer _tokenizer;
private readonly ExtFieldQueryOptimizer _extOptimizer;
private readonly Action<string>? _logAction;

public ExtQueryBuilder(UnifiedTokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Action<string>? logAction = null) {
public ExtQueryBuilder(ITokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Action<string>? logAction = null) {
_tokenizer = tokenizer;
_extOptimizer = extOptimizer;
_logAction = logAction;
Expand All @@ -23,7 +23,7 @@ public BooleanQuery BuildQuery(string query, long groupId, IndexReader reader) {
}

public List<string> TokenizeQuery(string query) {
var tokens = _tokenizer.SafeTokenize(query);
var tokens = _tokenizer.SafeTokenize(query).ToList();
if (tokens.Count == 0) {
_logAction?.Invoke($"ExtQueryBuilder: 查询 \"{query}\" 未能生成有效关键词");
}
Expand Down
14 changes: 10 additions & 4 deletions TelegramSearchBot.Search/Tool/LuceneManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,27 @@
using TelegramSearchBot.Common;
using TelegramSearchBot.Search.Model;
using TelegramSearchBot.Search.Service;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Tokenizer.Abstractions;
using TelegramSearchBot.Tokenizer.Implementations;

namespace TelegramSearchBot.Search.Tool {
public class LuceneManager {
private readonly Func<string, Task> _log;
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizerFactory _tokenizerFactory;
private readonly ITokenizer _tokenizer;
private readonly ExtFieldQueryOptimizer _extOptimizer;
private readonly PhraseQueryProcessor _phraseProcessor;
private readonly FieldSpecificationParser _fieldParser;
private readonly SimpleSearchService _simpleSearchService;
private readonly SyntaxSearchService _syntaxSearchService;

public LuceneManager(Func<string, Task> log) {
        /// <summary>
        /// Convenience constructor: delegates to the full constructor with no
        /// explicit factory (a default TokenizerFactory is created there) and the
        /// SmartChinese tokenizer type, preserving the original single-argument API.
        /// </summary>
        public LuceneManager(Func<string, Task> log) : this(log, null, TokenizerType.SmartChinese) {
        }

public LuceneManager(Func<string, Task> log, ITokenizerFactory? tokenizerFactory = null, TokenizerType tokenizerType = TokenizerType.SmartChinese) {
_log = log ?? throw new ArgumentNullException(nameof(log));
_tokenizer = new UnifiedTokenizer(LogFireAndForget);
_tokenizerFactory = tokenizerFactory ?? new TokenizerFactory(LogFireAndForget);
_tokenizer = _tokenizerFactory.Create(tokenizerType);
_extOptimizer = new ExtFieldQueryOptimizer(LogFireAndForget);
_phraseProcessor = new PhraseQueryProcessor(_tokenizer, _extOptimizer, LogFireAndForget);
_fieldParser = new FieldSpecificationParser(LogFireAndForget);
Expand Down
10 changes: 6 additions & 4 deletions TelegramSearchBot.Search/Tool/PhraseQueryProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
using System.Text.RegularExpressions;
using Lucene.Net.Index;
using Lucene.Net.Search;
using TelegramSearchBot.Search.Tokenizer;
using TelegramSearchBot.Tokenizer.Abstractions;

namespace TelegramSearchBot.Search.Tool {
public class PhraseQueryProcessor {
private static readonly Regex PhraseRegex = new Regex("\"([^\"]+)\"", RegexOptions.Compiled);
private readonly UnifiedTokenizer _tokenizer;
private readonly ITokenizer _tokenizer;
private readonly ExtFieldQueryOptimizer _extOptimizer;
private readonly Action<string>? _logAction;

public PhraseQueryProcessor(UnifiedTokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Action<string>? logAction = null) {
public PhraseQueryProcessor(ITokenizer tokenizer, ExtFieldQueryOptimizer extOptimizer, Action<string>? logAction = null) {
_tokenizer = tokenizer;
_extOptimizer = extOptimizer;
_logAction = logAction;
Expand All @@ -38,7 +38,9 @@ public BooleanQuery BuildUnifiedPhraseQuery(List<string> terms, IndexReader read
foreach (Match match in PhraseRegex.Matches(query)) {
try {
var phraseText = match.Groups[1].Value;
var terms = _tokenizer.SafeTokenize(phraseText);
var terms = _tokenizer.TokenizeWithOffsets(phraseText)
.Select(static token => token.Term)
.ToList();
if (terms.Count == 0) {
continue;
}
Expand Down
Loading
Loading