Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,512 changes: 3,108 additions & 404 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ tracing-appender = "0.2"
serde_yaml = "0.9"
walkdir = "2"
globset = "0.4"
tantivy = "0.25"
lindera = { version = "2.3", features = ["embed-ipadic"] }
lindera-tantivy = { version = "2.0", features = ["embed-ipadic"] }

[dev-dependencies]
tempfile = "3"
Expand Down
3 changes: 3 additions & 0 deletions src/indexer/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pub mod reader;
pub mod schema;
pub mod writer;
141 changes: 141 additions & 0 deletions src/indexer/reader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
use std::fmt;
use std::path::Path;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::Value;
use tantivy::{Index, ReloadPolicy};

use crate::indexer::schema::IndexSchema;

/// Errors returned when opening the index for reading or running a search.
#[derive(Debug)]
pub enum ReaderError {
/// An error reported by tantivy (opening the index, building a reader, searching, loading a document).
Tantivy(tantivy::TantivyError),
/// The query string was rejected by tantivy's query parser.
Query(tantivy::query::QueryParserError),
}

impl fmt::Display for ReaderError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ReaderError::Tantivy(e) => write!(f, "Tantivy error: {e}"),
ReaderError::Query(e) => write!(f, "Query parse error: {e}"),
}
}
}

impl std::error::Error for ReaderError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
ReaderError::Tantivy(e) => Some(e),
ReaderError::Query(e) => Some(e),
}
}
}

impl From<tantivy::TantivyError> for ReaderError {
    /// Wraps a tantivy error, which lets `?` convert it into a [`ReaderError`].
    fn from(err: tantivy::TantivyError) -> Self {
        Self::Tantivy(err)
    }
}

impl From<tantivy::query::QueryParserError> for ReaderError {
    /// Wraps a query-parser error, which lets `?` convert it into a [`ReaderError`].
    fn from(err: tantivy::query::QueryParserError) -> Self {
        Self::Query(err)
    }
}

/// One hit returned by `IndexReaderWrapper::search`.
///
/// The text and numeric fields are copied from the stored fields of the matching
/// document; a field that is absent from the document is reported as `""` or `0`.
#[derive(Debug, Clone)]
pub struct SearchResult {
/// Stored `path` field of the document.
pub path: String,
/// Stored `heading` field of the document.
pub heading: String,
/// Stored `body` field of the document.
pub body: String,
/// Stored `tags` field of the document.
pub tags: String,
/// Stored `heading_level` field (presumably the depth of the section heading — confirm with the parser).
pub heading_level: u64,
/// Stored `line_start` field (presumably the source line where the section begins — confirm with the parser).
pub line_start: u64,
/// Score assigned to this document by the `TopDocs` collector.
pub score: f32,
}

/// Read-side handle that pairs a tantivy `Index` with the field handles of `IndexSchema`.
pub struct IndexReaderWrapper {
/// Index that searches are executed against.
index: Index,
/// Field handles used to build queries and to read stored values from hits.
schema: IndexSchema,
}

impl IndexReaderWrapper {
    /// Opens the index stored on disk in `index_dir`.
    ///
    /// The Japanese tokenizer is registered on the opened index so that queries
    /// against the tokenized fields can be analyzed.
    ///
    /// NOTE(review): the field handles come from a freshly built [`IndexSchema`],
    /// not from the schema persisted in the index; this assumes the index on disk
    /// was created with the same schema.
    ///
    /// # Errors
    /// Returns [`ReaderError::Tantivy`] if the index cannot be opened.
    pub fn open(index_dir: &Path) -> Result<Self, ReaderError> {
        let schema = IndexSchema::new();
        let index = Index::open_in_dir(index_dir)?;
        IndexSchema::register_tokenizer(&index);
        Ok(Self { index, schema })
    }

    /// Wraps an already constructed `Index` (intended for tests).
    ///
    /// The Japanese tokenizer is registered on `index` as part of the call.
    pub fn from_index(index: Index) -> Self {
        let schema = IndexSchema::new();
        IndexSchema::register_tokenizer(&index);
        Self { index, schema }
    }

    /// Runs `query_str` against the `heading`, `body` and `tags` fields and returns
    /// at most `limit` hits, best score first.
    ///
    /// A `limit` of `0` yields an empty result (the query is still parsed, so a
    /// malformed query is reported either way).
    ///
    /// # Errors
    /// * [`ReaderError::Query`] if `query_str` cannot be parsed.
    /// * [`ReaderError::Tantivy`] if the reader cannot be created, the search fails,
    ///   or a matching document cannot be loaded.
    pub fn search(&self, query_str: &str, limit: usize) -> Result<Vec<SearchResult>, ReaderError> {
        // The reader only lives for this call, so it never needs to observe later
        // commits. `Manual` still loads the latest committed segments at creation
        // time but avoids registering a directory watcher for every search.
        let reader: tantivy::IndexReader = self
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()?;
        let searcher = reader.searcher();

        let query_parser = QueryParser::for_index(
            &self.index,
            vec![self.schema.heading, self.schema.body, self.schema.tags],
        );
        let query = query_parser.parse_query(query_str)?;

        // `TopDocs::with_limit` is documented to panic for a limit of 0; asking for
        // zero hits is answered with an empty list instead.
        if limit == 0 {
            return Ok(Vec::new());
        }

        let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;

        top_docs
            .into_iter()
            .map(|(score, doc_address)| -> Result<SearchResult, ReaderError> {
                let doc: tantivy::TantivyDocument = searcher.doc(doc_address)?;
                Ok(self.to_search_result(&doc, score))
            })
            .collect()
    }

    /// Copies the stored fields of `doc` into a [`SearchResult`], substituting `""`
    /// or `0` for any field that is missing or has an unexpected type.
    fn to_search_result(&self, doc: &tantivy::TantivyDocument, score: f32) -> SearchResult {
        let text = |field: tantivy::schema::Field| -> String {
            doc.get_first(field)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string()
        };
        let number = |field: tantivy::schema::Field| -> u64 {
            doc.get_first(field).and_then(|v| v.as_u64()).unwrap_or(0)
        };

        SearchResult {
            path: text(self.schema.path),
            heading: text(self.schema.heading),
            body: text(self.schema.body),
            tags: text(self.schema.tags),
            heading_level: number(self.schema.heading_level),
            line_start: number(self.schema.line_start),
            score,
        }
    }
}
74 changes: 74 additions & 0 deletions src/indexer/schema.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use lindera::dictionary::load_dictionary;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera_tantivy::tokenizer::LinderaTokenizer;
use tantivy::Index;
use tantivy::schema::{Field, STORED, STRING, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::TextAnalyzer;

/// Name under which the lindera tokenizer is registered on an index, and by which
/// the tokenized text fields of the schema refer to it.
pub const TOKENIZER_NAME: &str = "lang_ja";

/// The tantivy schema used by the indexer together with a handle for each of its fields.
#[derive(Clone)]
pub struct IndexSchema {
/// The built tantivy schema.
pub schema: Schema,
/// `path`: text field indexed as a single untokenized term (`STRING`) and stored.
pub path: Field,
/// `heading`: text field tokenized with `TOKENIZER_NAME` and stored.
pub heading: Field,
/// `body`: text field tokenized with `TOKENIZER_NAME` and stored.
pub body: Field,
/// `tags`: text field tokenized with `TOKENIZER_NAME` and stored.
pub tags: Field,
/// `heading_level`: u64 field, indexed and stored.
pub heading_level: Field,
/// `line_start`: u64 field, stored only.
pub line_start: Field,
}

impl IndexSchema {
    /// Builds the schema and returns it together with its field handles.
    ///
    /// Fields are registered in the order `path`, `heading`, `body`, `tags`,
    /// `heading_level`, `line_start`.
    pub fn new() -> Self {
        // Shared options for the tokenized text fields: analyzed with the tokenizer
        // registered as `TOKENIZER_NAME`, indexed with frequencies and positions,
        // and stored so the text can be read back from a hit.
        let indexing = TextFieldIndexing::default()
            .set_tokenizer(TOKENIZER_NAME)
            .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
        let tokenized_and_stored = TextOptions::default()
            .set_indexing_options(indexing)
            .set_stored();

        let mut builder = Schema::builder();
        let path = builder.add_text_field("path", STRING | STORED);
        let heading = builder.add_text_field("heading", tokenized_and_stored.clone());
        let body = builder.add_text_field("body", tokenized_and_stored.clone());
        let tags = builder.add_text_field("tags", tokenized_and_stored);
        let heading_level =
            builder.add_u64_field("heading_level", tantivy::schema::INDEXED | STORED);
        let line_start = builder.add_u64_field("line_start", STORED);

        Self {
            schema: builder.build(),
            path,
            heading,
            body,
            tags,
            heading_level,
            line_start,
        }
    }

    /// Creates a lindera tokenizer in `Mode::Normal` backed by the embedded ipadic dictionary.
    ///
    /// # Panics
    /// Panics if the embedded dictionary cannot be loaded.
    fn create_lindera_tokenizer() -> LinderaTokenizer {
        let segmenter = Segmenter::new(
            Mode::Normal,
            load_dictionary("embedded://ipadic")
                .expect("Failed to load embedded ipadic dictionary"),
            None,
        );
        LinderaTokenizer::from_segmenter(segmenter)
    }

    /// Registers the lindera Japanese tokenizer on `index` under `TOKENIZER_NAME`.
    pub fn register_tokenizer(index: &Index) {
        let analyzer = TextAnalyzer::builder(Self::create_lindera_tokenizer()).build();
        index.tokenizers().register(TOKENIZER_NAME, analyzer);
    }
}

impl Default for IndexSchema {
fn default() -> Self {
Self::new()
}
}
106 changes: 106 additions & 0 deletions src/indexer/writer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
use std::fmt;
use std::path::Path;
use tantivy::{Index, IndexWriter as TantivyIndexWriter, doc};

use crate::indexer::schema::IndexSchema;

/// Value passed to `Index::writer` when creating the tantivy writer (its memory budget, in bytes).
const WRITER_HEAP_SIZE: usize = 50_000_000; // 50MB

/// Errors returned when opening the index for writing, adding documents or committing.
#[derive(Debug)]
pub enum WriterError {
/// An error reported by tantivy.
Tantivy(tantivy::TantivyError),
/// A filesystem error (e.g. from creating the index directory).
Io(std::io::Error),
}

impl fmt::Display for WriterError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
WriterError::Tantivy(e) => write!(f, "Tantivy error: {e}"),
WriterError::Io(e) => write!(f, "IO error: {e}"),
}
}
}

impl std::error::Error for WriterError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
WriterError::Tantivy(e) => Some(e),
WriterError::Io(e) => Some(e),
}
}
}

impl From<tantivy::TantivyError> for WriterError {
    /// Wraps a tantivy error, which lets `?` convert it into a [`WriterError`].
    fn from(err: tantivy::TantivyError) -> Self {
        Self::Tantivy(err)
    }
}

impl From<std::io::Error> for WriterError {
fn from(e: std::io::Error) -> Self {
WriterError::Io(e)
}
}

/// One section to be indexed; `IndexWriterWrapper::add_section` writes each member
/// into the schema field of the same name.
pub struct SectionDoc {
/// Value for the `path` field.
pub path: String,
/// Value for the `heading` field.
pub heading: String,
/// Value for the `body` field.
pub body: String,
/// Value for the `tags` field.
pub tags: String,
/// Value for the `heading_level` field (presumably the depth of the heading — confirm with the parser).
pub heading_level: u64,
/// Value for the `line_start` field (presumably the line where the section begins — confirm with the parser).
pub line_start: u64,
}

/// Write-side handle that pairs a tantivy index writer with the field handles of `IndexSchema`.
pub struct IndexWriterWrapper {
/// The underlying tantivy writer that documents are added to and committed through.
writer: TantivyIndexWriter,
/// Field handles used to build documents.
schema: IndexSchema,
}

impl IndexWriterWrapper {
/// ディスク上のインデックスを開き、writerを作成する
pub fn open(index_dir: &Path) -> Result<Self, WriterError> {
let schema = IndexSchema::new();
std::fs::create_dir_all(index_dir)?;
let index = Index::create_in_dir(index_dir, schema.schema.clone())?;
IndexSchema::register_tokenizer(&index);
let writer = index.writer(WRITER_HEAP_SIZE)?;
Ok(Self { writer, schema })
}

/// 既存のインデックスを開く(上書きなし)
pub fn open_existing(index_dir: &Path) -> Result<Self, WriterError> {
let schema = IndexSchema::new();
let index = Index::open_in_dir(index_dir)?;
IndexSchema::register_tokenizer(&index);
let writer = index.writer(WRITER_HEAP_SIZE)?;
Ok(Self { writer, schema })
}

/// RAMベースのインデックスを作成する(テスト用)
pub fn open_in_ram() -> Result<(Self, Index), WriterError> {
let schema = IndexSchema::new();
let index = Index::create_in_ram(schema.schema.clone());
IndexSchema::register_tokenizer(&index);
let writer = index.writer(WRITER_HEAP_SIZE)?;
Ok((Self { writer, schema }, index))
}

/// ドキュメント(セクション)を追加する
pub fn add_section(&mut self, section: &SectionDoc) -> Result<(), WriterError> {
self.writer.add_document(doc!(
self.schema.path => section.path.clone(),
self.schema.heading => section.heading.clone(),
self.schema.body => section.body.clone(),
self.schema.tags => section.tags.clone(),
self.schema.heading_level => section.heading_level,
self.schema.line_start => section.line_start,
))?;
Ok(())
}

/// 変更をコミットする
pub fn commit(&mut self) -> Result<(), WriterError> {
self.writer.commit()?;
Ok(())
}
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// Module declarations will be added as implementation progresses:
// pub mod cli;
pub mod indexer;
pub mod parser;
// pub mod indexer;
// pub mod search;
// pub mod output;
Loading
Loading