In [3]:
import os
import pandas as pd
import glob
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# folder path
data_folder = "../data/text"

# BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# query
query = "how to hide purchase history in app store"

# encording
def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().flatten()

query_vector = encode_text(query)

# target file
text_files = glob.glob(os.path.join(data_folder, "*.txt"))

# near text
relevant_files = []

for file in tqdm(text_files):
    try:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
            text_vector = encode_text(text)
            similarity = cosine_similarity([query_vector], [text_vector])[0][0]
            if similarity > 0.7:  
                relevant_files.append((file, similarity))
    except Exception as e:
        print(f"Error processing {file}: {e}")

df = pd.DataFrame(relevant_files, columns=["File", "Similarity"])
df.to_csv("../result/bert_search_results.csv", index=False)

print(f"該当するファイル数: {len(relevant_files)}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 10001/10001 [2:33:56<00:00,  1.08it/s] 


該当するファイル数: 2499


In [4]:
df=pd.read_csv("../result/bert_search_results.csv")
top_10_files = df.sort_values(by="Similarity", ascending=False).head(10)
print("類似度上位10件のファイル:")
print(top_10_files)

類似度上位10件のファイル:
                       File  Similarity
1296  ../data/text/5583.txt    0.864557
1931  ../data/text/6360.txt    0.856678
2415  ../data/text/9925.txt    0.855231
1512  ../data/text/4992.txt    0.854641
460    ../data/text/437.txt    0.848185
1902  ../data/text/9090.txt    0.847875
2050  ../data/text/7046.txt    0.847269
1891  ../data/text/2363.txt    0.843861
2193  ../data/text/2023.txt    0.843390
1140  ../data/text/4867.txt    0.841890


In [5]:
for idx, row in top_10_files.iterrows():
    file_path = row['File']
    similarity = row['Similarity']
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text_content = f.read()
            print(f"\n--- ファイル: {file_path} ---")
            print(f"類似度: {similarity}")
            print(f"内容:\n{text_content[:500]}") 
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


--- ファイル: ../data/text/5583.txt ---
類似度: 0.8645571
内容:
Q: How can I enable pinch to zoom in Finder on OS X Lion? After upgrading to Lion the pinch-to-zoom seems to be gone in Finder.  Didn't even find an option for it in system preferences. Any ideas?

A: The preference for this used to be located in Finder>Preferences. It's no longer available and after some research online, it looks like no one has discovered a way to turn this on using a plist edit. 
You might consider telling Apple about how much you miss this:
http://www.apple.com/feedback/maco

--- ファイル: ../data/text/6360.txt ---
類似度: 0.8566777
内容:
Q: Is there any IDE for iPad? 
Possible Duplicate:
HTML / CSS / Javascript editor for the iPad? 

Is there any IDE like Notepad++ for iPad? I want to create HTML pages with CSS and jQuery on my iPad.

A: Haven't used it myself, but Textastic ($10) should help you with a lot of what you're looking to do.
http://www.textasticapp.com/


--- ファイル: ../data/text/9925.txt ---
類似度: 0.8552313
