In [2]:
import os
import glob
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [None]:
# 環境変数で tokenizers の並列処理を無効化
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# folder path
data_folder = "../data"

# setting GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# BERT
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device=device)  

# query
query = "how to hide purchase history in the app store?"
query_embedding = model.encode(query, convert_to_tensor=True).to(device)

# target text file
text_files = glob.glob(os.path.join(data_folder, "text", "*.txt"))


results = []

def get_text_embedding(file):
    """ 各ファイルのテキストをベクトル化し、クエリと類似度を計算 """
    try:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read().lower()  
            if len(text) < 10:  
                return None

            
            text_embedding = model.encode(text, convert_to_tensor=True).to(device)

            similarity = util.pytorch_cos_sim(query_embedding, text_embedding).item()
            
            return (file, similarity)
    
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None


for file in text_files:
    result = get_text_embedding(file)
    if result:
        results.append(result)

# sort
results.sort(key=lambda x: x[1], reverse=True)

# save csv
df = pd.DataFrame(results, columns=["File", "Similarity"])
df.to_csv("../result/purchase_hide_related_files_bert.csv", index=False)

# present result
print("Top 5 relevant files:")
print(df.head(5))


Using device: cpu


model.safetensors:  23%|##3       | 21.0M/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top 5 relevant files:
                    File  Similarity
0  ../data/text/8210.txt    0.750824
1  ../data/text/7250.txt    0.699581
2  ../data/text/4601.txt    0.669198
3  ../data/text/4729.txt    0.668846
4  ../data/text/3133.txt    0.652501


In [6]:
df=pd.read_csv("../result/purchase_hide_related_files_bert.csv")
top_10_files = df.sort_values(by="Similarity", ascending=False).head(9)
print("類似度上位10件のファイル:")
print(top_10_files)

類似度上位10件のファイル:
                    File  Similarity
0  ../data/text/8210.txt    0.750824
1  ../data/text/7250.txt    0.699581
2  ../data/text/4601.txt    0.669198
3  ../data/text/4729.txt    0.668846
4  ../data/text/3133.txt    0.652501
5  ../data/text/6349.txt    0.602130
6   ../data/text/483.txt    0.598953
7  ../data/text/1042.txt    0.594634
8  ../data/text/6657.txt    0.594049


In [7]:
for idx, row in top_10_files.iterrows():
    file_path = row['File']
    similarity = row['Similarity']
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text_content = f.read()
            print(f"\n--- ファイル: {file_path} ---")
            print(f"類似度: {similarity}")
            print(f"内容:\n{text_content[:500]}") 
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


--- ファイル: ../data/text/8210.txt ---
類似度: 0.750823974609375
内容:
Q: How do I unhide purchases from the App Store? 
Possible Duplicate:
Is possible modify the App Store purchased listing (delete or hide some/unhide other hidden apps)? 

I hid an app listed in my purchased list by accident, how would I go about getting in back?

A: *

*Go to your account in iTunes on your computer by selecting your Apple ID on the top right in the iTunes Store.

*Select view hidden purchases.

*Find the app and unhide.





--- ファイル: ../data/text/7250.txt ---
類似度: 0.6995807886123657
内容:
Q: Is possible modify the App Store purchased listing (delete or hide some/unhide other hidden apps)? I have an application listed in my bought applications. I prefer to not see or hide some apps I have purchased in the list of purchased apps. Is there a way to do this? Can I reverse this and unhide these hidden apps at a later date?
For example:


A: Not on any version of the Mac App Store and iTunes App store for iTunes 