In [1]:
import os
import re
import pdfplumber
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model
model = SentenceTransformer('sentence-transformers/LaBSE')

## Creating vectors for ESRS TOPICS

In [3]:
# Define Japanese topics
topics = [
    "気候変動", "気候変動への適応", "気候変動の緩和", "エネルギー",
    "空気の汚染", "水の汚染", "土壌汚染", "生物および食料資源の汚染",
    "懸念物質", "非常に懸念の高い物質", "マイクロプラスチック",
    "水の消費と取水", "放水", "海洋資源の使用", "海洋資源の採取と利用",
    "生物多様性の損失", "侵略的外来種", "種の個体数", "絶滅リスク", "砂漠化",
    "循環経済", "無駄", "労働条件", "適切な賃金", "労働時間", "健康と安全",
    "平等と機会", "障害者の雇用", "児童労働", "プライバシー", "文化的権利",
    "企業文化", "ロビー活動", "贈収賄", "動物福祉"
]

In [4]:
# Step 2: Encode topic phrases using the same model (LaBSE)
topic_embeddings = model.encode(topics, convert_to_numpy=True, show_progress_bar=True)

# Step 3: Store in a new DataFrame
topic_df = pd.DataFrame({
    'topic': topics,
    'embedding_topic': topic_embeddings.tolist()
})

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.79it/s]


In [5]:
# Step 4: Check result
topic_df.head(5)

Unnamed: 0,topic,embedding_topic
0,気候変動,"[-0.03803761675953865, -0.04873177781701088, -..."
1,気候変動への適応,"[-0.00950303953140974, -0.040512144565582275, ..."
2,気候変動の緩和,"[-0.03168332204222679, -0.022349612787365913, ..."
3,エネルギー,"[0.01078301016241312, -0.05533900484442711, -0..."
4,空気の汚染,"[-0.06226995214819908, -0.034258317202329636, ..."


## EXTRACTING TEXT FROM THE PDF FILES

In [6]:
pdf_folder = "JAPANESE_FILES" 
output_folder = "sentence_embdedding_files"  
os.makedirs(output_folder, exist_ok=True)

In [7]:
model = SentenceTransformer('sentence-transformers/LaBSE')

In [8]:
def clean_japanese_sentences(text):
    sentences = re.split(r'(?<=[。！？])', text)
    cleaned = []
    for s in sentences:
        s = s.strip()
        if len(s) > 10 and not re.match(r'^[\d\W_]+$', s):
            cleaned.append(s)
    return cleaned

In [9]:
def is_valid_sentence(sentence):
    if re.search(r'https?://', sentence):
        return False
    if len(re.findall(r'[一-龯ぁ-ゔァ-ヴー々〆〤]', sentence)) < 5:
        return False
    if re.match(r'^[\d\W_]+$', sentence):
        return False
    return True

In [10]:
def advanced_is_valid_sentence(sentence):
    if re.search(r'https?://', sentence):
        return False
    if len(re.findall(r'[一-龯ぁ-ゔァ-ヴー々〆〤]', sentence)) < 6:
        return False
    if len(sentence) < 15:
        return False
    if re.match(r'^[\d\W\s_]+$', sentence):
        return False
    if sentence.count('\n') > 1 or len(sentence.split()) < 3:
        return False
    return True

In [11]:
processed_files = []  # List of (filename, sentence_count)
skipped_files = []    # List of (filename, reason)
failed_files = []     # List of (filename, error_message)

In [12]:
# -------------------- MAIN LOOP --------------------
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        print(f"Processing: {filename}")
        data = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for i, page in enumerate(pdf.pages):
                    raw_text = page.extract_text()
                    if raw_text:
                        sentences = clean_japanese_sentences(raw_text)
                        for sentence in sentences:
                            data.append({"page": i + 1, "sentence": sentence})
        except Exception as e:
            error_msg = str(e)
            print(f"❌ Failed to process {filename} — Error: {error_msg}")
            failed_files.append((filename, error_msg))
            continue

        if not data:
            reason = "No extractable text"
            print(f"⚠️ Skipping {filename} — {reason}.")
            skipped_files.append((filename, reason))
            continue

        # Filter valid sentences using both filters
        final_filtered_data = [
            row for row in data
            if is_valid_sentence(row["sentence"]) and advanced_is_valid_sentence(row["sentence"])
        ]

        if not final_filtered_data:
            reason = "No valid sentences after filtering"
            print(f"⚠️ Skipping {filename} — {reason}.")
            skipped_files.append((filename, reason))
            continue

        # Extract sentence and page info
        sentences = [row["sentence"] for row in final_filtered_data]
        pages = [row["page"] for row in final_filtered_data]

        # Compute embeddings
        embeddings = model.encode(sentences).tolist()

        # Create DataFrame and save to CSV
        df = pd.DataFrame({
            "sentence": sentences,
            "page": pages,
            "embedding": embeddings
        })

        output_csv_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.csv")
        df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")
        print(f"✅ Saved: {output_csv_path}")
        processed_files.append((filename, len(sentences)))

Processing: 01_JAPAN_TABACCO.pdf
✅ Saved: sentence_embdedding_files\01_JAPAN_TABACCO.csv
Processing: 02_SEVEN&AI_HD.pdf
✅ Saved: sentence_embdedding_files\02_SEVEN&AI_HD.csv
Processing: 03_SHINETSU_CHEMISTRY.pdf
✅ Saved: sentence_embdedding_files\03_SHINETSU_CHEMISTRY.csv
Processing: 04_TAKEDA_PHARMA.pdf
✅ Saved: sentence_embdedding_files\04_TAKEDA_PHARMA.csv
Processing: 05_CHUGAI_PHARMA.pdf
✅ Saved: sentence_embdedding_files\05_CHUGAI_PHARMA.csv
Processing: 06_TERUMO.pdf
✅ Saved: sentence_embdedding_files\06_TERUMO.csv
Processing: 07_DAIICHI_SANKYO_PHARMA.pdf
✅ Saved: sentence_embdedding_files\07_DAIICHI_SANKYO_PHARMA.csv
Processing: 08_ORIENTAL_LAND.pdf
✅ Saved: sentence_embdedding_files\08_ORIENTAL_LAND.csv
Processing: 09_FUJIFILM_HD.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\09_FUJIFILM_HD.csv
Processing: 10_BRIDGESTONE.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss 

✅ Saved: sentence_embdedding_files\10_BRIDGESTONE.csv
Processing: 11_NIPPONSTEEL.pdf
✅ Saved: sentence_embdedding_files\11_NIPPONSTEEL.csv
Processing: 12_RECRUIT_HD.pdf
✅ Saved: sentence_embdedding_files\12_RECRUIT_HD.csv
Processing: 14_NIPPON_YUBIN.pdf


Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\14_NIPPON_YUBIN.csv
Processing: 15_TOYOTA_INDUSTRIES_CORP.pdf
✅ Saved: sentence_embdedding_files\15_TOYOTA_INDUSTRIES_CORP.csv
Processing: 16_SMC_CORP.pdf
⚠️ Skipping 16_SMC_CORP.pdf — No valid sentences after filtering.
Processing: 17_KOMATSU.pdf
✅ Saved: sentence_embdedding_files\17_KOMATSU.csv
Processing: 18_DAIKIN.pdf
✅ Saved: sentence_embdedding_files\18_DAIKIN.csv
Processing: 19_HITACHI.pdf
✅ Saved: sentence_embdedding_files\19_HITACHI.csv
Processing: 20_MITSAUBISHI_ELECTRIC.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\20_MITSAUBISHI_ELECTRIC.csv
Processing: 21_NIDEC.pdf
✅ Saved: sentence_embdedding_files\21_NIDEC.csv
Processing: 22_FUJITSU.pdf
✅ Saved: sentence_embdedding_files\22_FUJITSU.csv
Processing: 23_RENESUS.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


✅ Saved: sentence_embdedding_files\23_RENESUS.csv
Processing: 24_PANASONIC_HD.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

✅ Saved: sentence_embdedding_files\24_PANASONIC_HD.csv
Processing: 25_SONY.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\25_SONY.csv
Processing: 26_ADVANTEST.pdf
✅ Saved: sentence_embdedding_files\26_ADVANTEST.csv
Processing: 27_KEYENCE.pdf
⚠️ Skipping 27_KEYENCE.pdf — No valid sentences after filtering.
Processing: 28_DENSO.pdf
✅ Saved: sentence_embdedding_files\28_DENSO.csv
Processing: 30_FANUC.pdf


Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\30_FANUC.csv
Processing: 31_KYOCERA.pdf
✅ Saved: sentence_embdedding_files\31_KYOCERA.csv
Processing: 32_MURATA.pdf
✅ Saved: sentence_embdedding_files\32_MURATA.csv
Processing: 33_MITSUBISHI_HEAVY_INDUSTRIES.pdf
✅ Saved: sentence_embdedding_files\33_MITSUBISHI_HEAVY_INDUSTRIES.csv
Processing: 34_YUCHO_BANK.pdf
✅ Saved: sentence_embdedding_files\34_YUCHO_BANK.csv
Processing: 35_TOYOTA_MOTOR_CORP.pdf
✅ Saved: sentence_embdedding_files\35_TOYOTA_MOTOR_CORP.csv
Processing: 36_HONDA.pdf
✅ Saved: sentence_embdedding_files\36_HONDA.csv
Processing: 37_SUZUKI.pdf
✅ Saved: sentence_embdedding_files\37_SUZUKI.csv
Processing: 38_HOYA.pdf
✅ Saved: sentence_embdedding_files\38_HOYA.csv
Processing: 39_CANON.pdf
✅ Saved: sentence_embdedding_files\39_CANON.csv
Processing: 40_NINTENDO.pdf
⚠️ Skipping 40_NINTENDO.pdf — No valid sentences after filtering.
Processing: 41_ITOCHU.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\41_ITOCHU.csv
Processing: 42_MARUBENI.pdf
✅ Saved: sentence_embdedding_files\42_MARUBENI.csv
Processing: 43_TOYOTA_TSUSHO.pdf
✅ Saved: sentence_embdedding_files\43_TOYOTA_TSUSHO.csv
Processing: 44_MITSUI_CORP.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\44_MITSUI_CORP.csv
Processing: 45_TOKYO_ELECTRON.pdf
✅ Saved: sentence_embdedding_files\45_TOKYO_ELECTRON.csv
Processing: 46_SUMITOMO_CORP.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss 

✅ Saved: sentence_embdedding_files\46_SUMITOMO_CORP.csv
Processing: 47_MITSUBISHI_CORP.pdf
✅ Saved: sentence_embdedding_files\47_MITSUBISHI_CORP.csv
Processing: 48_UNICHARM.pdf
✅ Saved: sentence_embdedding_files\48_UNICHARM.csv
Processing: 49_AEON.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\49_AEON.csv
Processing: 50_MUFG.pdf
✅ Saved: sentence_embdedding_files\50_MUFG.csv
Processing: 52_MIZUHO.pdf
✅ Saved: sentence_embdedding_files\52_MIZUHO.csv
Processing: 53_ORIX.pdf
✅ Saved: sentence_embdedding_files\53_ORIX.csv
Processing: 54_NOMURA_SEC_HD.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\54_NOMURA_SEC_HD.csv
Processing: 55_SOMPO_HD.pdf
✅ Saved: sentence_embdedding_files\55_SOMPO_HD.csv
Processing: 56_MS&AD_INSURE_G.pdf
✅ Saved: sentence_embdedding_files\56_MS&AD_INSURE_G.csv
Processing: 57_DAIICHI_LIFE.pdf
✅ Saved: sentence_embdedding_files\57_DAIICHI_LIFE.csv
Processing: 58_TOKIO_MARINE.pdf
✅ Saved: sentence_embdedding_files\58_TOKIO_MARINE.csv
Processing: 59_MITSU_FUDOSAN.pdf


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\59_MITSU_FUDOSAN.csv
Processing: 60_MITSUBISHI_ESTATE.pdf


Data-loss while decompressing corrupted data


✅ Saved: sentence_embdedding_files\60_MITSUBISHI_ESTATE.csv
Processing: 61_JR_EAST.pdf
✅ Saved: sentence_embdedding_files\61_JR_EAST.csv
Processing: 62_JR_TOKAI.pdf
⚠️ Skipping 62_JR_TOKAI.pdf — No extractable text.
Processing: 63_NTT.pdf
✅ Saved: sentence_embdedding_files\63_NTT.csv
Processing: 64_KDDI.pdf
✅ Saved: sentence_embdedding_files\64_KDDI.csv
Processing: 65_SOFTBANK.pdf
✅ Saved: sentence_embdedding_files\65_SOFTBANK.csv
Processing: 66_NTT_DATA.pdf
✅ Saved: sentence_embdedding_files\66_NTT_DATA.csv
Processing: 67_SOFTBANK_GROUP.pdf
✅ Saved: sentence_embdedding_files\67_SOFTBANK_GROUP.csv
Processing: 68_FIRST_RETAILING.pdf
✅ Saved: sentence_embdedding_files\68_FIRST_RETAILING.csv


In [13]:
print(f"✅ Processed ({len(processed_files)}):")
for fname, count in processed_files:
    print(f"  - {fname}: {count} sentences")

✅ Processed (61):
  - 01_JAPAN_TABACCO.pdf: 422 sentences
  - 02_SEVEN&AI_HD.pdf: 219 sentences
  - 03_SHINETSU_CHEMISTRY.pdf: 267 sentences
  - 04_TAKEDA_PHARMA.pdf: 117 sentences
  - 05_CHUGAI_PHARMA.pdf: 189 sentences
  - 06_TERUMO.pdf: 290 sentences
  - 07_DAIICHI_SANKYO_PHARMA.pdf: 352 sentences
  - 08_ORIENTAL_LAND.pdf: 174 sentences
  - 09_FUJIFILM_HD.pdf: 260 sentences
  - 10_BRIDGESTONE.pdf: 305 sentences
  - 11_NIPPONSTEEL.pdf: 351 sentences
  - 12_RECRUIT_HD.pdf: 98 sentences
  - 14_NIPPON_YUBIN.pdf: 443 sentences
  - 15_TOYOTA_INDUSTRIES_CORP.pdf: 218 sentences
  - 17_KOMATSU.pdf: 148 sentences
  - 18_DAIKIN.pdf: 259 sentences
  - 19_HITACHI.pdf: 104 sentences
  - 20_MITSAUBISHI_ELECTRIC.pdf: 378 sentences
  - 21_NIDEC.pdf: 251 sentences
  - 22_FUJITSU.pdf: 263 sentences
  - 23_RENESUS.pdf: 90 sentences
  - 24_PANASONIC_HD.pdf: 161 sentences
  - 25_SONY.pdf: 209 sentences
  - 26_ADVANTEST.pdf: 160 sentences
  - 28_DENSO.pdf: 484 sentences
  - 30_FANUC.pdf: 152 sentences
  -

In [14]:

print(f"\n⚠️ Skipped ({len(skipped_files)}):")
for fname, reason in skipped_files:
    print(f"  - {fname} → {reason}")


⚠️ Skipped (4):
  - 16_SMC_CORP.pdf → No valid sentences after filtering
  - 27_KEYENCE.pdf → No valid sentences after filtering
  - 40_NINTENDO.pdf → No valid sentences after filtering
  - 62_JR_TOKAI.pdf → No extractable text


In [15]:
print(f"\n❌ Failed ({len(failed_files)}):")
for fname, error in failed_files:
    print(f"  - {fname} → Error: {error}")


❌ Failed (0):


In [16]:
print("\n✅ Done.")


✅ Done.


## Cosine Similarity

In [17]:
input_folder = "sentence_embdedding_files"  # Folder with sentence+embedding CSVs
output_folder = "cosine_similarity_japanese_pdf_file"
os.makedirs(output_folder, exist_ok=True)

In [18]:
# Threshold for matching
threshold = 0.3

In [19]:
# Topic vectors (from earlier)
topic_vecs = np.vstack(topic_df['embedding_topic'].values)

In [20]:
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        print(f"🔍 Processing: {filename}")

        # Load sentence+embedding CSV
        df = pd.read_csv(input_path, converters={'embedding': eval})

        # Sanity check
        required_cols = {'embedding', 'sentence', 'page'}
        if df.empty or not required_cols.issubset(df.columns):
            print(f"⚠️ Skipping {filename}: missing one of {required_cols}.")
            continue

        # Convert embeddings to array
        sentence_vecs = np.vstack(df['embedding'].values)

        # Compute cosine similarity
        similarity_matrix = cosine_similarity(sentence_vecs, topic_vecs)

        # Collect matches
        matches = []
        for sent_idx, row in enumerate(similarity_matrix):
            for topic_idx, score in enumerate(row):
                if score >= threshold:
                    matches.append({
                        'sentence': df.loc[sent_idx, 'sentence'],
                        'page': df.loc[sent_idx, 'page'],
                        'topic': topic_df.loc[topic_idx, 'topic'],
                        'similarity': round(score, 4)
                    })

        matched_df = pd.DataFrame(matches)

        # Save results
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_cosine_matches.csv")
        matched_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"✅ Saved: {output_path}")

🔍 Processing: 01_JAPAN_TABACCO.csv
✅ Saved: cosine_similarity_japanese_pdf_file\01_JAPAN_TABACCO_cosine_matches.csv
🔍 Processing: 02_SEVEN&AI_HD.csv
✅ Saved: cosine_similarity_japanese_pdf_file\02_SEVEN&AI_HD_cosine_matches.csv
🔍 Processing: 03_SHINETSU_CHEMISTRY.csv
✅ Saved: cosine_similarity_japanese_pdf_file\03_SHINETSU_CHEMISTRY_cosine_matches.csv
🔍 Processing: 04_TAKEDA_PHARMA.csv
✅ Saved: cosine_similarity_japanese_pdf_file\04_TAKEDA_PHARMA_cosine_matches.csv
🔍 Processing: 05_CHUGAI_PHARMA.csv
✅ Saved: cosine_similarity_japanese_pdf_file\05_CHUGAI_PHARMA_cosine_matches.csv
🔍 Processing: 06_TERUMO.csv
✅ Saved: cosine_similarity_japanese_pdf_file\06_TERUMO_cosine_matches.csv
🔍 Processing: 07_DAIICHI_SANKYO_PHARMA.csv
✅ Saved: cosine_similarity_japanese_pdf_file\07_DAIICHI_SANKYO_PHARMA_cosine_matches.csv
🔍 Processing: 08_ORIENTAL_LAND.csv
✅ Saved: cosine_similarity_japanese_pdf_file\08_ORIENTAL_LAND_cosine_matches.csv
🔍 Processing: 09_FUJIFILM_HD.csv
✅ Saved: cosine_similarity_japa