In [4]:
import os
import pandas as pd
from transformers import pipeline

def shorten_texts_in_folder(folder_path):
    """
    指定フォルダ内の全CSVファイルに対して、文章を短縮して "_shortened.csv" を出力する。
    各CSVは「文章, ラベル」の2列構成を想定。
    """

    # 文章短縮モデル（英語用）
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")#"facebook/bart-large-cnn"

    # フォルダ内のCSVをすべて取得
    csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name.replace(".csv", "_shortened.csv"))

        print(f"処理中: {file_name}")

        # CSV読み込み
        df = pd.read_csv(input_path)

        # 文章列を短縮
        shortened_texts = []
        for text in df["text"]:
            try:
                summary = summarizer(
                    text,
                    max_length=15,   # 出力の最大トークン数
                    min_length=10,   # 出力の最小トークン数
                    do_sample=False  # 決定的に
                )[0]["summary_text"]
                shortened_texts.append(summary)
            except Exception as e:
                print(f"スキップ: {text[:50]}... ({e})")
                shortened_texts.append(text)  # エラー時は元の文章を残す

        # 結果を保存
        df["text"] = shortened_texts
        df.to_csv(output_path, index=False)
        print(f"✅ 出力完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
shorten_texts_in_folder(folder)




config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

処理中: emotion_test.csv


Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 15, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 15, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_len

✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_test_shortened.csv
処理中: emotion_train.csv


Your max_length is set to 15, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 15, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 15, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 15, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length

✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_train_shortened.csv
処理中: emotion_validation.csv


Your max_length is set to 15, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 15, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 15, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 15, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_leng

✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_validation_shortened.csv


In [5]:
import os
import pandas as pd
from googletrans import Translator

def translate_shortened_csvs(folder_path):
    """
    フォルダ内の *_shortened.csv を日本語に翻訳して *_shortened_ja.csv を出力
    """
    translator = Translator()

    # 翻訳対象ファイルを取得
    csv_files = [f for f in os.listdir(folder_path) if f.endswith("_shortened.csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name.replace("_shortened.csv", "_shortened_ja.csv"))

        print(f"翻訳中: {file_name}")

        # CSVを読み込み
        df = pd.read_csv(input_path)

        # 翻訳列
        translated_texts = []
        for i, text in enumerate(df["text"]):
            try:
                result = translator.translate(text, src="en", dest="ja")
                translated_texts.append(result.text)
            except Exception as e:
                print(f"⚠️ 翻訳スキップ ({i}行目): {e}")
                translated_texts.append(text)  # 失敗時は元のまま

        # 日本語列を追加して保存
        df["text_ja"] = translated_texts
        df.to_csv(output_path, index=False)
        print(f"✅ 出力完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
translate_shortened_csvs(folder)


翻訳中: emotion_test_shortened.csv
⚠️ 翻訳スキップ (737行目): The read operation timed out
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_test_shortened_ja.csv
翻訳中: emotion_train_shortened.csv
⚠️ 翻訳スキップ (51行目): The read operation timed out
⚠️ 翻訳スキップ (1291行目): The read operation timed out
⚠️ 翻訳スキップ (1321行目): The read operation timed out
⚠️ 翻訳スキップ (1403行目): The read operation timed out
⚠️ 翻訳スキップ (8341行目): The read operation timed out
⚠️ 翻訳スキップ (15996行目): The read operation timed out
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_train_shortened_ja.csv
翻訳中: emotion_validation_shortened.csv
⚠️ 翻訳スキップ (889行目): The read operation timed out
⚠️ 翻訳スキップ (921行目): The read operation timed out
⚠️ 翻訳スキップ (923行目): The read operation timed out
⚠️ 翻訳スキップ (1049行目): The read operation timed out
⚠️ 翻訳スキップ (1613行目): The read operation timed out
⚠️ 翻訳スキップ (1696行目): the JSON object must be str, bytes or bytearray, not NoneType
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_validation_shortened_ja.csv


In [4]:
import os
import pandas as pd
def final_csvs(folder_path):
    """
    フォルダ内の*_shortened_ja.csv を最終的な形にして出力
    """


    # 翻訳対象ファイルを取得
    csv_files = [f for f in os.listdir(folder_path) if f.endswith("_shortened_ja.csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name.replace("_shortened_ja.csv", "_final.csv"))

        print(f"翻訳中: {file_name}")

        # CSVを読み込み
        df = pd.read_csv(input_path)

        df = df.drop(columns=["text"])  # 英語列を削除
        df = df[["text_ja", "label"]]  # 列の順番を変更
        
        # 最終CSVを保存
        df.to_csv(output_path, index=False)
        print(f"✅ 出力完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
final_csvs(folder)

翻訳中: emotion_test_shortened_ja.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_test_final.csv
翻訳中: emotion_train_shortened_ja.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_train_final.csv
翻訳中: emotion_validation_shortened_ja.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv


In [5]:
import os
import pandas as pd

def final_csvs(folder_path):
    """
    フォルダ内の *_final.csv に対して、
    「。」が複数ある場合、最初の「。」までで切り捨てる処理を行う。
    """
    csv_files = [f for f in os.listdir(folder_path) if f.endswith("_final.csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name)

        df = pd.read_csv(input_path)

        # 各行ごとに処理
        for index, row in df.iterrows():
            text = str(row["text_ja"])  # 念のため文字列化（NaN対策）
            if "。" in text:
                first_sentence = text.split("。", 1)[0] + "。"
            else:
                first_sentence = text

            # 各行に上書き
            df.at[index, "text_ja"] = first_sentence

        # ファイルを上書き保存
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"✅ 出力完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
final_csvs(folder)


✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_test_final.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_train_final.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv


In [6]:
import os
import pandas as pd

def final_csvs(folder_path):
    # 翻訳対象ファイルを取得
    csv_files = [f for f in os.listdir(folder_path) if f.endswith("_final.csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name)

        df = pd.read_csv(input_path)

        new_rows = []
        for index, row in df.iterrows():
            text = str(row["text_ja"])

            # 「。」で最初の文を取り出す
            if "。" in text:
                first_sentence = text.split("。", 1)[0] + "。"
            else:
                first_sentence = text

            # 「「があるのに」」がない」 → この文は除外
            if "「" in first_sentence and "」" not in first_sentence:
                continue  # スキップ

            new_rows.append(row)
            new_rows[-1]["text_ja"] = first_sentence

        # 新しいDataFrame作成
        new_df = pd.DataFrame(new_rows)

        # 上書き保存
        new_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"✅ 出力完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
final_csvs(folder)


✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_test_final.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_train_final.csv
✅ 出力完了: C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv


In [7]:
import os
import pandas as pd
from googletrans import Translator
import re

def translate_to_japanese(folder_path):
    translator = Translator()

    # 翻訳対象ファイルを取得
    csv_files = [f for f in os.listdir(folder_path) if f.endswith("_final.csv")]

    for file_name in csv_files:
        input_path = os.path.join(folder_path, file_name)
        output_path = os.path.join(folder_path, file_name)  # 上書き

        df = pd.read_csv(input_path)
        new_rows = []

        for index, row in df.iterrows():
            text_ja = str(row.get("text_ja", ""))
            text_en = str(row.get("text_en", ""))

            # --- 1️⃣ text_ja が英語の場合、日本語に翻訳 ---
            if re.search(r'[A-Za-z]', text_ja):
                try:
                    translated = translator.translate(text_ja, src='en', dest='ja').text
                    text_ja = translated
                except Exception as e:
                    print(f"⚠️ 翻訳失敗（{file_name} 行 {index}）: {e}")

            # --- 2️⃣ text_en がある場合も日本語に翻訳して text_ja に統合 ---
            if text_en.strip():
                try:
                    translated_en = translator.translate(text_en, src='en', dest='ja').text
                    text_ja += " " + translated_en  # 日本語化した英文を後ろに追加
                except Exception as e:
                    print(f"⚠️ text_en 翻訳失敗（{file_name} 行 {index}）: {e}")

            # --- 3️⃣ 更新 ---
            new_row = row.copy()
            new_row["text_ja"] = text_ja.strip()
            new_rows.append(new_row)

        new_df = pd.DataFrame(new_rows)
        new_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"✅ 日本語翻訳＆上書き完了: {output_path}")

# 使用例
folder = r"C:\Users\neoia\研究\data\emotion"
translate_to_japanese(folder)


✅ 日本語翻訳＆上書き完了: C:\Users\neoia\研究\data\emotion\emotion_test_final.csv
✅ 日本語翻訳＆上書き完了: C:\Users\neoia\研究\data\emotion\emotion_train_final.csv
✅ 日本語翻訳＆上書き完了: C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv


In [2]:
import pandas as pd
import spacy
from collections import defaultdict

# -------------------------
# 1. データ読み込み
# -------------------------
train = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_train_final.csv")
val   = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv")
test  = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_test_final.csv")

# 文章が 0 カラム目にある前提
TEXT_COL = train.columns[0]

# -------------------------
# 2. GiNZA のロード
# -------------------------
nlp = spacy.load("ja_ginza")


# -------------------------
# 3. トークン → 行番号 の辞書を作る関数
# -------------------------
def build_token_dict(df):
    token_dict = defaultdict(list)

    for idx, sentence in df[TEXT_COL].items():
        doc = nlp(sentence)
        tokens = [token.text for token in doc]

        for t in tokens:
            token_dict[t].append(idx)

    return token_dict


# -------------------------
# 4. train / val / test の辞書作成
# -------------------------
train_tokens = build_token_dict(train)
val_tokens   = build_token_dict(val)
test_tokens  = build_token_dict(test)


# -------------------------
# 5. 未知語（train+val に無い test の語）を抽出
# -------------------------
train_val_vocab = set(list(train_tokens.keys()) + list(val_tokens.keys()))
test_vocab = set(test_tokens.keys())

unknown_tokens = test_vocab - train_val_vocab

print("=== テストにのみ存在する未知トークン ===")
print(unknown_tokens)

# どの行に出現したか
unknown_token_rows = {tok: test_tokens[tok] for tok in unknown_tokens}

print("=== 未知トークンの出現行 index ===")
print(unknown_token_rows)



  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


=== テストにのみ存在する未知トークン ===
{'話しかけれ', '拾い', 'アカアシシギ', '投手', 'エレベーター', 'バーコムコーチ', '密猟者', '空中', 'だます', '紫', 'ボーコーチ', 'オブ', '社会主義', '随所', 'ファー', 'ブランケット', '定時', 'かじり', '味覚', '下品', '変革', '長々', '勝た', '左翼', 'ニンジン', '埋め合わせ', '日焼け', '投票権', 'アンジョルラス', 'ムード', '購入者', '乗り物', 'カラス', 'タブ', 'バンジョー', '宣教', '引っ込み', 'つら', '照らし', '庭師', '後ほど', '貧乏', 'ねじり', 'ハリラヤクッキー', 'ウォーカー', '飲め', 'シティ', 'マイラー', '現代化', '待ち伏せ', '大家', 'チンポ', '暦年', '乗り込み', '今月末', '写真家', '交通量', '罪深い', 'やつ', '瓶', '養わ', 'ステラ', '搾取', 'チェクスパーティー', '見つかり', '至高', '慈善活動', 'マーカー', 'もてなし', '用事', '物資', 'mopey', '過敏', 'アバ', 'ユタ', 'フレーバー', 'ピクピク', 'とらえどころ', 'DD', 'コスプレ', 'ゴールド', '腫れる', '没頭', '占領', 'コメントタグ', 'ナグ', 'グラフィック', 'ルーティン', '苛性', '異性愛者', '数人', 'br', '陰唇', 'ライティング', '権力者', 'シャオ', '着色', '吸う', 'チャリティー', '援助', '中途半端', '私生活', 'ドラマティック', '交渉人', '鳥肌', 'ドラえもん', '批判的', 'アウトフィッター', 'たたく', '湧き上がる', '核心', 'クイン', 'え', '鳳', '受け取ら', 'ロウ', '閉じ込めよう', 'ホークス', 'プレッシー', 'なじみ', 'コーナー', 'BBQ', '化', '醸造家', 'ジャー', '憧れる', 'アマルグ', '国境', '巫女', '日目', 'ミネラルウォーター', '存在感', '屈する

In [3]:
# --- 未知語の行 index を集める ---
rows_to_drop = set()
for rows in unknown_token_rows.values():
    rows_to_drop.update(rows)

# --- test データから削除 ---
test_cleaned = test.drop(index=rows_to_drop).reset_index(drop=True)

# --- CSV に保存 ---
output_path = r"C:\Users\neoia\研究\data\emotion\emotion_test_final_cleaned.csv"
test_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")

print("保存完了:", output_path)


保存完了: C:\Users\neoia\研究\data\emotion\emotion_test_final_cleaned.csv


In [4]:
import re

# -------------------------
# 英字を含む行 index を返す関数
# -------------------------
def get_english_rows(df, text_col):
    return [idx for idx, text in df[text_col].items() if re.search(r"[A-Za-z]", str(text))]


# -------------------------
# 各データセットで実行
# -------------------------
train_eng_rows = get_english_rows(train, TEXT_COL)
val_eng_rows   = get_english_rows(val, TEXT_COL)
test_eng_rows  = get_english_rows(test_cleaned, TEXT_COL)

print("=== 英字を含む行 index リスト ===")
print("train:", train_eng_rows)
print("val:", val_eng_rows)
print("test_cleaned:", test_eng_rows)


=== 英字を含む行 index リスト ===
train: [5, 21, 43, 70, 159, 271, 281, 301, 312, 314, 379, 419, 427, 431, 447, 487, 489, 566, 599, 776, 835, 845, 903, 917, 942, 1032, 1081, 1176, 1243, 1252, 1275, 1312, 1321, 1332, 1398, 1481, 1483, 1562, 1609, 1690, 1719, 1825, 1882, 1912, 1913, 1959, 2023, 2095, 2106, 2134, 2195, 2202, 2242, 2276, 2364, 2463, 2618, 2623, 2710, 2734, 2913, 2950, 2951, 2987, 3030, 3130, 3254, 3267, 3278, 3365, 3447, 3476, 3551, 3636, 3683, 3761, 3762, 3763, 3804, 3893, 3927, 3950, 3981, 3991, 4010, 4029, 4068, 4122, 4195, 4254, 4298, 4322, 4327, 4330, 4355, 4514, 4550, 4559, 4565, 4774, 4816, 4844, 4886, 4892, 4915, 4966, 5042, 5067, 5132, 5139, 5197, 5217, 5318, 5366, 5368, 5376, 5455, 5494, 5540, 5543, 5582, 5606, 5635, 5663, 5672, 5676, 5749, 5771, 5884, 5955, 5958, 5977, 5996, 6007, 6054, 6157, 6207, 6225, 6471, 6576, 6672, 6684, 6727, 6812, 6822, 6883, 6911, 6913, 6937, 6959, 7086, 7141, 7155, 7170, 7190, 7221, 7235, 7419, 7441, 7489, 7545, 7673, 7704, 7718, 7726, 7763, 7

In [5]:
print(len(train_eng_rows), len(val_eng_rows), len(test_eng_rows))

344 46 7


In [7]:
import pandas as pd
import spacy
import re
from collections import defaultdict

# -------------------------
# 0. 元の CSV 読み込み
# -------------------------
train_path = r"C:\Users\neoia\研究\data\emotion\emotion_train_final.csv"
val_path   = r"C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv"
test_path  = r"C:\Users\neoia\研究\data\emotion\emotion_test_final_cleaned.csv"  # cleaned後

train = pd.read_csv(train_path)
val   = pd.read_csv(val_path)
test_cleaned = pd.read_csv(test_path)

TEXT_COL = train.columns[0]

# -------------------------
# 1. 英字を含む行 index を抽出する関数
# -------------------------
def get_english_rows(df, text_col):
    return [idx for idx, text in df[text_col].items() if re.search(r"[A-Za-z]", str(text))]


# -------------------------
# 2. 各データセットで英字行の index を取得
# -------------------------
train_eng_rows = get_english_rows(train, TEXT_COL)
val_eng_rows   = get_english_rows(val, TEXT_COL)
test_eng_rows  = get_english_rows(test_cleaned, TEXT_COL)

print("train 英字行:", train_eng_rows)
print("val 英字行:", val_eng_rows)
print("test_cleaned 英字行:", test_eng_rows)


# -------------------------
# 3. 英字行を削除
# -------------------------
train_no_eng = train.drop(index=train_eng_rows).reset_index(drop=True)
val_no_eng   = val.drop(index=val_eng_rows).reset_index(drop=True)
test_no_eng  = test_cleaned.drop(index=test_eng_rows).reset_index(drop=True)


# -------------------------
# 4. 削除したバージョンを元の CSV に保存（上書き）
# -------------------------
train_no_eng.to_csv(train_path, index=False, encoding="utf-8-sig")
val_no_eng.to_csv(val_path, index=False, encoding="utf-8-sig")
test_no_eng.to_csv(test_path, index=False, encoding="utf-8-sig")

print("保存完了！（train, val, test_cleaned を上書き保存）")


train 英字行: [5, 21, 43, 70, 159, 271, 281, 301, 312, 314, 379, 419, 427, 431, 447, 487, 489, 566, 599, 776, 835, 845, 903, 917, 942, 1032, 1081, 1176, 1243, 1252, 1275, 1312, 1321, 1332, 1398, 1481, 1483, 1562, 1609, 1690, 1719, 1825, 1882, 1912, 1913, 1959, 2023, 2095, 2106, 2134, 2195, 2202, 2242, 2276, 2364, 2463, 2618, 2623, 2710, 2734, 2913, 2950, 2951, 2987, 3030, 3130, 3254, 3267, 3278, 3365, 3447, 3476, 3551, 3636, 3683, 3761, 3762, 3763, 3804, 3893, 3927, 3950, 3981, 3991, 4010, 4029, 4068, 4122, 4195, 4254, 4298, 4322, 4327, 4330, 4355, 4514, 4550, 4559, 4565, 4774, 4816, 4844, 4886, 4892, 4915, 4966, 5042, 5067, 5132, 5139, 5197, 5217, 5318, 5366, 5368, 5376, 5455, 5494, 5540, 5543, 5582, 5606, 5635, 5663, 5672, 5676, 5749, 5771, 5884, 5955, 5958, 5977, 5996, 6007, 6054, 6157, 6207, 6225, 6471, 6576, 6672, 6684, 6727, 6812, 6822, 6883, 6911, 6913, 6937, 6959, 7086, 7141, 7155, 7170, 7190, 7221, 7235, 7419, 7441, 7489, 7545, 7673, 7704, 7718, 7726, 7763, 7803, 7815, 7819, 7895

In [8]:
import pandas as pd
import spacy

# -------------------------
# ファイルパス
# -------------------------
train_path = r"C:\Users\neoia\研究\data\emotion\emotion_train_final.csv"
val_path   = r"C:\Users\neoia\研究\data\emotion\emotion_validation_final.csv"
test_path  = r"C:\Users\neoia\研究\data\emotion\emotion_test_final_cleaned.csv"

# -------------------------
# データ読み込み
# -------------------------
train = pd.read_csv(train_path)
val   = pd.read_csv(val_path)
test_cleaned = pd.read_csv(test_path)

TEXT_COL = train.columns[0]

# -------------------------
# spaCy GiNZA
# -------------------------
nlp = spacy.load("ja_ginza")


# -------------------------
# トークン数が 3 以下の行 index を返す関数（POS 不使用）
# -------------------------
def get_short_token_rows(df, text_col):
    rows = []
    for idx, text in df[text_col].items():
        doc = nlp(str(text))
        tokens = [t.text for t in doc if not t.is_space]   # ← 空白だけ除外
        if len(tokens) <= 3:
            rows.append(idx)
    return rows


# -------------------------
# 削除行の抽出
# -------------------------
train_drop = get_short_token_rows(train, TEXT_COL)
val_drop   = get_short_token_rows(val, TEXT_COL)
test_drop  = get_short_token_rows(test_cleaned, TEXT_COL)

print("train 削除行:", train_drop)
print("val 削除行:", val_drop)
print("test_cleaned 削除行:", test_drop)


# -------------------------
# 削除してクリーンデータ作成
# -------------------------
train_final = train.drop(index=train_drop).reset_index(drop=True)
val_final   = val.drop(index=val_drop).reset_index(drop=True)
test_final  = test_cleaned.drop(index=test_drop).reset_index(drop=True)


# -------------------------
# 元の CSV を上書き保存
# -------------------------
train_final.to_csv(train_path, index=False, encoding="utf-8-sig")
val_final.to_csv(val_path, index=False, encoding="utf-8-sig")
test_final.to_csv(test_path, index=False, encoding="utf-8-sig")

print("完了：トークン数 ≤3 の行を削除して保存しました。")


train 削除行: [1854, 2214, 2437, 2868, 3383, 3951, 4043, 4669, 4819, 4882, 5740, 5938, 6153, 6199, 8495, 9115, 9172, 9568, 9997, 10128, 10142, 10323, 10948, 11201, 11299, 11579, 11995, 12353, 12696, 12907, 13026, 13403, 13497, 13523, 14166, 14536, 14568]
val 削除行: [233, 412, 630]
test_cleaned 削除行: [735]
完了：トークン数 ≤3 の行を削除して保存しました。


In [None]:
import spacy
import pandas as pd

nlp = spacy.load("ja_ginza")

def has_multiple_s(text):
    doc = nlp(text)
    roots = [token for token in doc if token.dep_ == "ROOT"]
    return len(roots) >= 2


def filter_by_single_s(input_csv, output_csv):
    df = pd.read_csv(input_csv)

    filtered_texts = []
    filtered_labels = []

    for text, label in zip(df["text_ja"], df["label"]):
        if has_multiple_s(text):
            # print("skip:", text)
            continue
        
        filtered_texts.append(text)
        filtered_labels.append(label)

    df_out = pd.DataFrame({
        "text_ja": filtered_texts,
        "label": filtered_labels
    })
    df_out.to_csv(output_csv, index=False)

    print("before:", len(df))
    print("after:", len(df_out))


# 呼び出し
filter_by_single_s("input.csv", "filtered.csv")


In [24]:
def dependency_df(doc):
    
    doc = nlp(sentence)
    df = pd.DataFrame(columns=['token','token_num', 'dependency', 'token_head', 'token_head_num'])

    for token in doc:
        df = pd.concat([df, pd.DataFrame([{
            'token': token.text,
            'token_num': token.i,
            'dependency': token.dep_,
            'token_head': token.head.text,
            'token_head_num': token.head.i
        }])], ignore_index=True)
    
    
    return df

In [25]:
def build_pregroup_from_df(df):
    n, s = Ty('n'), Ty('s')
    cups = Id()

    # 空の Ty を入れる（ここで非参照問題はなし）
    df['type'] = [Ty() for _ in range(len(df))]

    for idx, row in df.iterrows():
        i = row['token_num']
        head_i = row['token_head_num']
        dep = row['dependency']

        # 現在の type を取り出す（コピー）
        ty = df.at[idx, 'type']

        # head の type を取り出す
        head_row = df.index[df['token_num'] == head_i][0]
        head_ty = df.at[head_row, 'type']

        # --- ROOT ---
        if dep == 'ROOT':
            new_ty = ty @ s
            df.at[idx, 'type'] = new_ty
            cups @= Id(s)
        
        # --- 非ROOT ---
        else:
            # 自分に n
            new_ty = ty @ n
            df.at[idx, 'type'] = new_ty

            # head に n.r または n.l を与える
            if i < head_i:
                new_head_ty = head_ty @ n.r
                df.at[head_row, 'type'] = new_head_ty
                cups @= Cup(n, n.r)

            elif i > head_i:
                new_head_ty = head_ty @ n.l
                df.at[head_row, 'type'] = new_head_ty
                cups @= Cup(n.l, n)

    return cups, df

In [26]:
def assign_words(df):
    Words = []
    for idx, row in df.iterrows():
        word = row['token']
        ty = row['type']
        Words.append(Word(word, ty))
    
    return Words

In [27]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.font_manager as fm
'''
# 日本語フォントを設定
font_path = "C:\\Windows\\Fonts\\meiryo.ttc" 
jp_font = fm.FontProperties(fname=font_path) 
# matplotlibにフォントを設定
rcParams['font.family'] = jp_font.get_name()
'''


# Ubuntu のNotoフォント（日本語）
font_path = "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"

jp_font = fm.FontProperties(fname=font_path)

# matplotlib にフォントを設定
rcParams['font.family'] = jp_font.get_name()


# diagram作成
def create_diagram(sentence):
    doc = nlp(sentence)
    df = dependency_df(doc)
    #cleaned_df = dock_adjacent_by_head(df)
    cups, dep_with_types_df = build_pregroup_from_df(df)
    n, s = Ty('n'), Ty('s')
    types = Ty()
    
    word = assign_words(dep_with_types_df)
    
    diagram = Id().tensor(*word)
    
    for type in dep_with_types_df['type']:
        types @= type
    i = 0
    while i < len(types) - 1:
        if types[i:i + 2] == n @ n.r:  
            diagram = diagram >> types[:i] @ Cup(n, n.r) @ types[i + 2:]
            types = types[:i] @ types[i + 2:]
            i = max(0, i - 1)
        elif types[i:i + 2] == n.l @ n:
            diagram = diagram >> types[:i] @ Cup(n.l, n) @ types[i + 2:]
            types = types[:i] @ types[i + 2:]
            i = max(0, i - 1)
        else:
            i += 1
    
           
    return diagram

In [28]:
def filter_dataset_by_pregroup_csv(csv_path):
    """
    CSV(text_ja, label) を読み込み、
    - create_diagram(sentence) が成功
    - cod に含まれる s が 1 個以下
    の行のみ残す。
    出力は元ファイル名 + '_2.csv'
    """
    df = pd.read_csv(csv_path)
    required_cols = {"text_ja", "label"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"CSV must contain columns {required_cols}")
    
    diagrams = []
    failed_sentences = set()
    drop_indices = []
    s = Ty('s')
    
    def count_s(ty):
        return sum(1 for t in ty if t == s)
    
    for idx, row in df.iterrows():
        current_sentence = None  # ★ ローカル変数として定義
        
        try:
            current_sentence = str(row["text_ja"])
            
            # 空文は即除外
            if current_sentence.strip() == "":
                print(f"☒ empty sentence at index {idx}")
                drop_indices.append(idx)
                continue
            
            # ★ グローバルに sentence を設定してから create_diagram を呼ぶ
            global sentence
            sentence = current_sentence
            
            # diagram 作成
            d = create_diagram(sentence)
            
            # cod チェック
            if not hasattr(d, "cod"):
                print(f"☒ Invalid diagram object at index {idx}: {current_sentence}")
                failed_sentences.add(current_sentence)
                drop_indices.append(idx)
                continue
            
            # s カウント
            s_count = count_s(d.cod)
            if s_count > 1:
                print(f"⚠ multiple s ({s_count}) at index {idx}: {current_sentence}")
                failed_sentences.add(current_sentence)
                drop_indices.append(idx)
                continue
            
            # 成功
            diagrams.append(d)
            
        except Exception as e:
            print(f"\n☒ index {idx}: {current_sentence if current_sentence else 'UNKNOWN'}\n{e}")
            if current_sentence:
                failed_sentences.add(current_sentence)
            drop_indices.append(idx)
    
    # 失敗行を削除
    df_clean = df.drop(index=drop_indices).reset_index(drop=True)
    
    # 出力パス
    base, ext = os.path.splitext(csv_path)
    output_path = f"{base}_2{ext}"
    df_clean.to_csv(output_path, index=False)
    
    return diagrams, failed_sentences, output_path

In [10]:
import spacy
from spacy import displacy
from lambeq.backend.drawing import draw
from lambeq.backend.grammar import Cup, Id, Ty, Word
from lambeq import AtomicType, IQPAnsatz, NumpyModel, QuantumTrainer, SPSAOptimizer, Dataset
from lambeq.training.loss import CELoss4
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#import en_core_web_trf
n, s = Ty('n'), Ty('s')
# nlp = en_core_web_trf.load()
nlp = spacy.load("ja_ginza")

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
train_csv = "/home/hiro1/研究/data/emotion/emotion_train_final.csv"
dev_csv   = "/home/hiro1/研究/data/emotion/emotion_validation_final.csv"
test_csv  = "/home/hiro1/研究/data/emotion/emotion_test_final_cleaned.csv"

train_diagrams, failed_train, train_output_path= filter_dataset_by_pregroup_csv(train_csv)
dev_diagrams, failed_dev, dev_output_path = filter_dataset_by_pregroup_csv(dev_csv)
test_diagrams, failed_test, test_output_path   = filter_dataset_by_pregroup_csv(test_csv)


⚠ multiple s (2) at index 43: その朝、私は少し落胆していることに気づきました。見つけました
⚠ multiple s (2) at index 142: のドアに足を踏み入れたような気がする
⚠ multiple s (2) at index 181: 私は今、それらすべてに対してとても不快に感じています、おそらくそうでしょう
⚠ multiple s (2) at index 250: 明日学校に行くのがとても気が進まないのですが、
⚠ multiple s (2) at index 429: 私はそれが聖霊が働いていることを知っていました、そしてそれは感じました
⚠ multiple s (2) at index 633: 私は彼に何が彼をそんなに素晴らしい気分にさせるのかと尋ねました、そして彼は
⚠ multiple s (2) at index 667: 怒っている顧客に直面するのがどのような感じか私は知っています。
⚠ multiple s (2) at index 674: エリック・プリンスの自伝「民間戦士」のナレーションをさせていただいたことを光栄に思います
⚠ multiple s (2) at index 724: 私はこれらの疑問を抱き始めています私の胃は私の心を締め付けます
⚠ multiple s (2) at index 730: 私は他人に対して敵意を持っているとは感じません、ただそれだけです
⚠ multiple s (2) at index 932: 私の周りのすべてがめちゃくちゃになっているように感じます私の周りの誰もが落ち込んでいます
⚠ multiple s (2) at index 947: 私は無神論を確信しているにもかかわらず、どういうわけか私にそう感じさせずにはいられません
⚠ multiple s (2) at index 978: に教育に関する投稿を投稿するのは少し腐ったような気がします
⚠ multiple s (2) at index 989: 私は決して良い状態に立つことができないようです、そして私は
⚠ multiple s (2) at index 1021: 私は時間を取り戻しました、そして私はまだ強いと感じています。
⚠ multiple s (2) at index 

In [31]:
import pandas as pd
import spacy
from collections import defaultdict

# -------------------------
# 1. データ読み込み
# -------------------------
train = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_train_final_2.csv")
val   = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_validation_final_2.csv")
test  = pd.read_csv(r"C:\Users\neoia\研究\data\emotion\emotion_test_final_cleaned_2.csv")

# 文章が 0 カラム目にある前提
TEXT_COL = train.columns[0]

# -------------------------
# 2. GiNZA のロード
# -------------------------
nlp = spacy.load("ja_ginza")


# -------------------------
# 3. トークン → 行番号 の辞書を作る関数
# -------------------------
def build_token_dict(df):
    token_dict = defaultdict(list)

    for idx, sentence in df[TEXT_COL].items():
        doc = nlp(sentence)
        tokens = [token.text for token in doc]

        for t in tokens:
            token_dict[t].append(idx)

    return token_dict


# -------------------------
# 4. train / val / test の辞書作成
# -------------------------
train_tokens = build_token_dict(train)
val_tokens   = build_token_dict(val)
test_tokens  = build_token_dict(test)


# -------------------------
# 5. 未知語（train+val に無い test の語）を抽出
# -------------------------
train_val_vocab = set(list(train_tokens.keys()) + list(val_tokens.keys()))
test_vocab = set(test_tokens.keys())

unknown_tokens = test_vocab - train_val_vocab

print("=== テストにのみ存在する未知トークン ===")
print(unknown_tokens)

# どの行に出現したか
unknown_token_rows = {tok: test_tokens[tok] for tok in unknown_tokens}

print("=== 未知トークンの出現行 index ===")
print(unknown_token_rows)



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\neoia\\研究\\data\\emotion\\emotion_train_final_2.csv'