In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

df = pd.read_csv("/Users/horiuchiminori/Desktop/研究/datasets/日本語4クラス/4emos_jp_4000.csv")
text_col = 'text' if 'text' in df.columns else df.select_dtypes(include=[object]).columns[0]
label_col = 'label' if 'label' in df.columns else [c for c in df.columns if c!=text_col][0]

X = df[text_col].astype(str).values
y = df[label_col].astype(str).values

# 日本語判定（簡易）
def is_japanese_text(samples, threshold=0.3):
    count = 0
    total = min(len(samples), 200)
    for s in samples[:total]:
        if any('\u3040' <= ch <= '\u30ff' or '\u4e00' <= ch <= '\u9fff' for ch in s):
            count += 1
    return (count / total) >= threshold

is_jp = is_japanese_text(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

if is_jp:
    vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4), max_features=2000)
else:
    vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=2000)

models = {
    "LogReg": Pipeline([('tfidf', vect), ('clf', LogisticRegression(max_iter=1000))]),
    "LinearSVC": Pipeline([('tfidf', vect), ('clf', LinearSVC())]),
    "RandomForest": Pipeline([('tfidf', vect), ('clf', RandomForestClassifier(n_jobs=1, n_estimators=100))])
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rpt = classification_report(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
    print(f"{name} Accuracy: {acc:.4f}")
    print(rpt)
    results[name] = {'model': model, 'accuracy': acc, 'report': rpt, 'cm': cm}

best_name = max(results.keys(), key=lambda k: results[k]['accuracy'])


LogReg Accuracy: 0.7799
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       215
           1       0.74      0.80      0.77       181
           2       0.77      0.71      0.74       170
           3       0.77      0.76      0.77       170

    accuracy                           0.78       736
   macro avg       0.78      0.78      0.78       736
weighted avg       0.78      0.78      0.78       736

LinearSVC Accuracy: 0.7894
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       215
           1       0.75      0.83      0.79       181
           2       0.77      0.76      0.76       170
           3       0.80      0.75      0.78       170

    accuracy                           0.79       736
   macro avg       0.79      0.79      0.79       736
weighted avg       0.79      0.79      0.79       736





RandomForest Accuracy: 0.7283
              precision    recall  f1-score   support

           0       0.75      0.76      0.76       215
           1       0.68      0.75      0.72       181
           2       0.76      0.68      0.72       170
           3       0.72      0.71      0.71       170

    accuracy                           0.73       736
   macro avg       0.73      0.73      0.73       736
weighted avg       0.73      0.73      0.73       736



In [6]:
# ===== ライブラリのインポート =====
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import re
import spacy

# ===== データの読み込み =====
df = pd.read_csv("/Users/horiuchiminori/Desktop/研究/datasets/日本語4クラス/combined_unique_texts.csv")
print(df.head())

# ===== 前処理 =====
# 日本語モデルをロード（例：ja_core_news_sm）
nlp = spacy.load("ja_core_news_sm")

def preprocess(text):
    text = re.sub(r'[0-9０-９]+', '0', text)  # 数字を0に統一
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.pos_ != "PUNCT"]
    return tokens

# トークン化
df["tokens"] = df["text"].astype(str).apply(preprocess)

# ===== Word2Vec モデル学習 =====
w2v_model = Word2Vec(
    sentences=df["tokens"],
    vector_size=100,  # ベクトル次元
    window=5,         # 文脈ウィンドウサイズ
    min_count=1,      # 出現頻度の閾値
    sg=1,             # Skip-gram（0ならCBOW）
    epochs=50
)

# ===== 各文のベクトルを作成 =====
def sentence_vector(tokens):
    vecs = []
    for w in tokens:
        if w in w2v_model.wv:
            vecs.append(w2v_model.wv[w])
    if len(vecs) == 0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vecs, axis=0)

df["vector"] = df["tokens"].apply(sentence_vector)
X = np.vstack(df["vector"].values)
y = df["label"].values

# ===== 学習・評価 =====
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("=== 分類結果 ===")
print(classification_report(y_test, y_pred))

                      text  label
0  顧客から高い評価をもらえて誇らしい気持ちです！      0
1      上司が急に出張を取りやめて驚きました。      3
2       納得できない決定に苛立ちを感じます！      1
3       会議中に新しい提案が出て驚きました。      3
4        言い訳ばかりで誠実さを感じません！      1




=== 分類結果 ===
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         3
           3       0.07      1.00      0.13         1

    accuracy                           0.06        16
   macro avg       0.02      0.25      0.03        16
weighted avg       0.00      0.06      0.01        16



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
