In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from janome.tokenizer import Tokenizer

import pickle

# Set up the morphological analyser: a single janome Tokenizer instance
# that is reused by tokenize() for every document.
t = Tokenizer()

# Morphological-analysis helper, passed to TfidfVectorizer as its tokenizer.
def tokenize(text):
    """Return the base forms of the nouns and verbs found in ``text``.

    The text is analysed with the module-level janome tokenizer ``t``.
    A token is kept only when the first field of its comma-separated
    part-of-speech string is '名詞' (noun) or '動詞' (verb); every other
    token is discarded.

    Args:
        text: The string to analyse.

    Returns:
        A list with the ``base_form`` of each kept token, in document order.
    """
    kept_pos = ('名詞', '動詞')
    lemmas = []
    for morpheme in t.tokenize(text):
        # part_of_speech is a comma-separated string; the first field is
        # the top-level category.
        major_pos = morpheme.part_of_speech.split(',')[0]
        if major_pos in kept_pos:
            lemmas.append(morpheme.base_form)
    return lemmas

In [11]:
# Load the training data from the working directory; later cells read its
# 'text' and 'label' columns.
df_train = pd.read_csv("train.csv")

In [12]:
# Features and targets taken from the training CSV loaded into df_train.
texts = df_train['text']
labels = df_train['label']  # 1 = positive, 0 = negative

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Build the pipeline: TF-IDF vectorisation followed by a linear-kernel SVM.
# token_pattern=None is passed because a custom tokenizer is supplied: the
# default regex pattern would be ignored anyway, and leaving it set makes
# scikit-learn warn that the parameter is unused.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('clf', SVC(kernel='linear'))
])

# Train the model on the training split.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.65      0.92      0.76        12
           1       0.93      0.70      0.80        20

    accuracy                           0.78        32
   macro avg       0.79      0.81      0.78        32
weighted avg       0.83      0.78      0.78        32





In [13]:
# Persist the fitted pipeline to disk with pickle.
# NOTE(review): pickle stores plain functions by reference (module + name),
# so the custom tokenize() used by the TF-IDF step is not embedded in the
# file; it presumably has to be defined under the same name wherever this
# model is unpickled -- confirm against the loading code.
model_filename = 'text_classification_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Confirmation message (user-facing text kept as-is).
print(f"モデルは{model_filename}に保存されました。")

モデルはtext_classification_model.pklに保存されました。
