In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from janome.tokenizer import Tokenizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,roc_curve, auc
import lightgbm as lgb
import matplotlib.pyplot as plt

In [24]:
requests_df = pd.read_csv("requests.csv")
subjects_items_df = pd.read_csv("subjects_items.csv")
ocr_results_df = pd.read_csv("ocr_results.csv")

In [25]:
merged = pd.merge(
        requests_df,
        ocr_results_df[['id', 'ocr_client_name', 'ocr_payment_amount', 'ocr_document_date']],
        left_on='ocr_id',
        right_on='id',
        suffixes=('', '_ocr')
    )

In [26]:
df = pd.merge(
        merged,
        subjects_items_df,
        left_on='subject_items_id',
        right_on='id',
        suffixes=('', '_item')
    )

In [27]:
final_cols = [
    'name',
    'ocr_client_name',
    'ocr_payment_amount',
    'ocr_document_date',
    'subject_items_id'
    ]
df = df[final_cols].copy()

In [28]:
# 欠損値のある行を削除
df.dropna(subset=['ocr_document_date'], inplace=True)
df.dropna(subset=['ocr_client_name'], inplace=True)

In [29]:
#数値変換でよく使わあれる正規化というメソッドですが、今回の場合ocr_payment_amountは綺麗な数字になってるため不要
# df['ocr_payment_amount'] = pd.to_numeric(
#         df['ocr_payment_amount'].astype(str)
#           .str.replace(r'[^0-9.]', '', regex=True),
#         errors='coerce'
#     )
# df['ocr_payment_amount'] = df['ocr_payment_amount'].fillna(0)

In [30]:
# 日付を何曜日何月分に変換
df['ocr_document_date'] = pd.to_datetime(df['ocr_document_date'], errors='coerce')
df['dayofweek'] = df['ocr_document_date'].dt.dayofweek
df['month'] = df['ocr_document_date'].dt.month

In [31]:
# 店舗名を単語ごとに区切る
def tokenize_japanese(text):
    return [token.surface for token in Tokenizer().tokenize(text)]

In [None]:
results = {}

print("start")


preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(
                tokenizer=tokenize_japanese,
                token_pattern=None,
                ngram_range=(1, 2),
                max_features=5000
            ), 'ocr_client_name'),
            ('numeric', StandardScaler(), ['ocr_payment_amount']),
            ('datetime', OneHotEncoder(), ['dayofweek', 'month'])
        ],
        remainder='drop'
            )
    
sub_df = df.copy()

    
X = sub_df[['ocr_client_name', 'ocr_payment_amount', 'dayofweek', 'month']]
y = sub_df['subject_items_id']
    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


    # モデルを作る
model = lgb.LGBMClassifier(
    random_state=42,
    learning_rate=0.05,
    objective='multiclass',
    min_child_samples=5,
    min_split_gain=0.0
        )
pipe = Pipeline([('preprocessor', preprocessor), ('classifier', model)])
pipe.fit(X_train, y_train)
    
    # 結果を入れ
trained_results = {
        'pipe': pipe,
        'X_test': X_test,
        'y_test': y_test,
        'sub_df': sub_df
        }

print("\n finish")

In [16]:
# 評価
print("\n start")

for tenant_id, results in trained_results.items():
    print(f"\nTenant ID: {tenant_id} ")

    
    pipe   = results['pipe']
    X_test = results['X_test']
    y_test = results['y_test']
    

    # 1. Classification Reportを作る
    print("\n--- Classification Report ---")
    
    y_pred = pipe.predict(X_test)

    class_labels = pipe.named_steps['classifier'].classes_

    # classification_reportをプリント
    print(classification_report(y_test, y_pred, labels=class_labels, target_names=[str(c) for c in class_labels]))


    # 2.ROC AUC曲線を描く
    print("\n--- ROC AUC　---")

    y_score = pipe.predict_proba(X_test)
    y_test_bin = label_binarize(y_test, classes=class_labels)

    fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
    roc_auc = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], 'k--')
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(f'ROC Curve for Tenant {tenant_id}')
    
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

print("\n finish")


 start

 finish
