In [62]:
import os
from typing import List, Dict, Any, Optional
import asyncio
import warnings

import dotenv
import pandas as pd
from enum import Enum
from pydantic import BaseModel, Field
from pydantic_ai import Agent, BinaryContent, ModelRetry
from pydantic_ai.messages import ModelRequest, ModelResponse
from pydantic_ai.models.gemini import GeminiModel, GeminiModelSettings
from pydantic_ai.providers.google_vertex import GoogleVertexProvider

# Silence every UserWarning for the rest of the session. The filter is
# process-wide, so it also hides UserWarnings raised by imported libraries.
warnings.filterwarnings("ignore", category=UserWarning)

# Load environment variables from .env file
# The path is relative ('../.env'), so it is resolved against the current
# working directory of the kernel. generate_agent() reads GCP_LOCATION through
# os.getenv; presumably that variable is defined in this file -- TODO confirm.
dotenv_path = os.path.join('..', '.env')
dotenv.load_dotenv(dotenv_path)


True

In [63]:
def generate_agent(
    model_id: str,
    output_type: Optional[BaseModel | str] = str,
    retries: int = 5,
    temperature: float = 0.2,
    max_tokens: int = 20000,
    timeout: int = 60,
) -> Agent:
    """Create an ``Agent`` that talks to a Gemini model through Vertex AI.

    Args:
        model_id: Model identifier handed to ``GeminiModel``.
        output_type: Forwarded unchanged to ``Agent(output_type=...)``;
            defaults to ``str``.
        retries: Forwarded to ``Agent(retries=...)``.
        temperature: Sampling temperature stored in the model settings.
        max_tokens: ``max_tokens`` value stored in the model settings.
        timeout: ``timeout`` value stored in the model settings.

    Returns:
        The configured ``Agent`` instance.
    """
    # The Vertex region comes from the GCP_LOCATION environment variable;
    # os.getenv yields None when the variable is not set.
    vertex_provider = GoogleVertexProvider(region=os.getenv("GCP_LOCATION"))

    return Agent(
        GeminiModel(model_id, provider=vertex_provider),
        retries=retries,
        output_type=output_type,
        model_settings=GeminiModelSettings(
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
        ),
    )


async def generate_content_with_agent(
    model_id: str,
    query: list[str | BinaryContent],
    history: Optional[list[ModelRequest | ModelResponse]] = None,
    output_type: Optional[BaseModel | str] = str,
    retries: int = 3,
    temperature: float = 0.2,
    max_tokens: int = 20000,
    timeout: int = 60,
) -> tuple[Any, int, int, int]:
    """Run one query through a freshly built agent and report token usage.

    Args:
        model_id: Model identifier forwarded to ``generate_agent``.
        query: Prompt parts passed to ``agent.run``.
        history: Earlier messages passed as ``message_history``. ``None``
            (the default) means an empty history; a new list is created per
            call so no list object is shared between calls.
        output_type: Forwarded to ``generate_agent``; defaults to ``str``.
        retries: Forwarded to ``generate_agent``.
        temperature: Forwarded to ``generate_agent``.
        max_tokens: Forwarded to ``generate_agent``.
        timeout: Forwarded to ``generate_agent``.

    Returns:
        ``(output, input_tokens, output_tokens, think_tokens)`` where
        ``output`` is ``response.output`` and ``think_tokens`` is the part of
        the reported total that is neither request nor response tokens.
        Counts the usage object reports as ``None`` are treated as 0, and
        ``think_tokens`` is never negative.
    """
    # Brief pause as a crude guard against API rate limits.
    # NOTE: when many calls are started together their sleeps overlap, so this
    # delays each request rather than spacing requests apart.
    await asyncio.sleep(0.5)

    agent = generate_agent(
        model_id,
        output_type,
        retries=retries,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=timeout,
    )

    response = await agent.run(
        query,
        message_history=history if history is not None else [],
    )
    output = response.output

    usage = response.usage()
    # Usage counters may be missing (None); fall back to 0 so the subtraction
    # below cannot raise and throw away an otherwise successful response.
    input_tokens = usage.request_tokens or 0
    output_tokens = usage.response_tokens or 0
    total_tokens = usage.total_tokens or 0
    # Whatever the total contains beyond request + response is attributed to
    # thinking; clamp at 0 so an incomplete total never yields a negative count.
    think_tokens = max(total_tokens - input_tokens - output_tokens, 0)

    return output, input_tokens, output_tokens, think_tokens

def calculate_cost(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    think_tokens: Optional[int] = None,
) -> tuple[float, float, float]:
    """Price a single request, split into input, output and thinking parts.

    Rates are expressed per one million tokens and chosen by substring match
    on ``model_name``. For "gemini-2.5-flash" models the output rate is the
    higher one whenever any think tokens were used; think tokens are always
    billed at the selected output rate.

    Args:
        model_name: Model identifier; must contain "gemini-2.0-flash" or
            "gemini-2.5-flash".
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.
        think_tokens: Number of thinking tokens; ``None`` counts as 0.

    Returns:
        ``(input_cost, output_cost, think_cost)``.

    Raises:
        ValueError: If ``model_name`` matches no known model family.
    """
    thinking = 0 if think_tokens is None else think_tokens

    if "gemini-2.0-flash" in model_name:
        input_rate, output_rate = 0.15, 0.60
    elif "gemini-2.5-flash" in model_name:
        input_rate = 0.15
        # Any thinking at all switches the whole response to the higher rate.
        output_rate = 3.5 if thinking != 0 else 0.60
    else:
        raise ValueError(f"Unsupported model name: {model_name}")

    # Rates are per million tokens, hence the 1e-6 factor per token.
    per_token = 1e-6
    return (
        input_tokens * input_rate * per_token,
        output_tokens * output_rate * per_token,
        thinking * output_rate * per_token,
    )


class SentimentEnum(str, Enum):
    # Sentiment label for a comment (type of EvalOutput.sentiment).
    # Because the enum also subclasses str, every member compares equal to
    # its Japanese label string.
    POSITIVE = 'ポジティブ'  # "positive"
    NEUTRAL = '中立'  # "neutral"
    NEGATIVE = 'ネガティブ'  # "negative"

class CategoryEnum(str, Enum):
    # Topic category for a comment (type of EvalOutput.category).
    # Members are str instances whose value is the Japanese label.
    LECTURE_CONTENT = '講義内容'  # "lecture content"
    LECTURE_MATERIAL = '講義資料'  # "lecture materials"
    OPERATION = '運営'  # "operations / administration"
    OTHER = 'その他'  # "other"

class ImportanceEnum(str, Enum):
    # Importance level for a comment (type of EvalOutput.importance).
    # Members are str instances whose value is the Japanese label; the
    # labels are the same three strings used by CommonalityEnum.
    HIGH = '高'  # "high"
    MEDIUM = '中'  # "medium"
    LOW = '低'  # "low"

class CommonalityEnum(str, Enum):
    # Commonality level for a comment (type of EvalOutput.commonality).
    # Members are str instances whose value is the Japanese label; the
    # labels are the same three strings used by ImportanceEnum.
    HIGH = '高'  # "high"
    MEDIUM = '中'  # "medium"
    LOW = '低'  # "low"


class EvalOutput(BaseModel):
    # Structured classification of one comment: exactly one label on each of
    # the four dimensions below.
    # NOTE(review): documented with comments instead of a class docstring on
    # the assumption that a docstring could end up in the schema generated
    # from this model -- confirm before converting to a docstring.
    # The Field descriptions are runtime strings and must stay in Japanese;
    # English glosses are given in the trailing comments.
    sentiment: SentimentEnum = Field(description="コメントに対する感情の分類")  # "sentiment classification of the comment"
    category: CategoryEnum = Field(description="コメントに対するカテゴリの分類")  # "category classification of the comment"
    importance: ImportanceEnum = Field(description="コメントに対する重要度の分類")  # "importance classification of the comment"
    commonality: CommonalityEnum = Field(description="コメントに対する共通性の分類")  # "commonality classification of the comment"

In [64]:
data_days = ["day1", "day2", "day3"]
models = ["gemini-2.0-flash", "gemini-2.5-flash-preview-05-20"]
data_dir = os.path.join("..", "data")

for day in data_days:
    for model in models:
        print(f"Processing {day} with model {model}...")
        act_df = pd.read_csv(os.path.join(data_dir, f"{day}_アンケート.csv"))
        comments = act_df['コメント'].tolist()

        tasks = [
            generate_content_with_agent(
                model_id=model,
                query=[comment],
                output_type=EvalOutput,
                retries=3,
                temperature=0.2,
                max_tokens=2000,
                timeout=60,
            )
            for comment in comments
        ]

        semaphore = asyncio.Semaphore(50)
        async with semaphore:
            results = await asyncio.gather(*tasks, return_exceptions=True)

        # Process results
        processed_results = []
        for result in results:
            if isinstance(result, Exception):
                print(f"Error: {result}")
                result_dict = {
                    "sentiment": None,
                    "category": None,
                    "importance": None,
                    "commonality": None,
                    "total_cost": 0,
                    "is_error": True,
                }
            else:
                output, input_tokens, output_tokens, think_tokens = result

                input_cost, output_cost, think_cost = calculate_cost(
                    model_name=model,
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    think_tokens=think_tokens,
                )
                total_cost = input_cost + output_cost + think_cost

                result_dict = output.model_dump()
                for k, v in result_dict.items():
                        result_dict[k] = v.value
                result_dict.update({
                    "total_cost": total_cost,
                    "is_error": False,
                })

            processed_results.append(result_dict)

        # evaluation
        results_df = pd.DataFrame(processed_results)

        # is_errorのTrueを数えて、results_dfから削除
        error_count = results_df['is_error'].sum()
        print(f"Error Rate: {error_count / len(results_df):.2%}")
        act_df = act_df[~results_df['is_error']]
        results_df = results_df[~results_df['is_error']]

        # 感情のaccuracyを計算
        sentiment_pred = results_df['sentiment'].map({
            SentimentEnum.POSITIVE: 1,
            SentimentEnum.NEUTRAL: 0,
            SentimentEnum.NEGATIVE: -1
        })

        sentiment_act = act_df["点数"].map({
            5: 1,
            4: 1,
            3: 0,
            2: -1,
            1: -1,
        })
        sentiment_accuracy = (sentiment_pred == sentiment_act).mean()
        print(f"Sentiment Accuracy: {sentiment_accuracy:.2%}")

        # カテゴリのaccuracyを計算
        category_accuracy = (results_df['category'] == act_df['カテゴリ']).mean()
        print(f"Category Accuracy: {category_accuracy:.2%}")
        # カテゴリごとのaccuracyを表示
        category_accuracy_by_group = results_df.groupby('category').apply(
            lambda x: (x['category'] == act_df.loc[x.index, 'カテゴリ']).mean()
        )
        print("Category Accuracy by Group:")
        print(category_accuracy_by_group)

        # 感情、重要度、共通性の掛け算からスコアを算出、点数との相関を計算
        importance_map = {
            ImportanceEnum.HIGH: 3,
            ImportanceEnum.MEDIUM: 2,
            ImportanceEnum.LOW: 1
        }
        commonality_map = {
            CommonalityEnum.HIGH: 3,
            CommonalityEnum.MEDIUM: 2,
            CommonalityEnum.LOW: 1
        }
        results_df['score'] = sentiment_pred * results_df['importance'].map(importance_map) * results_df['commonality'].map(commonality_map)
        # 1-5にスケールを合わせる
        results_df['score'] = (results_df['score'] + 9) / 18 * 4 + 1

        score_corr = results_df['score'].corr(act_df['点数'])
        print(f"Score Correlation: {score_corr:.2f}")

        # コストを表示
        total_cost = results_df['total_cost'].sum()
        print(f"Total Cost: ${total_cost:.4f}")

        print("-" * 20)


Processing day1 with model gemini-2.0-flash...
Error Rate: 0.00%
Sentiment Accuracy: 81.47%
Category Accuracy: 59.91%
Category Accuracy by Group:
category
その他     0.387755
講義内容    0.559701
講義資料    0.933333
運営      0.894737
dtype: float64
Score Correlation: 0.80
Total Cost: $0.0045
--------------------
Processing day1 with model gemini-2.5-flash-preview-05-20...


  category_accuracy_by_group = results_df.groupby('category').apply(


Error Rate: 0.00%
Sentiment Accuracy: 85.34%
Category Accuracy: 62.07%
Category Accuracy by Group:
category
その他     0.000000
講義内容    0.562500
講義資料    0.928571
運営      0.760000
dtype: float64
Score Correlation: 0.79
Total Cost: $0.2738
--------------------
Processing day2 with model gemini-2.0-flash...


  category_accuracy_by_group = results_df.groupby('category').apply(


Error Rate: 0.00%
Sentiment Accuracy: 79.82%
Category Accuracy: 62.16%
Category Accuracy by Group:
category
その他     0.386792
講義内容    0.650273
講義資料    0.907692
運営      0.634146
dtype: float64
Score Correlation: 0.81
Total Cost: $0.0083
--------------------
Processing day2 with model gemini-2.5-flash-preview-05-20...


  category_accuracy_by_group = results_df.groupby('category').apply(


Error Rate: 0.00%
Sentiment Accuracy: 81.65%
Category Accuracy: 62.16%
Category Accuracy by Group:
category
その他     0.259259
講義内容    0.595506
講義資料    0.855072
運営      0.630137
dtype: float64
Score Correlation: 0.79
Total Cost: $0.4792
--------------------
Processing day3 with model gemini-2.0-flash...


  category_accuracy_by_group = results_df.groupby('category').apply(


Error Rate: 0.00%
Sentiment Accuracy: 82.62%
Category Accuracy: 64.38%
Category Accuracy by Group:
category
その他     0.295455
講義内容    0.655319
講義資料    0.952381
運営      0.750000
dtype: float64
Score Correlation: 0.86
Total Cost: $0.0091
--------------------
Processing day3 with model gemini-2.5-flash-preview-05-20...


  category_accuracy_by_group = results_df.groupby('category').apply(


Error Rate: 0.00%
Sentiment Accuracy: 83.05%
Category Accuracy: 70.82%
Category Accuracy by Group:
category
その他     0.125000
講義内容    0.668852
講義資料    0.938462
運営      0.787500
dtype: float64
Score Correlation: 0.81
Total Cost: $0.5423
--------------------


  category_accuracy_by_group = results_df.groupby('category').apply(
