In [1]:
import pandas as pd
from collections import Counter
import numpy as np

In [77]:
# Compare, per (data, query) pair, the coverage_per_time_1 value from the first
# file against the best coverage_per_time found in the second file. The printed
# labels below refer to the two sides as "MM" and "MatCo".
df1 = pd.read_csv('MM_eu_32.csv')
df2 = pd.read_csv('MC_eu_gen_32.csv')

# 1. For each (data, query) group of df2 keep the row with the largest
#    coverage_per_time: sort descending, then keep the first row per group.
df2_best = df2.sort_values('coverage_per_time', ascending=False).drop_duplicates(subset=['data', 'query'])

# 2. Inner-join the two frames on (data, query). Pairs present in only one file
#    are dropped. The selected column names do not collide, so the suffixes act
#    only as a safeguard.
merged_df = pd.merge(
    df1[['data', 'query', 'engine_1', 'coverage_per_time_1']],
    df2_best[['data', 'query', 'coverage_per_time']],
    on=['data', 'query'],
    suffixes=('_file1', '_file2')
)

# 3. Vectorised row-wise comparison (replaces a Python-level iterrows loop).
file1_scores = merged_df['coverage_per_time_1']
file2_scores = merged_df['coverage_per_time']

file1_better = file1_scores > file2_scores
file2_better = file2_scores > file1_scores
# Any comparison involving NaN is False, so a row with a missing value is
# neither a win nor an equality. Only truly equal values count as ties.
equal_scores = file1_scores == file2_scores

file1_wins = int(file1_better.sum())
file2_wins = int(file2_better.sum())
ties = int(equal_scores.sum())
# Rows where at least one side is missing and no verdict is possible.
incomparable = len(merged_df) - file1_wins - file2_wins - ties

# engine_1 of every row won by file1, in row order.
file1_winning_engines = merged_df.loc[file1_better, 'engine_1'].tolist()

# 4. Report the results.
print("--- 比較結果 ---")
print(f"MMの勝利数: {file1_wins}回")
print(f"MatCoの勝利数: {file2_wins}回")
print(f"引き分け数: {ties}回")
if incomparable > 0:
    # Shown only when present, so the usual output layout is unchanged.
    print(f"比較不能 (欠損値あり): {incomparable}回")
print("-" * 20)

if file1_wins > 0:
    # Count how often each engine appears among file1's wins. Counter keeps
    # first-seen order, so the listing order follows the merged rows.
    engine_counts = Counter(file1_winning_engines)
    print("File1が勝利した時のEngineの内訳:")
    for engine, count in engine_counts.items():
        print(f"- {engine}: {count}回")

--- 比較結果 ---
MMの勝利数: 33回
MatCoの勝利数: 17回
引き分け数: 0回
--------------------
File1が勝利した時のEngineの内訳:
- MM: 10回
- MMN: 13回
- MMI: 10回


In [78]:
# NOTE(review): df1 / df2 are whatever an earlier cell loaded. To run this on
# the 'yt' files instead, reload them first:
# df1 = pd.read_csv('MM_yt_32.csv')
# df2 = pd.read_csv('MC_yt_gen_32.csv')

# Join every row of df2 (not only the best one per group) onto the three
# engine / coverage columns of df1 that share the same (data, query) key.
join_keys = ['data', 'query']
engine_cols = ['engine_1', 'engine_2', 'engine_3']
engine_coverage_cols = ['coverage_per_time_1', 'coverage_per_time_2', 'coverage_per_time_3']

merged_df = df1[join_keys + engine_cols + engine_coverage_cols].merge(
    df2[join_keys + ['coverage_per_time']],
    on=join_keys,
)

In [79]:
# Relative spread of coverage-per-time over the three engine columns plus the
# df2 column, expressed as a percentage of each row's maximum.
target_cols = ['coverage_per_time_1', 'coverage_per_time_2', 'coverage_per_time_3', 'coverage_per_time']

row_values = merged_df[target_cols]
max_vals = row_values.max(axis=1)
min_vals = row_values.min(axis=1)

# (max - min) * 100 / max per row. A NaN result (e.g. 0/0 when the row maximum
# is 0) is replaced by 0.
merged_df['coverage_variation'] = ((max_vals - min_vals) * 100 / max_vals).fillna(0)

# Mean variation over all merged rows (displayed as the cell output).
merged_df['coverage_variation'].mean()


np.float64(76.11944295133496)

In [80]:
# 対象となる列のリストを定義
target_cols = ['coverage_per_time_1', 'coverage_per_time_2', 'coverage_per_time_3']

# 各行における最大値と最小値を取得 (axis=1で行方向の操作を指定)
max_vals = merged_df[target_cols].max(axis=1)
min_vals = merged_df[target_cols].min(axis=1)

# (最大値 - 最小値) / 最大値 を計算し、新しい列 'coverage_variation' として追加
merged_df['coverage_variation'] = (max_vals - min_vals) * 100 / max_vals

# 最大値が0の場合に発生するNaN(Not a Number)を0で置換
merged_df['coverage_variation'] = merged_df['coverage_variation'].fillna(0)

# 結果の確認
merged_df['coverage_variation'].mean()


np.float64(58.16743966997133)

In [81]:
# Average of the k-th smallest coverage-per-time value per row, for each rank k.
target_cols = ['coverage_per_time_1', 'coverage_per_time_2', 'coverage_per_time_3', 'coverage_per_time']

# 1. Pull the target columns out as a NumPy array (rows x 4).
values = merged_df[target_cols].to_numpy()

# 2. Sort each row in ascending order (axis=1 sorts within a row).
sorted_values = np.sort(values, axis=1)

# 3. Average each rank position over all rows (axis=0 collapses the rows).
rank_averages = sorted_values.mean(axis=0)

# Label each rank average; rank 1 is the smallest value in a row.
results = {
    f"{rank}番目に小さい値の平均": average
    for rank, average in enumerate(rank_averages, start=1)
}

# Show the labelled averages as a one-row DataFrame.
results_df = pd.DataFrame([results])
print("各順位の平均値:")
print(results_df)


各順位の平均値:
   1番目に小さい値の平均  2番目に小さい値の平均  3番目に小さい値の平均  4番目に小さい値の平均
0     0.004743     0.014109     0.018993     0.022928
