In [20]:
import matplotlib.pyplot as plt
import pickle
import numpy as np
from scipy import stats
import matplotlib.colors as mcolors

In [21]:
# Result-file identifiers: the single-pass baseline followed by MoA iterations 1-8.
models = [
    f"qwen2.5_3b-instruct @ {variant}"
    for variant in ["1Pass", *(f"MoA {step}" for step in range(1, 9))]
]

In [22]:
def _trimmed_mean(values):
    """Return the mean of ``values`` after dropping the single lowest and highest entry.

    With fewer than three values nothing would be left after trimming, so the
    plain mean is returned instead; an empty input yields ``nan``.
    """
    ordered = np.sort(np.asarray(values, dtype=float))
    if ordered.size == 0:
        return float("nan")
    if ordered.size < 3:
        return float(ordered.mean())
    return float(ordered[1:-1].mean())


def _draw_regression(ax, positions, values, fmt, label):
    """Fit a least-squares line through (positions, values), draw it on ``ax`` and return the fit."""
    fit = stats.linregress(positions, values)
    x_smooth = np.linspace(positions[0], positions[-1], 100)
    ax.plot(x_smooth, fit.slope * x_smooth + fit.intercept, fmt, label=label)
    return fit


def _report_regression(heading, fit):
    """Print slope, intercept, R-squared and p-value of a ``linregress`` result."""
    print(heading)
    print(f"Slope: {fit.slope:.4f}")
    print(f"Intercept: {fit.intercept:.4f}")
    print(f"R-squared: {fit.rvalue**2:.4f}")
    print(f"P-value: {fit.pvalue:.4f}")


def plot_moa_performance(models, names, output_file, title, lower_y=6.5, upper_y=9.5):
    """Plot trimmed-mean ratings per MoA model and save the figure to ``output_file``.

    For every entry of ``models`` the file ``./gdrive_results/EVAL_{model}.pkl`` is
    unpickled and the ``"ratings"`` of each item under ``"detailed"`` are read
    (keys ``accuracy``, ``clarity``, ``relevancy``, ``style`` and ``avg``).
    "Quality" is the per-item mean of accuracy, clarity and relevancy. Each
    metric is aggregated per model with a trimmed mean (lowest and highest
    value dropped).

    Models whose identifier contains ``"1Pass"`` are treated as the baseline:
    they are not plotted as points, and the first baseline's average is drawn
    as a dotted horizontal line. All other models are plotted in input order,
    with linear regressions over the average and quality series whose
    statistics are printed.

    Args:
        models: Model identifiers used to build the pickle file names.
        names: Display labels, one per entry of ``models`` (same order).
        output_file: Path the figure is written to.
        title: Figure title.
        lower_y: Lower limit of the rating axis.
        upper_y: Upper limit of the rating axis.

    Raises:
        ValueError: If ``names`` and ``models`` differ in length.
    """
    if len(names) != len(models):
        raise ValueError(
            f"names and models must have the same length (got {len(names)} and {len(models)})"
        )

    metric_keys = ("accuracy", "clarity", "relevancy", "style")
    moa_scores = {key: [] for key in (*metric_keys, "quality", "avg")}
    moa_labels = []
    baseline_avgs = []

    for model, name in zip(models, names):
        # NOTE: pickle.load can execute arbitrary code - only use trusted result files.
        with open(f"./gdrive_results/EVAL_{model}.pkl", "rb") as f:
            results = pickle.load(f)
        ratings = [entry["ratings"] for entry in results["detailed"]]

        avg_score = _trimmed_mean([r["avg"] for r in ratings])
        if "1Pass" in model:
            # Baseline is identified by its marker, not by its position in the list.
            baseline_avgs.append(avg_score)
            continue

        moa_labels.append(name)
        for key in metric_keys:
            moa_scores[key].append(_trimmed_mean([r[key] for r in ratings]))
        moa_scores["quality"].append(_trimmed_mean(
            [np.mean([r["accuracy"], r["clarity"], r["relevancy"]]) for r in ratings]
        ))
        moa_scores["avg"].append(avg_score)

    fig, ax = plt.subplots(figsize=(12, 8), facecolor='#f0f0f0')

    # Darken the tab10 colours by scaling the HSV value channel.
    palette = []
    for rgba in plt.cm.tab10(np.linspace(0, 1, 6)):
        hsv = mcolors.rgb_to_hsv(rgba[:3])
        hsv[2] *= 0.7
        palette.append(mcolors.hsv_to_rgb(hsv))

    # Plot against explicit numeric positions so the regression lines (numeric x)
    # and the labelled series share one unambiguous axis.
    positions = np.arange(len(moa_labels))
    faint = dict(alpha=0.3, linewidth=1)
    ax.plot(positions, moa_scores["accuracy"], marker='o', label='Accuracy', color=palette[0], **faint)
    ax.plot(positions, moa_scores["clarity"], marker='s', label='Clarity', color=palette[1], **faint)
    ax.plot(positions, moa_scores["relevancy"], marker='^', label='Relevancy', color=palette[2], **faint)
    ax.plot(positions, moa_scores["style"], marker='D', label='Style', color=palette[3], **faint)
    ax.plot(positions, moa_scores["quality"], marker='x', label='Quality', color=palette[4], linewidth=2)
    ax.plot(positions, moa_scores["avg"], marker='*', label='Average', color="blue", linewidth=2, markersize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(moa_labels)

    # Plot 1-pass average as a dotted blue line (only when a baseline exists)
    if baseline_avgs:
        ax.axhline(y=baseline_avgs[0], color='blue', linestyle=':', linewidth=2, label='1-Pass Average')

    # A line needs at least two points; skip the regressions otherwise.
    if len(positions) >= 2:
        fit_avg = _draw_regression(ax, positions, moa_scores["avg"], 'r--', 'Average Regression')
        fit_quality = _draw_regression(ax, positions, moa_scores["quality"], 'g--', 'Quality Regression')

        _report_regression("Average Regression:", fit_avg)
        _report_regression("\nQuality Regression:", fit_quality)

    ax.set_ylabel('Rating', fontsize=14, fontweight='bold')
    ax.set_xlabel('Model', fontsize=14, fontweight='bold')
    ax.set_title(title, fontsize=18, fontweight='bold', pad=20)

    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_axisbelow(True)

    legend = ax.legend(title='Metrics', loc='best', fontsize=10)
    legend.get_title().set_fontsize('12')
    legend.get_title().set_fontweight('bold')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.5)
    ax.spines['bottom'].set_linewidth(1.5)

    ax.set_ylim(lower_y, upper_y)
    fig.tight_layout()
    # Operate on this figure explicitly instead of whatever pyplot considers current.
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)

# Render the line chart: display labels are the baseline plus MoA iterations 1-8.
names = ["1Pass", *(f"MoA {step}" for step in range(1, 9))]
plot_moa_performance(
    models,
    names,
    output_file="./graphs/Qwen25_MoA_Performance.png",
    title="Qwen2.5-3b Performance Across MoA Iterations",
    lower_y=6.5,
    upper_y=9.5,
)


Average Regression:
Slope: 0.0020
Intercept: 7.5273
R-squared: 0.0025
P-value: 0.9063

Quality Regression:
Slope: -0.0048
Intercept: 7.2199
R-squared: 0.0136
P-value: 0.7830


![](./graphs/Qwen25_MoA_Performance.png)

In [24]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from scipy import stats

def plot_moa_performance(models, names, output_file, title, lower_y=6.5, upper_y=9.5):
    """Plot a fitted normal curve of the average ratings for each model, one panel per model.

    For every entry of ``models`` the file ``./gdrive_results/EVAL_{model}.pkl`` is
    unpickled and the ``["ratings"]["avg"]`` value of each item under
    ``"detailed"`` is collected. Each panel shows the normal density defined by
    the sample mean and (population) standard deviation, the raw ratings as red
    points on the baseline, and a text box with mean and std. Panels are laid
    out three per row; the figure is saved to ``output_file``.

    NOTE(review): this notebook defines another function with the same name in
    an earlier cell; running this cell replaces that definition.

    Args:
        models: Model identifiers used to build the pickle file names.
        names: Panel titles, paired with ``models`` in order (pairing stops at
            the shorter of the two).
        output_file: Path the figure is written to.
        title: Figure-level title.
        lower_y: Lower bound of the rating range shown on each panel's x-axis.
        upper_y: Upper bound of the rating range shown on each panel's x-axis.

    Raises:
        ValueError: If there is no model/name pair to plot.
    """
    avg_ratings = []
    for model in models:
        # NOTE: pickle.load can execute arbitrary code - only use trusted result files.
        with open(f"./gdrive_results/EVAL_{model}.pkl", "rb") as f:
            results = pickle.load(f)
        avg_ratings.append([entry["ratings"]["avg"] for entry in results["detailed"]])

    panels = list(zip(avg_ratings, names))
    if not panels:
        raise ValueError("at least one model/name pair is required to draw the distribution grid")

    # Size the grid from the number of panels (3x3 at 15x15 inches for nine models).
    n_cols = 3
    n_rows = (len(panels) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows),
                             facecolor='#f0f0f0', squeeze=False)
    axes = axes.flatten()

    # The evaluation grid is the same for every panel.
    x = np.linspace(lower_y, upper_y, 100)

    for ax, (ratings, name) in zip(axes, panels):
        # Calculate mean and standard deviation (nan when there is no data)
        if len(ratings):
            mean = np.mean(ratings)
            std = np.std(ratings)
        else:
            mean = std = float("nan")

        if std > 0:
            # Plot the normal distribution and fill the area under the curve
            y = stats.norm.pdf(x, mean, std)
            ax.plot(x, y, 'b-', linewidth=2)
            ax.fill_between(x, y, color='lightblue', alpha=0.7)
        elif np.isfinite(mean):
            # Zero spread: the density is degenerate, so mark the single value instead.
            ax.axvline(mean, color='b', linewidth=2)

        # Plot the actual data points
        ax.scatter(ratings, np.zeros_like(ratings), color='red', alpha=0.5)

        ax.set_title(f"{name}", fontsize=12, fontweight='bold')
        ax.set_xlabel('Average Rating', fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.set_xlim(lower_y, upper_y)

        # Remove top and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Add mean and std to the plot
        ax.text(0.05, 0.95, f'Mean: {mean:.2f}\nStd: {std:.2f}',
                transform=ax.transAxes, verticalalignment='top',
                fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Remove any unused subplots
    for unused_ax in axes[len(panels):]:
        fig.delaxes(unused_ax)

    fig.suptitle(title, fontsize=18, fontweight='bold', y=1.02)
    fig.tight_layout()
    # Operate on this figure explicitly instead of whatever pyplot considers current.
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)

# Render the per-model distribution grid: labels are the baseline plus MoA iterations 1-8.
names = ["1Pass", *(f"MoA {step}" for step in range(1, 9))]
plot_moa_performance(
    models,
    names,
    output_file="./graphs/Qwen25_MoA_Performance_Distribution.png",
    title="Qwen2.5-3b Performance Distribution Across MoA Iterations",
    lower_y=6.5,
    upper_y=9.5,
)


![](./graphs/Qwen25_MoA_Performance_Distribution.png)