---

In [14]:
# !pip install ipynb -q
# !pip install langchain -q
# # !pip install anthropic -q
# !pip install tiktoken
# !pip install nltk
# !pip install rouge-score
# !pip install evaluate
# !pip3 install fmeval --upgrade-strategy only-if-needed --force-reinstall
# !pip install transformers
# !pip install detoxify

In [15]:
# pip install seaborn

In [16]:
# pip install -U deepeval

In [17]:
# pip install -U "ray[default]"
# !pip install openpyxl

In [18]:
import pandas as pd

In [19]:
# pip install mlflow

---
# **Evaluate GEMMA**

---

In [20]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import mlflow

import mlflow
from datetime import datetime

def extract_columns(df, model_name):
    """Strip the model prefix from the headers and group metric columns by suffix.

    Args:
        df: Evaluation results frame; it is not modified (a new frame is returned).
        model_name: Model identifier; every occurrence of "<model_name> -" is
            removed from each column name.

    Returns:
        Tuple ``(df, score_columns, toxicity_columns, hallucination_columns)``:
        the frame without 'Unnamed: 0' and with cleaned headers, followed by the
        lists of cleaned column names ending in 'Score', 'toxicity' and
        'Hallucination score' respectively.
    """
    prefix = f"{model_name} -"

    # drop() returns a new frame, so the caller's frame keeps its original headers.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    renamed = [name.replace(prefix, "") for name in df.columns.to_list()]
    df.columns = renamed

    def _matching(pattern):
        # Cleaned column names for which the regex finds a match.
        return [name for name in renamed if re.search(pattern, name)]

    score_columns = _matching(r'Score$')
    toxicity_columns = _matching(r'toxicity$')
    hallucination_columns = _matching(r'Hallucination score$')

    return df, score_columns, toxicity_columns, hallucination_columns

#####################################

# Model identifier; its "<name> -" prefix is stripped from the column headers.
MODEL_NAME = 'google/gemma-7b'

# Read the evaluated Gemma results workbook.
gemma_df = pd.read_excel(
    r'google_gemma-7b-Evaluated-Toxicity-Hallucination-Summary-shuffled_transcript-F200.xlsx'
)

# Clean the headers and collect the three metric column groups.
(
    df,
    score_columns,
    toxicity_columns,
    hallucination_columns,
) = extract_columns(gemma_df, model_name=MODEL_NAME)

print("Score columns:", score_columns)
print("Toxicity columns:", toxicity_columns)
print("Hallucination columns:", hallucination_columns)


Score columns: [' Summary VS  Issue Summary - METEOR Score', ' Summary VS  Issue Summary - ROUGE Score', ' Summary VS  Issue Summary - BERT Score', ' Issue VS  Issue Summary - METEOR Score', ' Issue VS  Issue Summary - ROUGE Score', ' Issue VS  Issue Summary - BERT Score']
Toxicity columns: [' Issue - toxigen_toxicity', ' Issue - detoxify_toxicity', ' Issue - detoxify_severe_toxicity', ' Issue Summary - toxigen_toxicity', ' Issue Summary - detoxify_toxicity', ' Issue Summary - detoxify_severe_toxicity']
Hallucination columns: ['Transcript -  Summary - Hallucination score', ' Summary -  Issue Summary - Hallucination score']


In [21]:
# The experiment name carries a timestamp, so executing this cell selects (and, if
# needed, creates) a fresh MLflow experiment. Later cells in this notebook call
# mlflow.set_experiment(experiment_name) without redefining it, so they depend on
# this cell having been run first.
experiment_name_prefix ='Prompt Evaluation'
experiment_name = f"{experiment_name_prefix}@{datetime.now().strftime('%Y%m%d%H%M%S')}"
# Label used as the prefix of this model's run name.
run_name_prefix =  "Gemma 7B"

# Set experiment
mlflow.set_experiment(experiment_name)

# Get current timestamp for run name
run_name = f"{run_name_prefix}@{datetime.now().strftime('%Y%m%d%H%M%S')}"

def save_combined_violin_plot(data_dict, title, file_path):
    """Draw one violin plot per entry of ``data_dict`` on a two-column grid and save it.

    Args:
        data_dict: Mapping of subplot title -> dataset handed to ``Axes.violinplot``.
        title: Figure-level title.
        file_path: Destination passed to ``Figure.savefig``.

    Raises:
        ValueError: If ``data_dict`` is empty, since there is nothing to plot.
    """
    num_plots = len(data_dict)
    if num_plots == 0:
        raise ValueError("data_dict is empty; nothing to plot")

    num_cols = 2
    num_rows = (num_plots + num_cols - 1) // num_cols  # ceil(num_plots / num_cols)

    # squeeze=False keeps `axes` two-dimensional for every grid size, so the
    # single-row case needs no special handling.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, num_rows * 6), squeeze=False)
    fig.suptitle(title, fontsize=16)

    # Row-major 1-D view of the grid: entry i sits at row i // num_cols, column i % num_cols.
    flat_axes = axes.flatten()

    for ax, (col, data) in zip(flat_axes, data_dict.items()):
        ax.violinplot(data, showmeans=True)
        ax.set_title(col)
        ax.set_xlabel("Score")
        ax.set_ylabel("Density")

    # Remove the spare cell left over when the number of plots is odd. The flattened
    # array is 1-D, so it must be addressed with a single index (a 2-D index here
    # raises IndexError).
    for unused_ax in flat_axes[num_plots:]:
        fig.delaxes(unused_ax)

    # Operate on `fig` explicitly rather than on pyplot's "current figure".
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the suptitle
    fig.savefig(file_path)
    plt.close(fig)


def log_metrics_and_combined_plots(df, columns, column_type):
    """Log each column's mean to MLflow and attach one combined violin-plot image.

    Args:
        df: DataFrame holding the metric columns.
        columns: Names of the columns of ``df`` to summarise and plot.
        column_type: Label used in the artifact file name, the figure title and
            the progress message (e.g. "score").
    """
    violin_plots = {}
    for col in columns:
        data = df[col]

        # Only the mean is logged here; median / 90th-percentile logging is disabled,
        # so those statistics are no longer computed.
        mlflow.log_metric(f"{col}_mean", data.mean())

        # Series.mean() skips NaN by default, so drop NaN for the plot as well to keep
        # the logged mean and the plotted distribution based on the same values.
        violin_plots[col] = data.dropna().values.reshape(-1, 1)

    # Generate and log combined violin plot
    combined_violin_plot_path = f"combined_violin_{column_type}.png"
    save_combined_violin_plot(violin_plots, f"Combined Violin Plots for {column_type} columns", combined_violin_plot_path)
    mlflow.log_artifact(combined_violin_plot_path)
    print(f"Logged metrics and combined plot for {column_type} columns")

# Open a single MLflow run named `run_name` and log the three column groups inside
# it: one mean metric per column plus one combined violin-plot image per group.
with mlflow.start_run(run_name=run_name):
    log_metrics_and_combined_plots(df, score_columns, "score")
    log_metrics_and_combined_plots(df, toxicity_columns, "toxicity")
    log_metrics_and_combined_plots(df, hallucination_columns, "hallucination")

2024/06/08 09:15:49 INFO mlflow.tracking.fluent: Experiment with name 'Prompt Evaluation@20240608091549' does not exist. Creating a new experiment.


Logged metrics and combined plot for score columns
Logged metrics and combined plot for toxicity columns
Logged metrics and combined plot for hallucination columns


# Evaluate for GPT4

In [22]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import mlflow

import mlflow
from datetime import datetime

def extract_columns(df, model_name):
    """Strip the model prefix from the headers and group metric columns by suffix.

    Args:
        df: Evaluation results frame; it is not modified (a new frame is returned).
        model_name: Model identifier; every occurrence of "<model_name> -" is
            removed from each column name.

    Returns:
        Tuple ``(df, score_columns, toxicity_columns, hallucination_columns)``:
        the frame without 'Unnamed: 0' and with cleaned headers, followed by the
        lists of cleaned column names ending in 'Score', 'toxicity' and
        'Hallucination score' respectively.
    """
    prefix = f"{model_name} -"

    # drop() returns a new frame, so the caller's frame keeps its original headers.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    renamed = [name.replace(prefix, "") for name in df.columns.to_list()]
    df.columns = renamed

    def _matching(pattern):
        # Cleaned column names for which the regex finds a match.
        return [name for name in renamed if re.search(pattern, name)]

    score_columns = _matching(r'Score$')
    toxicity_columns = _matching(r'toxicity$')
    hallucination_columns = _matching(r'Hallucination score$')

    return df, score_columns, toxicity_columns, hallucination_columns

#####################################

# Model identifier; its "<name> -" prefix is stripped from the column headers.
MODEL_NAME = 'gpt-4-turbo'

# Read the evaluated GPT-4 Turbo results workbook.
gpt_df = pd.read_excel(
    r'gpt-4-turbo-Evaluated-Toxicity-Hallucination-Summary-shuffled_transcript-F200.xlsx'
)

# Clean the headers and collect the three metric column groups.
(
    df,
    score_columns,
    toxicity_columns,
    hallucination_columns,
) = extract_columns(gpt_df, model_name=MODEL_NAME)

print("Score columns:", score_columns)
print("Toxicity columns:", toxicity_columns)
print("Hallucination columns:", hallucination_columns)


Score columns: [' Summary VS  Issue Summary - METEOR Score', ' Summary VS  Issue Summary - ROUGE Score', ' Summary VS  Issue Summary - BERT Score', ' Issue VS  Issue Summary - METEOR Score', ' Issue VS  Issue Summary - ROUGE Score', ' Issue VS  Issue Summary - BERT Score']
Toxicity columns: [' Issue - toxigen_toxicity', ' Issue - detoxify_toxicity', ' Issue - detoxify_severe_toxicity', ' Issue Summary - toxigen_toxicity', ' Issue Summary - detoxify_toxicity', ' Issue Summary - detoxify_severe_toxicity']
Hallucination columns: ['Transcript -  Summary - Hallucination score', ' Summary -  Issue Summary - Hallucination score']


In [23]:
# `experiment_name` is not redefined in this cell; the value assigned in the Gemma
# section is reused, so that cell must have been executed first.

# Label for this model's run. The frame processed in this section comes from the
# gpt-4-turbo workbook, so the run must not be labelled as a Llama run.
run_name_prefix = "GPT-4 Turbo"

# Set experiment
mlflow.set_experiment(experiment_name)

# Get current timestamp for run name
run_name = f"{run_name_prefix}@{datetime.now().strftime('%Y%m%d%H%M%S')}"

def save_combined_violin_plot(data_dict, title, file_path):
    """Draw one violin plot per entry of ``data_dict`` on a two-column grid and save it.

    Args:
        data_dict: Mapping of subplot title -> dataset handed to ``Axes.violinplot``.
        title: Figure-level title.
        file_path: Destination passed to ``Figure.savefig``.

    Raises:
        ValueError: If ``data_dict`` is empty, since there is nothing to plot.
    """
    num_plots = len(data_dict)
    if num_plots == 0:
        raise ValueError("data_dict is empty; nothing to plot")

    num_cols = 2
    num_rows = (num_plots + num_cols - 1) // num_cols  # ceil(num_plots / num_cols)

    # squeeze=False keeps `axes` two-dimensional for every grid size, so the
    # single-row case needs no special handling.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, num_rows * 6), squeeze=False)
    fig.suptitle(title, fontsize=16)

    # Row-major 1-D view of the grid: entry i sits at row i // num_cols, column i % num_cols.
    flat_axes = axes.flatten()

    for ax, (col, data) in zip(flat_axes, data_dict.items()):
        ax.violinplot(data, showmeans=True)
        ax.set_title(col)
        ax.set_xlabel("Score")
        ax.set_ylabel("Density")

    # Remove the spare cell left over when the number of plots is odd. The flattened
    # array is 1-D, so it must be addressed with a single index (a 2-D index here
    # raises IndexError).
    for unused_ax in flat_axes[num_plots:]:
        fig.delaxes(unused_ax)

    # Operate on `fig` explicitly rather than on pyplot's "current figure".
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the suptitle
    fig.savefig(file_path)
    plt.close(fig)


def log_metrics_and_combined_plots(df, columns, column_type):
    """Log each column's mean to MLflow and attach one combined violin-plot image.

    Args:
        df: DataFrame holding the metric columns.
        columns: Names of the columns of ``df`` to summarise and plot.
        column_type: Label used in the artifact file name, the figure title and
            the progress message (e.g. "score").
    """
    violin_plots = {}
    for col in columns:
        data = df[col]

        # Only the mean is logged here; median / 90th-percentile logging is disabled,
        # so those statistics are no longer computed.
        mlflow.log_metric(f"{col}_mean", data.mean())

        # Series.mean() skips NaN by default, so drop NaN for the plot as well to keep
        # the logged mean and the plotted distribution based on the same values.
        violin_plots[col] = data.dropna().values.reshape(-1, 1)

    # Generate and log combined violin plot
    combined_violin_plot_path = f"combined_violin_{column_type}.png"
    save_combined_violin_plot(violin_plots, f"Combined Violin Plots for {column_type} columns", combined_violin_plot_path)
    mlflow.log_artifact(combined_violin_plot_path)
    print(f"Logged metrics and combined plot for {column_type} columns")

# Open a single MLflow run named `run_name` and log the three column groups inside
# it: one mean metric per column plus one combined violin-plot image per group.
with mlflow.start_run(run_name=run_name):
    log_metrics_and_combined_plots(df, score_columns, "score")
    log_metrics_and_combined_plots(df, toxicity_columns, "toxicity")
    log_metrics_and_combined_plots(df, hallucination_columns, "hallucination")

Logged metrics and combined plot for score columns
Logged metrics and combined plot for toxicity columns
Logged metrics and combined plot for hallucination columns


# Evaluate for MIXTRAL (Mixtral 8x7B)

In [24]:
# pip install plotly
# !pip install -U kaleido

In [25]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import mlflow

import mlflow
from datetime import datetime

def extract_columns(df, model_name):
    """Strip the model prefix from the headers and group metric columns by suffix.

    Args:
        df: Evaluation results frame; it is not modified (a new frame is returned).
        model_name: Model identifier; every occurrence of "<model_name> -" is
            removed from each column name.

    Returns:
        Tuple ``(df, score_columns, toxicity_columns, hallucination_columns)``:
        the frame without 'Unnamed: 0' and with cleaned headers, followed by the
        lists of cleaned column names ending in 'Score', 'toxicity' and
        'Hallucination score' respectively.
    """
    prefix = f"{model_name} -"

    # drop() returns a new frame, so the caller's frame keeps its original headers.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    renamed = [name.replace(prefix, "") for name in df.columns.to_list()]
    df.columns = renamed

    def _matching(pattern):
        # Cleaned column names for which the regex finds a match.
        return [name for name in renamed if re.search(pattern, name)]

    score_columns = _matching(r'Score$')
    toxicity_columns = _matching(r'toxicity$')
    hallucination_columns = _matching(r'Hallucination score$')

    return df, score_columns, toxicity_columns, hallucination_columns

#####################################

# Model identifier; its "<name> -" prefix is stripped from the column headers.
MODEL_NAME = 'mixtral_8x7b'

# Read the evaluated Mixtral results workbook.
mistral_df = pd.read_excel(
    r'mixtral_8x7b-Evaluated-Toxicity-Hallucination-Summary-shuffled_transcript-F200.xlsx'
)

# Clean the headers and collect the three metric column groups.
(
    df,
    score_columns,
    toxicity_columns,
    hallucination_columns,
) = extract_columns(mistral_df, model_name=MODEL_NAME)

print("Score columns:", score_columns)
print("Toxicity columns:", toxicity_columns)
print("Hallucination columns:", hallucination_columns)


Score columns: [' Summary VS  Issue Summary - METEOR Score', ' Summary VS  Issue Summary - ROUGE Score', ' Summary VS  Issue Summary - BERT Score', ' Issue VS  Issue Summary - METEOR Score', ' Issue VS  Issue Summary - ROUGE Score', ' Issue VS  Issue Summary - BERT Score']
Toxicity columns: [' Issue_toxigen_toxicity', ' Issue_detoxify_toxicity', ' Issue_detoxify_severe_toxicity', ' Issue - toxigen_toxicity', ' Issue - detoxify_toxicity', ' Issue - detoxify_severe_toxicity', ' Issue Summary - toxigen_toxicity', ' Issue Summary - detoxify_toxicity', ' Issue Summary - detoxify_severe_toxicity']
Hallucination columns: ['Transcript -  Summary - Hallucination score', ' Summary -  Issue Summary - Hallucination score']


In [None]:
# `experiment_name` is not redefined in this cell; the value assigned in the Gemma
# section is reused, so that cell must have been executed first.

# Label for this model's run. The workbook and MODEL_NAME used in this section are
# 'mixtral_8x7b', so the label is spelled to match that model.
run_name_prefix = "Mixtral 8x7B"

# Set experiment
mlflow.set_experiment(experiment_name)

# Get current timestamp for run name
run_name = f"{run_name_prefix}@{datetime.now().strftime('%Y%m%d%H%M%S')}"

def save_combined_violin_plot(data_dict, title, file_path):
    """Draw one violin plot per entry of ``data_dict`` on a two-column grid and save it.

    Args:
        data_dict: Mapping of subplot title -> dataset handed to ``Axes.violinplot``.
        title: Figure-level title.
        file_path: Destination passed to ``Figure.savefig``.

    Raises:
        ValueError: If ``data_dict`` is empty, since there is nothing to plot.
    """
    num_plots = len(data_dict)
    if num_plots == 0:
        raise ValueError("data_dict is empty; nothing to plot")

    num_cols = 2
    num_rows = (num_plots + num_cols - 1) // num_cols  # ceil(num_plots / num_cols)

    # squeeze=False keeps `axes` two-dimensional for every grid size, so the
    # single-row case needs no special handling.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, num_rows * 6), squeeze=False)
    fig.suptitle(title, fontsize=16)

    # Row-major 1-D view of the grid: entry i sits at row i // num_cols, column i % num_cols.
    flat_axes = axes.flatten()

    for ax, (col, data) in zip(flat_axes, data_dict.items()):
        ax.violinplot(data, showmeans=True)
        ax.set_title(col)
        ax.set_xlabel("Score")
        ax.set_ylabel("Density")

    # Remove the spare cell left over when the number of plots is odd (this section
    # has nine toxicity columns). The flattened array is 1-D, so it must be addressed
    # with a single index (a 2-D index here raises IndexError).
    for unused_ax in flat_axes[num_plots:]:
        fig.delaxes(unused_ax)

    # Operate on `fig` explicitly rather than on pyplot's "current figure".
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the suptitle
    fig.savefig(file_path)
    plt.close(fig)


def log_metrics_and_combined_plots(df, columns, column_type):
    """Log each column's mean to MLflow and attach one combined violin-plot image.

    Args:
        df: DataFrame holding the metric columns.
        columns: Names of the columns of ``df`` to summarise and plot.
        column_type: Label used in the artifact file name, the figure title and
            the progress message (e.g. "score").
    """
    violin_plots = {}
    for col in columns:
        data = df[col]

        # Only the mean is logged here; median / 90th-percentile logging is disabled,
        # so those statistics are no longer computed.
        mlflow.log_metric(f"{col}_mean", data.mean())

        # Series.mean() skips NaN by default, so drop NaN for the plot as well to keep
        # the logged mean and the plotted distribution based on the same values.
        violin_plots[col] = data.dropna().values.reshape(-1, 1)

    # Generate and log combined violin plot
    combined_violin_plot_path = f"combined_violin_{column_type}.png"
    save_combined_violin_plot(violin_plots, f"Combined Violin Plots for {column_type} columns", combined_violin_plot_path)
    mlflow.log_artifact(combined_violin_plot_path)
    print(f"Logged metrics and combined plot for {column_type} columns")

# Open a single MLflow run named `run_name` and log the three column groups inside
# it: one mean metric per column plus one combined violin-plot image per group.
with mlflow.start_run(run_name=run_name):
    log_metrics_and_combined_plots(df, score_columns, "score")
    log_metrics_and_combined_plots(df, toxicity_columns, "toxicity")
    log_metrics_and_combined_plots(df, hallucination_columns, "hallucination")

---

 - WIP: the cells below are experimental; remove them after the experiment.

 ---

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import mlflow

import mlflow
from datetime import datetime

def extract_columns(df, model_name):
    """Strip the model prefix from the headers and group metric columns by suffix.

    Args:
        df: Evaluation results frame; it is not modified (a new frame is returned).
        model_name: Model identifier; every occurrence of "<model_name> -" is
            removed from each column name.

    Returns:
        Tuple ``(df, score_columns, toxicity_columns, hallucination_columns)``:
        the frame without 'Unnamed: 0' and with cleaned headers, followed by the
        lists of cleaned column names ending in 'Score', 'toxicity' and
        'Hallucination score' respectively.
    """
    prefix = f"{model_name} -"

    # drop() returns a new frame, so the caller's frame keeps its original headers.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    renamed = [name.replace(prefix, "") for name in df.columns.to_list()]
    df.columns = renamed

    def _matching(pattern):
        # Cleaned column names for which the regex finds a match.
        return [name for name in renamed if re.search(pattern, name)]

    score_columns = _matching(r'Score$')
    toxicity_columns = _matching(r'toxicity$')
    hallucination_columns = _matching(r'Hallucination score$')

    return df, score_columns, toxicity_columns, hallucination_columns

#####################################

# Model identifier; its "<name> -" prefix is stripped from the column headers.
MODEL_NAME = 'gpt-4-turbo'

# Read the evaluated GPT-4 Turbo results workbook.
gpt_df = pd.read_excel(
    r'gpt-4-turbo-Evaluated-Toxicity-Hallucination-Summary-shuffled_transcript-F200.xlsx'
)

# Clean the headers and collect the three metric column groups.
(
    df,
    score_columns,
    toxicity_columns,
    hallucination_columns,
) = extract_columns(gpt_df, model_name=MODEL_NAME)

print("Score columns:", score_columns)
print("Toxicity columns:", toxicity_columns)
print("Hallucination columns:", hallucination_columns)


In [None]:
# `experiment_name` is not redefined in this cell; the value assigned in the Gemma
# section is reused, so that cell must have been executed first.

# Label for this run. The frame processed in this section comes from the
# gpt-4-turbo workbook, so the run must not be labelled as a Llama run.
run_name_prefix = "GPT-4 Turbo"

# Set experiment
mlflow.set_experiment(experiment_name)

# Get current timestamp for run name
run_name = f"{run_name_prefix}@{datetime.now().strftime('%Y%m%d%H%M%S')}"

def save_combined_violin_plot(data_dict, title, file_path):
    """Draw one violin plot per entry of ``data_dict`` side by side in one row and save it.

    Args:
        data_dict: Mapping of subplot title -> dataset handed to ``Axes.violinplot``.
        title: Figure-level title.
        file_path: Destination passed to ``Figure.savefig``.

    Raises:
        ValueError: If ``data_dict`` is empty, since there is nothing to plot.
    """
    num_plots = len(data_dict)
    if num_plots == 0:
        raise ValueError("data_dict is empty; nothing to plot")

    # One row with one column per plot, so no grid cell is ever left unused.
    # squeeze=False makes `axes` a 2-D array even for a single plot; without it
    # plt.subplots(1, 1) returns a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(1, num_plots, figsize=(25, 6), squeeze=False)
    fig.suptitle(title, fontsize=6)

    for ax, (col, data) in zip(axes[0], data_dict.items()):
        ax.violinplot(data, showmeans=True)
        ax.set_title(col)
        ax.set_xlabel("Score")
        ax.set_ylabel("Density")

    # Operate on `fig` explicitly rather than on pyplot's "current figure".
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the suptitle
    fig.savefig(file_path)
    plt.close(fig)


def log_metrics_and_combined_plots(df, columns, column_type):
    """Log mean, median and 90th percentile per column and attach one combined violin plot.

    Args:
        df: DataFrame holding the metric columns.
        columns: Names of the columns of ``df`` to summarise and plot.
        column_type: Label used in the artifact file name, the figure title and
            the progress message (e.g. "score").
    """
    violin_plots = {}
    for col in columns:
        data = df[col]

        # Log metrics
        mlflow.log_metric(f"{col}_mean", data.mean())
        mlflow.log_metric(f"{col}_median", data.median())
        mlflow.log_metric(f"{col}_90_percentile", data.quantile(0.9))

        # The statistics above skip NaN by default, so drop NaN for the plot as well
        # to keep the logged values and the plotted distribution based on the same data.
        violin_plots[col] = data.dropna().values.reshape(-1, 1)

    # Generate and log combined violin plot
    combined_violin_plot_path = f"combined_violin_{column_type}.png"
    save_combined_violin_plot(violin_plots, f"Combined Violin Plots for {column_type} columns", combined_violin_plot_path)
    mlflow.log_artifact(combined_violin_plot_path)
    print(f"Logged metrics and combined plot for {column_type} columns")

# Open a single MLflow run named `run_name` and log the three column groups inside
# it: mean, median and 90th-percentile metrics per column plus one combined
# violin-plot image per group.
with mlflow.start_run(run_name=run_name):
    log_metrics_and_combined_plots(df, score_columns, "score")
    log_metrics_and_combined_plots(df, toxicity_columns, "toxicity")
    log_metrics_and_combined_plots(df, hallucination_columns, "hallucination")