# TextGen

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def analyze_csv(file_path):
    # Read CSV file
    df = pd.read_csv(file_path)

    # Extract model name from file name
    model_name = os.path.basename(file_path).split('_evaluations.csv')[0]

    # Calculate statistics
    score_columns = ['Creativity Score', 'Coherence Score', 'Fluency Score', 'Relevance Score', 'Engagement Score', 'Overall Score']
    stats = df[score_columns].agg(['mean', 'median', 'min', 'max'])

    # Create distribution plot
    plt.figure(figsize=(12, 6))
    for column in score_columns:
        sns.kdeplot(df[column], shade=True, label=column)

    plt.title(f'Score Distribution for {model_name}')
    plt.xlabel('Score')
    plt.ylabel('Density')
    plt.legend()
    plt.savefig(f'{model_name}_distribution.png')
    plt.close()

    # Create bar plot
    plt.figure(figsize=(12, 6))
    stats.loc['mean'].plot(kind='bar')
    plt.title(f'Mean Scores for {model_name}')
    plt.xlabel('Criteria')
    plt.ylabel('Mean Score')
    plt.savefig(f'{model_name}_mean_scores.png')
    plt.close()

    return stats

# List of CSV files
csv_files = [
    'LN_noNorm_evaluations.csv',
    'RMSN_noNorm_evaluations.csv',
    'LN_AttnOnly_evaluations.csv',
    'RMSN_AttnOnly_evaluations.csv',
    'LN_FFNonly_evaluations.csv',
    'RMSN_FFNonly_evaluations.csv',
    'LN_baseModel_evaluations.csv',
    'RMSN_baseModel_evaluations.csv',
]

# Analyze each CSV file
all_stats = {}
for file in csv_files:
    print(f"Analyzing {file}...")
    stats = analyze_csv(file)
    all_stats[file] = stats
    print(f"Statistics for {file}:")
    print(stats)
    print("\n")

# Create a comparative bar plot for mean scores across all models
plt.figure(figsize=(15, 8))
mean_scores = pd.DataFrame({model: stats.loc['mean'] for model, stats in all_stats.items()})
mean_scores.plot(kind='bar', figsize=(15, 8))
plt.title('Comparison of Mean Scores Across All Models')
plt.xlabel('Criteria')
plt.ylabel('Mean Score')
plt.legend(title='Models', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('all_models_comparison.png')
plt.close()

print("Analysis complete. Check the generated PNG files for visualizations.")


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Analyzing LN_noNorm_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Statistics for LN_noNorm_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0            1.0              1.0   
max                  1.0              1.0            1.0              1.0   

        Engagement Score  Overall Score  
mean                 1.0            1.0  
median               1.0            1.0  
min                  1.0            1.0  
max                  1.0            1.0  


Analyzing RMSN_noNorm_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Statistics for RMSN_noNorm_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0            1.0              1.0   
max                  1.0              1.0            1.0              1.0   

        Engagement Score  Overall Score  
mean                 1.0            1.0  
median               1.0            1.0  
min                  1.0            1.0  
max                  1.0            1.0  


Analyzing LN_AttnOnly_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Statistics for LN_AttnOnly_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0            1.0              1.0   
max                  1.0              1.0            1.0              1.0   

        Engagement Score  Overall Score  
mean                 1.0            1.0  
median               1.0            1.0  
min                  1.0            1.0  
max                  1.0            1.0  


Analyzing RMSN_AttnOnly_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Statistics for RMSN_AttnOnly_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0            1.0              1.0   
max                  1.0              1.0            1.0              1.0   

        Engagement Score  Overall Score  
mean                 1.0            1.0  
median               1.0            1.0  
min                  1.0            1.0  
max                  1.0            1.0  


Analyzing LN_FFNonly_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)
  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecat

Statistics for LN_FFNonly_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0            1.0              1.0   
max                  1.0              1.0            1.0              1.0   

        Engagement Score  Overall Score  
mean                 1.0            1.0  
median               1.0            1.0  
min                  1.0            1.0  
max                  1.0            1.0  


Analyzing RMSN_FFNonly_evaluations.csv...
Statistics for RMSN_FFNonly_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 1.0              1.0            1.0              1.0   
median               1.0              1.0            1.0              1.0   
min                  1.0              1.0


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `

Statistics for LN_baseModel_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                 2.2              1.8           2.52              2.4   
median               2.0              2.0           3.00              2.0   
min                  2.0              1.0           1.00              1.0   
max                  4.0              3.0           4.00              4.0   

        Engagement Score  Overall Score  
mean                1.92          2.168  
median              2.00          2.200  
min                 1.00          1.400  
max                 4.00          3.600  


Analyzing RMSN_baseModel_evaluations.csv...



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[column], shade=True, label=column)

`shade` is now deprecated in favor of `

Statistics for RMSN_baseModel_evaluations.csv:
        Creativity Score  Coherence Score  Fluency Score  Relevance Score  \
mean                2.24              1.8           2.68             2.12   
median              2.00              2.0           3.00             2.00   
min                 2.00              1.0           2.00             1.00   
max                 4.00              3.0           4.00             4.00   

        Engagement Score  Overall Score  
mean                1.92          2.152  
median              2.00          2.200  
min                 1.00          1.400  
max                 4.00          3.600  


Analysis complete. Check the generated PNG files for visualizations.


<Figure size 1500x800 with 0 Axes>

In [2]:
import pandas as pd


# List of evaluation CSV files
evaluation_files = [
    'LN_noNorm_evaluations.csv',
    'RMSN_noNorm_evaluations.csv',
    'LN_AttnOnly_evaluations.csv',
    'RMSN_AttnOnly_evaluations.csv',
    'LN_FFNonly_evaluations.csv',
    'RMSN_FFNonly_evaluations.csv',
    'LN_baseModel_evaluations.csv',
    'RMSN_baseModel_evaluations.csv',
]

# List of generated text CSV files
generated_text_files = [
    'LN_noNorm_generated_texts.csv',
    'RMSN_noNorm_generated_texts.csv',
    'LN_AttnOnly_generated_texts.csv',
    'RMSN_AttnOnly_generated_texts.csv',
    'LN_FFNonly_generated_texts.csv',
    'RMSN_FFNonly_generated_texts.csv',
    'LN_baseModel_generated_texts.csv',
    'RMSN_baseModel_generated_texts.csv',
]


# Function to read and process each CSV file
def process_csv(eval_file, gen_file):
    eval_df = pd.read_csv(eval_file)
    gen_df = pd.read_csv(gen_file)

    # Ensure the dataframes have the same number of rows
    min_rows = min(len(eval_df), len(gen_df))
    eval_df = eval_df.iloc[:min_rows]
    gen_df = gen_df.iloc[:min_rows]

    # Merge the dataframes
    merged_df = pd.merge(eval_df, gen_df[['prompt', 'generated_text']], left_index=True, right_index=True)

    return merged_df

# Read and combine all CSV files
all_data = pd.concat([process_csv(eval_file, gen_file) for eval_file, gen_file in zip(evaluation_files, generated_text_files)], ignore_index=True)

# Arrange columns as specified
columns_order = ['model_name', 'prompt', 'generated_text', 'Overall Score', 'Overall Feedback', 'Comments on Columns']
available_columns = [col for col in columns_order if col in all_data.columns]
all_data = all_data[available_columns]

# Truncate generated_text to 1000 characters
all_data['generated_text'] = all_data['generated_text'].str[:1000]

# Save the entire dataframe to a single CSV file
output_file = 'combined_data_truncated.csv'
all_data.to_csv(output_file, index=False)
print(f"All data has been saved to '{output_file}' with generated text truncated to 1000 characters.")

# Print the number of rows and columns in the saved file
print(f"The file contains {len(all_data)} rows and {len(all_data.columns)} columns.")
print("Columns:", ', '.join(all_data.columns))

All data has been saved to 'combined_data_truncated.csv' with generated text truncated to 1000 characters.
The file contains 200 rows and 6 columns.
Columns: model_name, prompt, generated_text, Overall Score, Overall Feedback, Comments on Columns


# New

In [3]:
!pip install pingouin
!pip install scikit_posthocs
!pip install krippendorff

Collecting pingouin
  Downloading pingouin-0.5.5-py3-none-any.whl.metadata (19 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading pingouin-0.5.5-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.4/204.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.6.0 pingouin-0.5.5
Collecting scikit_posthocs
  Downloading scikit_posthocs-0.9.1-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.9.1-py3-none-any.whl (32 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.9.1
Collecting krippendorff
  Downloading krippendorff-0.8.0-py3-none-any.whl.metadata (2.8 kB)
Downloading krippendorff-0.8.0-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully ins

In [5]:
import pandas as pd
import re

# List of evaluation CSV files
evaluation_files = [
    'LN_noNorm_evaluations.csv',
    'RMSN_noNorm_evaluations.csv',
    'LN_AttnOnly_evaluations.csv',
    'RMSN_AttnOnly_evaluations.csv',
    'LN_FFNonly_evaluations.csv',
    'RMSN_FFNonly_evaluations.csv',
    'LN_baseModel_evaluations.csv',
    'RMSN_baseModel_evaluations.csv',
]

# List of generated text CSV files
generated_text_files = [
    'LN_noNorm_generated_texts.csv',
    'RMSN_noNorm_generated_texts.csv',
    'LN_AttnOnly_generated_texts.csv',
    'RMSN_AttnOnly_generated_texts.csv',
    'LN_FFNonly_generated_texts.csv',
    'RMSN_FFNonly_generated_texts.csv',
    'LN_baseModel_generated_texts.csv',
    'RMSN_baseModel_generated_texts.csv',
]

# Function to determine the task based on the file name
def determine_task(file_name):
    if 'summarization' in file_name.lower():
        return 'Summarization'
    elif 'squad' in file_name.lower():
        return 'Question Answering'
    else:
        return 'Text Generation'

# Function to read and process each CSV file
def process_csv(eval_file, gen_file):
    eval_df = pd.read_csv(eval_file)
    gen_df = pd.read_csv(gen_file)

    # Ensure the dataframes have the same number of rows
    min_rows = min(len(eval_df), len(gen_df))
    eval_df = eval_df.iloc[:min_rows]
    gen_df = gen_df.iloc[:min_rows]

    # Merge the dataframes
    merged_df = pd.merge(eval_df, gen_df[['prompt', 'generated_text']], left_index=True, right_index=True)

    # Add model_name column
    model_name = re.search(r'(LN|RMSN)_(.*?)_', eval_file).group(0).rstrip('_')
    merged_df['model_name'] = model_name

    # Add task column
    task = determine_task(eval_file)
    merged_df['task'] = task

    # Ensure all required columns are present
    required_columns = ['Creativity Score', 'Coherence Score', 'Fluency Score', 'Relevance Score', 'Engagement Score']
    for col in required_columns:
        if col not in merged_df.columns:
            merged_df[col] = None  # or some default value

    return merged_df

# Read and combine all CSV files
all_data = pd.concat([process_csv(eval_file, gen_file) for eval_file, gen_file in zip(evaluation_files, generated_text_files)], ignore_index=True)

# Arrange columns as specified
columns_order = ['model_name', 'task', 'prompt', 'generated_text', 'Overall Score', 'Creativity Score', 'Coherence Score',
                 'Fluency Score', 'Relevance Score', 'Engagement Score', 'Overall Feedback', 'Comments on Columns']
available_columns = [col for col in columns_order if col in all_data.columns]
all_data = all_data[available_columns]

# Truncate generated_text to 1000 characters
all_data['generated_text'] = all_data['generated_text'].str[:1000]

# Save the entire dataframe to a single CSV file
output_file = 'combined_data_truncated.csv'
all_data.to_csv(output_file, index=False)
print(f"All data has been saved to '{output_file}' with generated text truncated to 1000 characters.")

# Print the number of rows and columns in the saved file
print(f"The file contains {len(all_data)} rows and {len(all_data.columns)} columns.")
print("Columns:", ', '.join(all_data.columns))

# Print unique values in the 'task' column
print("Unique tasks:", all_data['task'].unique())

All data has been saved to 'combined_data_truncated.csv' with generated text truncated to 1000 characters.
The file contains 200 rows and 12 columns.
Columns: model_name, task, prompt, generated_text, Overall Score, Creativity Score, Coherence Score, Fluency Score, Relevance Score, Engagement Score, Overall Feedback, Comments on Columns
Unique tasks: ['Text Generation']


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Set random seed for reproducibility
np.random.seed(42)

def load_data(file_path):
    """Load data from CSV file."""
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows of data")
    print(f"Columns: {', '.join(df.columns)}")
    return df

def calculate_statistics(df):
    """Calculate basic statistics for each score column."""
    score_columns = ['Overall Score', 'Creativity Score', 'Coherence Score',
                     'Fluency Score', 'Relevance Score', 'Engagement Score']
    return df.groupby(['model_name', 'task'])[score_columns].agg(['mean', 'std'])

def perform_anova(df, score_column):
    """Perform one-way ANOVA for a given score column."""
    models = df['model_name'].unique()
    data = [df[df['model_name'] == model][score_column] for model in models]
    f_value, p_value = stats.f_oneway(*data)
    return f_value, p_value

def create_collated_boxplots(df, task, output_dir):
    """Create collated boxplots for all score columns in a single figure."""
    score_columns = ['Overall Score', 'Creativity Score', 'Coherence Score',
                     'Fluency Score', 'Relevance Score', 'Engagement Score']

    fig, axes = plt.subplots(2, 3, figsize=(20, 15))
    fig.suptitle(f'Score Distributions for {task}', fontsize=16)

    for i, score in enumerate(score_columns):
        row = i // 3
        col = i % 3
        sns.boxplot(x='model_name', y=score, data=df, ax=axes[row, col])
        axes[row, col].set_title(score)
        axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{task}_collated_boxplots.png'))
    plt.close()

def main():
    # Set up output directory
    output_dir = 'analysis_results'
    os.makedirs(output_dir, exist_ok=True)

    # Load data
    df = load_data('combined_data_truncated.csv')

    # Calculate statistics
    stats_df = calculate_statistics(df)
    stats_df.to_csv(os.path.join(output_dir, 'descriptive_statistics.csv'))

    # Perform analysis for each task and score
    tasks = df['task'].unique()
    score_columns = ['Overall Score', 'Creativity Score', 'Coherence Score',
                     'Fluency Score', 'Relevance Score', 'Engagement Score']

    for task in tasks:
        task_df = df[df['task'] == task]

        # Create collated boxplots
        create_collated_boxplots(task_df, task, output_dir)

        # Perform ANOVA and save results
        anova_results = []
        for score in score_columns:
            f_value, p_value = perform_anova(task_df, score)
            anova_results.append({
                'Task': task,
                'Metric': score,
                'F-value': f_value,
                'p-value': p_value
            })

        # Save ANOVA results to CSV
        anova_df = pd.DataFrame(anova_results)
        anova_df.to_csv(os.path.join(output_dir, f'{task}_anova_results.csv'), index=False)

    print("Analysis complete. Results saved in 'analysis_results' directory.")

if __name__ == "__main__":
    main()

Loaded 200 rows of data
Columns: model_name, task, prompt, generated_text, Overall Score, Creativity Score, Coherence Score, Fluency Score, Relevance Score, Engagement Score, Overall Feedback, Comments on Columns


  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')
  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')
  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')
  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')
  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')
  axes[row, col].set_xticklabels(axes[row, col].get_xticklabels(), rotation=45, ha='right')


Analysis complete. Results saved in 'analysis_results' directory.


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import kruskal
from scikit_posthocs import posthoc_dunn
import warnings
import os

warnings.filterwarnings('ignore')

# Create a directory for CSV outputs
output_dir = 'text_generation_analysis_results'
os.makedirs(output_dir, exist_ok=True)

# Load the data
df = pd.read_csv('combined_data_truncated.csv')

print("Columns in the CSV file:")
print(df.columns)

# Ensure 'model_name' is treated as a categorical variable
df['model_name'] = df['model_name'].astype('category')

# Define score columns
score_columns = ['Overall Score', 'Creativity Score', 'Coherence Score', 'Fluency Score', 'Relevance Score', 'Engagement Score']

print(f"\nUsing score columns: {score_columns}")

# 1. Effect Size Calculation
def calculate_effect_size(df, metric):
    try:
        f_value, _ = stats.f_oneway(*[group[metric] for name, group in df.groupby('model_name')])
        return f_value / (f_value + df.groupby('model_name').size().iloc[0] - 1)
    except Exception as e:
        print(f"Error calculating effect size for {metric}: {e}")
        return np.nan

effect_sizes = {metric: calculate_effect_size(df, metric) for metric in score_columns}
effect_sizes_df = pd.DataFrame.from_dict(effect_sizes, orient='index', columns=['Effect Size'])
effect_sizes_df.to_csv(os.path.join(output_dir, 'effect_sizes.csv'))

# 2. Post-hoc Tests
def perform_tukey_hsd(df, metric):
    try:
        tukey = pairwise_tukeyhsd(df[metric], df['model_name'])
        return pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
    except Exception as e:
        print(f"Error performing Tukey HSD for {metric}: {e}")
        return pd.DataFrame()

tukey_results = {metric: perform_tukey_hsd(df, metric) for metric in score_columns}
for metric, result in tukey_results.items():
    result.to_csv(os.path.join(output_dir, f'tukey_hsd_{metric.replace(" ", "_")}.csv'), index=False)

# 3. Correlation Analysis
correlation_matrix = df[score_columns].corr()
correlation_matrix.to_csv(os.path.join(output_dir, 'correlation_matrix.csv'))

# 4. Principal Component Analysis
scaler = StandardScaler()
pca = PCA()
pca_result = pca.fit_transform(scaler.fit_transform(df[score_columns]))
pca_df = pd.DataFrame({
    'Principal Component': range(1, len(pca.explained_variance_ratio_) + 1),
    'Explained Variance Ratio': pca.explained_variance_ratio_,
    'Cumulative Explained Variance Ratio': np.cumsum(pca.explained_variance_ratio_)
})
pca_df.to_csv(os.path.join(output_dir, 'pca_results.csv'), index=False)

# 5. Non-parametric Tests
def perform_kruskal_dunn(df, metric):
    try:
        kruskal_result = kruskal(*[group[metric].values for name, group in df.groupby('model_name')])
        if kruskal_result.pvalue < 0.05:
            dunn_result = posthoc_dunn(df, val_col=metric, group_col='model_name', p_adjust='bonferroni')
            return pd.DataFrame({'statistic': [kruskal_result.statistic], 'p-value': [kruskal_result.pvalue]}), dunn_result
        return pd.DataFrame({'statistic': [kruskal_result.statistic], 'p-value': [kruskal_result.pvalue]}), None
    except Exception as e:
        print(f"Error performing Kruskal-Wallis and Dunn's test for {metric}: {e}")
        return None, None

kruskal_dunn_results = {metric: perform_kruskal_dunn(df, metric) for metric in score_columns}
for metric, (kruskal_result, dunn_result) in kruskal_dunn_results.items():
    if kruskal_result is not None:
        kruskal_result.to_csv(os.path.join(output_dir, f'kruskal_wallis_{metric.replace(" ", "_")}.csv'), index=False)
    if dunn_result is not None:
        dunn_result.to_csv(os.path.join(output_dir, f'dunn_test_{metric.replace(" ", "_")}.csv'))

# 6. Visualizations
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Text Generation Metrics')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
plt.close()

# Violin Plots
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    sns.violinplot(x='model_name', y=metric, data=df)
    plt.title(metric)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('')
    if i % 3 != 1:
        plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'violin_plots.png'), bbox_inches='tight')
plt.close()

# Radar Chart
def radar_chart(df, metrics):
    means = df.groupby('model_name')[metrics].mean()
    angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False)
    means = pd.concat([means, means.iloc[:, :1]], axis=1)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(14, 10), subplot_kw=dict(projection='polar'))
    for model in means.index:
        values = means.loc[model].values
        ax.plot(angles, values, 'o-', linewidth=2, label=model)
        ax.fill(angles, values, alpha=0.25)
    ax.set_thetagrids(angles[:-1] * 180/np.pi, metrics)
    ax.set_ylim(0, 5)  # Assuming scores are between 0 and 5
    plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))
    plt.title("Model Performance Across Text Generation Metrics")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'radar_chart.png'), bbox_inches='tight')
    plt.close()

radar_chart(df, score_columns)

# Distribution of Scores
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    for model in df['model_name'].unique():
        sns.kdeplot(data=df[df['model_name'] == model], x=metric, label=model)
    plt.title(f'Distribution of {metric}')
    plt.xlabel('Score')
    plt.ylabel('Density')
    if i == 3:  # Place legend outside the plots
        plt.legend(title='Model Name', bbox_to_anchor=(1.05, 1), loc='upper left')
    else:
        plt.legend([])  # Remove individual legends
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'score_distributions.png'), bbox_inches='tight')
plt.close()

# Boxplots
plt.figure(figsize=(20, 15))
for i, metric in enumerate(score_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='model_name', y=metric, data=df)
    plt.title(metric)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('')
    if i % 3 != 1:
        plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'boxplots.png'), bbox_inches='tight')
plt.close()

# 7. Basic Descriptive Statistics
descriptive_stats = df.groupby('model_name')[score_columns].agg(['mean', 'std', 'min', 'max'])
descriptive_stats.to_csv(os.path.join(output_dir, 'descriptive_statistics.csv'))

print(f"\nAnalysis complete. Results saved in CSV files in the '{output_dir}' directory.")
print("Check the generated PNG files for visualizations.")

Columns in the CSV file:
Index(['model_name', 'task', 'prompt', 'generated_text', 'Overall Score',
       'Creativity Score', 'Coherence Score', 'Fluency Score',
       'Relevance Score', 'Engagement Score', 'Overall Feedback',
       'Comments on Columns'],
      dtype='object')

Using score columns: ['Overall Score', 'Creativity Score', 'Coherence Score', 'Fluency Score', 'Relevance Score', 'Engagement Score']

Analysis complete. Results saved in CSV files in the 'text_generation_analysis_results' directory.
Check the generated PNG files for visualizations.
