In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json

# Paths
FOLDER_PATH = '../GotEnzymes2/'  # GotEnzymes2 Database, which is not provided here
GROUPS_JSON_PATH = '../Data/analysis/categorized_species_by_ogt.json'

# Short group name used in the figure -> category label used as a key in the
# grouping JSON. Insertion order is the order in which the groups are reported.
GROUP_LABELS = {
    'Psychrophile': "Psychrophilic (< 15°C)",
    'Mesophile': "Mesophilic (15°C - 45°C)",
    'Thermophile': "Thermophilic (45°C - 80°C)",
    'Hyperthermophile': "Hyperthermophilic (> 80°C)",
}

# Load species grouping.
# Only the file read/parse is inside the try, so unrelated errors are not
# masked. Failures abort via SystemExit (message on stderr, non-zero exit
# status) instead of the interactive-only `exit()` helper, which would report
# success (status 0) to the caller.
try:
    with open(GROUPS_JSON_PATH, 'r', encoding='utf-8') as f:
        categorized_data = json.load(f)
except FileNotFoundError:
    raise SystemExit(f"Error: Grouping file '{GROUPS_JSON_PATH}' not found.") from None
except json.JSONDecodeError as err:
    raise SystemExit(
        f"Error: Grouping file '{GROUPS_JSON_PATH}' is not valid JSON: {err}"
    ) from None

# A category that is absent from the JSON yields an empty species list.
groups = {
    name: categorized_data.get(label, [])
    for name, label in GROUP_LABELS.items()
}

for name, org_list in groups.items():
    print(f"- {name}: {len(org_list)} species")

# Read and combine data
# For every species code in every group, read '<FOLDER_PATH>/<code>.txt' as a
# header-less tab-separated table and keep its last two columns as Topt and
# Tm, tagged with the group name. One small frame per species is collected in
# `all_data_list`.
all_data_list = []
missing_codes = []  # species whose data file does not exist

for group_name, organisms in groups.items():
    if not organisms:
        continue

    for org_code in organisms:
        file_path = os.path.join(FOLDER_PATH, org_code + '.txt')

        try:
            df = pd.read_csv(file_path, sep='\t', header=None)
            # The last two columns are selected by position; a file with
            # fewer than two columns raises here and is reported below.
            temp_df = df.iloc[:, [-2, -1]].copy()
        except FileNotFoundError:
            # Missing files are tolerated, but recorded so that the gap in
            # coverage is reported instead of passing silently.
            missing_codes.append(org_code)
        except Exception as e:
            # Deliberately broad: one unreadable or malformed file must not
            # abort the whole run.
            print(f"Error processing '{file_path}': {e}")
        else:
            temp_df.columns = ['Topt', 'Tm']
            temp_df['Group'] = group_name
            all_data_list.append(temp_df)

if missing_codes:
    print(
        f"Warning: {len(missing_codes)} species had no data file in "
        f"'{FOLDER_PATH}' and were skipped."
    )

# Combine the per-species tables, clean them, and draw one violin panel per
# temperature column (Topt, Tm), then save the figure as a PDF.
if not all_data_list:
    print("Error: No data files were successfully read.")
else:
    combined_df = pd.concat(all_data_list, ignore_index=True)

    # Clean data: entries that are not numeric become NaN and are dropped.
    for column in ('Topt', 'Tm'):
        combined_df[column] = pd.to_numeric(combined_df[column], errors='coerce')
    combined_df.dropna(subset=['Topt', 'Tm'], inplace=True)

    print(f"\nData processing completed. Total valid enzyme entries: {len(combined_df)}")

    # Set plotting style
    plt.rcParams.update({
        'font.size': 7,
        'font.family': 'Arial',
        'mathtext.fontset': 'custom',
        'mathtext.rm': 'Arial',
        'mathtext.it': 'Arial:italic',
        'mathtext.bf': 'Arial:bold',
        'pdf.fonttype': 42,
        'ps.fonttype': 42
    })

    fig, axes = plt.subplots(1, 2, figsize=(7, 3), sharey=True)

    # Define group order and color palette
    group_order = ['Psychrophile', 'Mesophile', 'Thermophile', 'Hyperthermophile']
    palette = {
        "Psychrophile": "#4682B4",
        "Mesophile": "#71B4E6",
        "Thermophile": "#f9c295",
        "Hyperthermophile": "#BF0001"
    }

    # One entry per panel: (axes, data column, panel title, y-axis label).
    # Only the left panel carries a y label because the y axis is shared.
    panels = (
        (axes[0], 'Topt', r'$\mathit{T}_\mathrm{opt}$', 'Temperature (°C)'),
        (axes[1], 'Tm', r'$\mathit{T}_\mathrm{m}$', ''),
    )

    for ax, column, title, ylabel in panels:
        sns.violinplot(
            ax=ax,
            data=combined_df,
            x='Group',
            y=column,
            order=group_order,
            palette=palette,
            inner='box',
            linewidth=0.75,
        )
        ax.set_title(title)
        ax.set_xlabel('')
        ax.set_ylabel(ylabel)
        # Style the existing tick labels in place. Re-assigning them with
        # set_xticklabels() makes Matplotlib warn when the axis does not use
        # a fixed tick locator.
        plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
        # Adjust tick label size
        ax.tick_params(axis='both', which='major', labelsize=7)

    # Save and show
    plt.tight_layout(pad=0.5)
    output_filename = "pdf/fig5B.pdf"
    # savefig() does not create missing directories, so make sure the target
    # folder exists before writing.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_filename, dpi=600)
    print(f"Figure saved as '{output_filename}'")

    plt.show()