# In [2]: (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
from bokeh.models import ColumnDataSource, FactorRange, Whisker, HoverTool
from bokeh.palettes import colorblind  # Import colorblind palette
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook, output_file
import pandas as pd
import glob

# Output plot inline in Jupyter Notebook: configure Bokeh so that later
# show() calls render into the notebook's cell output.
output_notebook()

def get_data(files, scale=1319.7):
    """Load result CSVs and summarise each one across replicates.

    Every file is read with pandas. Each column from the fourth onward is
    multiplied by ``scale`` and pivoted into a ``rank`` x ``replicate`` table,
    to which per-rank ``'Avg.'`` and ``'Std. dev.'`` columns are appended.

    Args:
        files: Iterable of CSV path strings. Each file must provide ``rank``
            and ``replicate`` columns; value columns start at position 3.
        scale: Factor applied to every value column. Defaults to 1319.7.
            NOTE(review): presumably converts a fraction into a count of
            top-1% items -- confirm against the data source.

    Returns:
        dict mapping each file's base name (the text before its first ``.``)
        to the pivot table built from it. If a file has several value
        columns, only the table of the last column is kept under that name.
    """
    data_list = {}
    for file in files:
        data = pd.read_csv(file)
        # Accept both Windows and POSIX separators so the key is always the
        # bare file name, whatever platform produced the path.
        name = file.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        for column in data.columns[3:]:
            data[column] *= scale
            pivot_data = data.pivot(index='rank', columns='replicate', values=column)
            # Derive both statistics from the replicate columns alone, before
            # any summary column is attached; otherwise 'Avg.' would be
            # treated as one more replicate by the standard deviation.
            avg = pivot_data.mean(axis=1)
            std = pivot_data.std(axis=1)
            # A rank observed in exactly one replicate has no spread to
            # estimate; report 0 instead of NaN so error bars stay drawable.
            std = std.where(pivot_data.count(axis=1) != 1, 0.0)
            pivot_data['Avg.'] = avg
            pivot_data['Std. dev.'] = std
            data_list[name] = pivot_data
    return data_list

def transform_data(data_dict):
    """Reduce each named table to its final row and tabulate the results.

    Args:
        data_dict: Mapping of name -> DataFrame. Every frame must have
            ``'Avg.'`` and ``'Std. dev.'`` columns and at least one row.

    Returns:
        DataFrame with one row per entry and the columns ``'Avg'`` and
        ``'Std'`` (taken from the last row of each frame), ``'Model'`` (the
        name up to its first space) and ``'Descriptor'`` (the rest of the
        name, empty when the name contains no space).
    """
    names = list(data_dict)
    final_rows = [data_dict[name].iloc[-1] for name in names]
    # split(' ') always yields at least one token, so the head is always set.
    tokens = [name.split(' ') for name in names]

    return pd.DataFrame({
        'Avg': [row['Avg.'] for row in final_rows],
        'Std': [row['Std. dev.'] for row in final_rows],
        'Model': [head for head, *_ in tokens],
        'Descriptor': [' '.join(tail) for _, *tail in tokens],
    })

# Local imports: this cell needs two Bokeh names beyond the file-level ones.
from bokeh.io import save
from bokeh.resources import CDN

# Replace '**/*.csv' with your actual path pattern if needed.
# Sorted so the bar order does not depend on filesystem enumeration order.
files = sorted(glob.glob('**/*.csv', recursive=True))
if not files:
    # Fail early with a clear message instead of an opaque error further down.
    raise FileNotFoundError("No CSV files matched '**/*.csv'; nothing to plot.")
data = get_data(files)
transformed_data = transform_data(data)

# Preparing data for Bokeh
models = list(transformed_data['Model'].unique())
descriptors = list(transformed_data['Descriptor'].unique())

# Order the rows by model, then descriptor (both in first-appearance order) so
# that bars of one model sit next to each other on the nested axis.
model_rank = {model: i for i, model in enumerate(models)}
descriptor_rank = {descriptor: i for i, descriptor in enumerate(descriptors)}
plot_data = (
    transformed_data
    .assign(
        _model_rank=transformed_data['Model'].map(model_rank),
        _descriptor_rank=transformed_data['Descriptor'].map(descriptor_rank),
    )
    .sort_values(['_model_rank', '_descriptor_rank'])
    .reset_index(drop=True)
)

# Build the factors from the rows themselves (not from a models x descriptors
# product) so every factor lines up with its own average and deviation, and
# combinations that have no data do not appear.
x = list(zip(plot_data['Model'], plot_data['Descriptor']))
avg = plot_data['Avg']
std = plot_data['Std']
lower = avg - std
upper = avg + std

source = ColumnDataSource(data=dict(
    x=x,
    avg=avg,
    lower=lower,
    upper=upper
))

# Using colorblind palette for better visualization.
# The palette is available in sizes 3..8; pick the smallest one that covers
# all descriptors (beyond 8, the extra descriptors fall back to the mapper's
# nan_color).
n_colors = min(max(len(descriptors), 3), 8)
palette = colorblind['Colorblind'][n_colors]

# Creating the figure
p = figure(x_range=FactorRange(*x), height=350,
           toolbar_location='right', output_backend="svg")

# Adding hover tool
hover = HoverTool()
hover.tooltips = [
    ("Model, Descriptor", "@x"),
    ("Avg", "@avg"),
    ("Lower Bound", "@lower"),
    ("Upper Bound", "@upper")
]
p.add_tools(hover)

# Adding bars and error bars.
# start=1, end=2 makes the colour depend on the descriptor half of each
# (model, descriptor) factor.
p.vbar(x='x', top='avg', width=0.9, source=source, line_color="white",
       fill_color=factor_cmap('x', palette=palette, factors=descriptors, start=1, end=2))

p.add_layout(Whisker(source=source, base='x', upper='upper', lower='lower', line_color='black', level='overlay'))

# Customizing the plot
p.y_range.start = 0
# Adding a 10% padding to the top of the y-axis; Series.max() skips NaN.
p.y_range.end = upper.max() * 1.1
p.yaxis.axis_label = "Number of top 1% acquired"
p.xaxis.axis_label = "Model and Descriptor"
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "gray"
p.ygrid.grid_line_dash = [6, 4]
p.ygrid.grid_line_alpha = 0.6

# Display the plot inline, then write it to a standalone HTML file.
# save() is used because calling output_file() after show() only changes the
# output mode for later show() calls and never writes this figure.
show(p)
save(p, filename='performance_by_model_and_descriptor.html', resources=CDN,
     title='Performance by model and descriptor')