In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.libqsturng import psturng
from itertools import combinations
import sqlite3
from ast import literal_eval
from pathlib import Path
from pylatex import Document, NoEscape
from IPython.display import display, Latex

# Name of the results-table column that the ranking cell selects in its SQL
# query and ranks the models by.
METRIC = "r2_score" 
# Directory (relative to the kernel's working directory) that receives the
# generated .tex tables and rendered PDFs; it is created here if missing.
output_dir = Path("./cache/analysis/")
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def _escape_latex_header(name):
    """Backslash-escape raw ``#``, ``%`` and ``&`` in a column label; leave already-escaped ones alone."""
    if not isinstance(name, str):
        return name
    pieces = []
    previous = ""
    for char in name:
        if char in "#%&" and previous != "\\":
            pieces.append("\\")
        pieces.append(char)
        previous = char
    return "".join(pieces)


def _bold_optimum(column: pd.Series, spec: str, use_min: bool) -> pd.Series:
    """Format every value of ``column`` with ``spec`` and wrap the optimum in ``\\textbf{}``."""
    # Compute the optimum once per column instead of once per cell.
    best = column.min() if use_min else column.max()
    return column.map(
        lambda x: "\\textbf{" + format(x, spec) + "}" if x == best else format(x, spec)
    )


def df_to_booktabs(df: pd.DataFrame, renaming_conventions: dict, title: str, optimization: str=None, caption: str = "", label: str = "") -> str:
    """
    Convert a Pandas DataFrame to a LaTeX table using the booktabs style.

    The columns are reordered in descending lexicographic order of their
    names, the tabular produced by ``DataFrame.to_latex`` is wrapped in a
    ``table`` float, written to ``output_dir / ("tab_" + title + ".tex")``
    and returned.

    Args:
        df: Frame to render. All formatting happens on a copy.
        renaming_conventions: Currently unused by this function; kept so that
            existing callers keep working.
        title: Suffix of the output file name (``tab_<title>.tex``).
        optimization: ``None`` leaves the values untouched (floats are then
            rendered through ``float_format="%.2f"``). ``"min"`` bolds the
            smallest value of every float/int column; any other value bolds
            the largest. Floats are formatted with two decimals, ints with none.
        caption: Optional ``\\caption`` text; omitted when empty.
        label: Optional ``\\label`` key; omitted when empty.

    Returns:
        The complete LaTeX source of the table float.

    Note:
        Cell contents are emitted verbatim (``escape=False``) so that the
        ``\\textbf`` markup survives; text cells must therefore already be
        valid LaTeX. Column headers only get raw ``#``, ``%`` and ``&``
        escaped, because those can never appear unescaped in a tabular cell.
    """
    # Explicit copy: the column assignments below must not target a slice of
    # the caller's frame (avoids SettingWithCopyWarning).
    df = df[sorted(df.columns, reverse=True)].copy()
    float_cols = df.select_dtypes(include=['float', 'float64', 'float32']).columns
    int_cols = df.select_dtypes(include=['int', 'int64', 'int32']).columns
    if optimization is not None:
        use_min = optimization == "min"
        # Per-column assignment also copes with a frame that has no float or
        # no int columns at all.
        for columns, spec in ((float_cols, ".2f"), (int_cols, ".0f")):
            for name in columns:
                df[name] = _bold_optimum(df[name], spec, use_min)

    # Headers are written unescaped by to_latex(escape=False); a label such as
    # "#Failures" would otherwise make the generated table fail to compile.
    df = df.rename(columns=_escape_latex_header)

    latex_str = "\\begin{table}[t!]\n"
    latex_str += "\\centering\n"

    # Convert DataFrame to LaTeX with booktabs
    latex_str += df.to_latex(index=False, escape=False,
                             header=True,
                             column_format="l" + "r" * (len(df.columns) - 1),
                             longtable=False,
                             bold_rows=False,
                             multicolumn=True,
                             multicolumn_format='c',
                             float_format="%.2f",
                             )

    if caption:
        latex_str += f"\\caption{{{caption}}}\n"
    if label:
        latex_str += f"\\label{{{label}}}\n"
    latex_str += "\\end{table}"

    # Fixed encoding so the file does not depend on the platform default
    # code page (UTF-8 is what current LaTeX expects as input).
    with open(output_dir/("tab_"+title+".tex"), "w", encoding="utf-8") as file:
        file.write(latex_str)

    return latex_str

def render_latex_table(latex_code, output_file="output"):
    """
    Embed a LaTeX snippet in a minimal pylatex document and compile it to PDF.

    The document loads ``booktabs`` in its preamble, starts with an
    unnumbered "Example Table" section and then contains ``latex_code``
    inserted verbatim (no escaping).

    Args:
        latex_code: Raw LaTeX source, e.g. a complete ``table`` float.
        output_file: Base name of the output; it is joined onto ``output_dir``
            and handed to ``Document.generate_pdf`` with ``clean_tex=True``.
    """
    document = Document()
    document.preamble.append(NoEscape(r"\usepackage{booktabs}"))

    # Body: heading first, then the caller's table, both as raw LaTeX.
    for fragment in (r"\section*{Example Table}", latex_code):
        document.append(NoEscape(fragment))

    target = output_dir / output_file
    document.generate_pdf(target, clean_tex=True)
    print(f"PDF generated: {output_file}")

In [27]:
# Average Ranking of each method
# Load results from db
conn = sqlite3.connect('../results.db')
try:
    df = pd.read_sql_query(f"SELECT data_config_hash, data_id, model, {METRIC}  FROM results", conn)
finally:
    # Release the SQLite handle even if the query raises, so re-running the
    # cell does not accumulate open connections to the database file.
    conn.close()

# The metric column is parsed with literal_eval and averaged per row
# (presumably a stringified list of per-run scores -- confirm against the writer).
df[METRIC] = df[METRIC].map(lambda raw: np.mean(literal_eval(raw)))

# all unique values
print(len(df))
ds_hashs = df["data_config_hash"].unique()
ds_ids = df["data_id"].unique()
model_names = df["model"].unique()

# full cartesian product
# NOTE(review): this pairs every data_config_hash with every data_id; if some
# pairs were never scheduled they are still counted as failures below -- confirm.
full_index = pd.MultiIndex.from_product(
    [ds_hashs, ds_ids, model_names],
    names=["data_config_hash", "data_id", "model"]
)

# set index to those columns
df = df.set_index(["data_config_hash", "data_id", "model"])

# reindex and fill missing: a combination without a stored result gets -inf,
# which ranks it last and marks it as a failure
df = df.reindex(full_index, fill_value=-np.inf).reset_index()
df["failure"] = df[METRIC] == -np.inf
print(len(df))

# Group Data by data_id data_config_hash
dataset_keys = ['data_id', 'data_config_hash']
df['rank'] = df.groupby(dataset_keys)[METRIC].rank(ascending=False)
# Rank among the successful runs only; failed rows stay NaN after index alignment
df['clean_rank'] = df[~df["failure"]].groupby(dataset_keys)[METRIC].rank(ascending=False)

# Calculate average rank for each method and shape the frame for the report
df = (
    df.groupby("model", as_index=False)
      .agg(mean_rank=("rank", "mean"),
           total_failures=("failure", "sum"),
           mean_clean_rank=("clean_rank", "mean")
           )
      .rename(columns={"model": "Model", "mean_rank": "Avg Rank", "total_failures": "#Failures"})
      [["Model", "Avg Rank", "#Failures"]]
      .astype({"#Failures": int})
)

table = df_to_booktabs(df, renaming_conventions={}, title="Ranking", optimization="min", caption="Average ranking of different models across several benchmarks.", label="tab:Ranking")
# PDF rendering is best-effort: the .tex file has already been written by
# df_to_booktabs, so a failing render is reported instead of aborting the cell.
try:
    render_latex_table(table)
except Exception as e:
    print(f"Error generating LaTeX table: {e}")
# TODO Join with custom Benchmark tables


118
175
Initial Win CP for (console input, console output, system): (CP850, CP850, CP1252)
I changed them all to CP1252
Rc files read:
  NONE
Latexmk: This is Latexmk, John Collins, 7 Apr. 2024. Version 4.85.
Latexmk: applying rule 'pdflatex'...
Rule 'pdflatex':  Reasons for rerun
Changed files or newly in use/created:
  c:/Users/poehlmann/PycharmProjects/AutoViMet/analysis/cache/analysis/output.tex
  output.tex
Category 'changed_user':
  c:/Users/poehlmann/PycharmProjects/AutoViMet/analysis/cache/analysis/output.tex
  output.tex

------------
Run number 1 of rule 'pdflatex'
------------
------------
Running 'pdflatex  --interaction=nonstopmode -recorder  "c:/Users/poehlmann/PycharmProjects/AutoViMet/analysis/cache/analysis/output.tex"'
------------
This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) (preloaded format=pdflatex)
 restricted \write18 enabled.
entering extended mode

(c:/Users/poehlmann/PycharmProjects/AutoViMet/analysis/cache/analysis/output.te
x
LaTeX2e <202

In [None]:
# TODO List
# Bar Chart Normalized Error
# Critical Difference Diagram
# fANOVA (Optional)
# Plot AutoGluon's Performance over Training Time (In Normalized Error, Rank, etc.)
# Bar Chart of Datasets Ordered by difficulty

In [30]:
import pandas as pd

# Toy check of Series.rank: with ascending=False the largest value receives rank 1.
df = pd.DataFrame(
    {
        "group": list("AAAA"),
        "value": [10, 20, 15, 30],
    }
)
df = df.assign(rank=df["value"].rank(ascending=False))

print(df)


  group  value  rank
0     A     10   4.0
1     A     20   2.0
2     A     15   3.0
3     A     30   1.0
