In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

def pointsto_df(directory, project, best=False):
    """Load one project's points-to benchmark CSV and summarise it per distance.

    Reads ``../results/{directory}/{project}`` (lines starting with ``#`` are
    skipped), converts ``time`` from nanoseconds to seconds and adds a
    ``normalized_time`` column that min-max scales the time within each
    ``runid`` (0 when all times of a run are equal).

    Args:
        directory: Name of the result directory below ``../results``.
        project: File name of the project's result file in that directory.
        best: If true, summarise the fastest row of every run; otherwise the
            last value recorded for every run.

    Returns:
        ``(df, df_mean)``: ``df`` is the full per-row frame; ``df_mean`` has
        one row per ``distance`` with ``mean_time``, ``mean_alloc_count`` and
        ``mean_type_count`` (each divided by the first ``sample_count`` value
        of the file) and ``percent_types`` / ``percent_allocs`` /
        ``percent_time`` relative to the row with the largest distance.
        ``(None, None)`` when the file does not exist.
    """
    file_path = f"../results/{directory}/{project}"
    if not os.path.exists(file_path):
        return None, None

    df = pd.read_csv(file_path, comment='#')

    # Nanoseconds -> seconds.
    df['time'] = df['time'] / 1000000000

    def _scale_within_run(run_times):
        # Min-max scale one run; a constant run collapses to 0.
        lowest, highest = run_times.min(), run_times.max()
        if highest > lowest:
            return (run_times - lowest) / (highest - lowest)
        return 0

    df['normalized_time'] = df.groupby('runid')['time'].transform(_scale_within_run)

    # One representative row per run: either the fastest or the last one.
    if best:
        df_to_use = df.loc[df.groupby('runid')['time'].idxmin()].reset_index()
    else:
        df_to_use = df.groupby('runid').last().reset_index()

    sample_count = df['sample_count'].iloc[0]

    # Per-distance means of the three metrics, computed in a single pass.
    mean_names = {
        'time': 'mean_time',
        'alloc_count': 'mean_alloc_count',
        'type_count': 'mean_type_count',
    }
    df_mean = (
        df_to_use.groupby('distance')[list(mean_names)]
        .mean()
        .rename(columns=mean_names)
        .reset_index()
    )

    # Scale the means down by the sample count.
    for mean_col in mean_names.values():
        df_mean[mean_col] = df_mean[mean_col] / sample_count

    # Express every metric as a percentage of its value at the largest distance.
    at_max_distance = df_mean['distance'] == df_mean['distance'].max()
    percent_columns = (
        ('mean_type_count', 'percent_types'),
        ('mean_alloc_count', 'percent_allocs'),
        ('mean_time', 'percent_time'),
    )
    for mean_col, percent_col in percent_columns:
        baseline = df_mean.loc[at_max_distance, mean_col].values[0]
        df_mean[percent_col] = (df_mean[mean_col] / baseline) * 100

    return df, df_mean

# Try the loader on a single project and show its per-distance summary as the
# cell output (both names are None if the result file does not exist).
df, df_mean = pointsto_df("sde-oberon_pointsto_3_3_20240603130030", "antlr-2.7.2_bfpa.new")
df_mean


In [None]:
def memory_df(directory, project):
    """Load the memory measurements of one project.

    Reads the header-less CSV ``../results/{directory}/{project}`` (lines
    starting with ``#`` are skipped) whose columns are taken to be
    ``distance``, ``memory`` and ``pass``.

    Args:
        directory: Name of the result directory below ``../results``.
        project: File name of the project's result file in that directory.

    Returns:
        A frame with one row per ``distance`` and a ``min_memory`` column
        holding the smallest ``memory`` value among rows with ``pass == 1``,
        or ``None`` when the file does not exist.
    """
    file_path = f"../results/{directory}/{project}"
    if os.path.exists(file_path):
        measurements = pd.read_csv(
            file_path,
            comment='#',
            header=None,
            names=['distance', 'memory', 'pass'],
        )
        # Only rows recorded with pass == 1 are considered.
        first_pass = measurements[measurements['pass'] == 1]
        smallest = first_pass.groupby('distance')['memory'].min()
        return smallest.reset_index(name='min_memory')
    return None

# Load the memory measurements of a single project (None if the file is missing).
df_memory = memory_df("sde-oberon_memory_1_1_20240514114444", "antlr-2.7.2_bfpa.new")

In [None]:

def parse_df(directory, project, best=False):
    """Load one project's parse benchmark CSV and compute its mean time.

    Reads ``../results/{directory}/{project}`` (lines starting with ``#`` are
    skipped), converts ``time`` from nanoseconds to seconds and adds a
    ``normalized_time`` column that min-max scales the time within each
    ``runid`` (0 when all times of a run are equal).

    Args:
        directory: Name of the result directory below ``../results``.
        project: File name of the project's result file in that directory.
        best: If true, average the fastest time of every run; otherwise the
            last value recorded for every run.

    Returns:
        ``(df, mean)`` with the full frame and the mean time in seconds, or
        ``(None, None)`` when the file does not exist.
    """
    file_path = f"../results/{directory}/{project}"
    if not os.path.exists(file_path):
        return None, None

    df = pd.read_csv(file_path, comment='#')

    # Nanoseconds -> seconds.
    df['time'] = df['time'] / 1000000000

    def _scale_within_run(run_times):
        # Min-max scale one run; a constant run collapses to 0.
        lowest, highest = run_times.min(), run_times.max()
        if highest > lowest:
            return (run_times - lowest) / (highest - lowest)
        return 0

    df['normalized_time'] = df.groupby('runid')['time'].transform(_scale_within_run)

    # One representative row per run: either the fastest or the last one.
    if best:
        selected = df.loc[df.groupby('runid')['time'].idxmin()].reset_index()
    else:
        selected = df.groupby('runid').last().reset_index()

    return df, selected['time'].mean()

In [None]:
def collect_project_data(pointsto_dir, memory_dir, parse_dir, project_names=None):
    """Collect points-to, memory and parse results for a set of projects.

    For every project the file ``{project_name}_bfpa.new`` is looked up in
    each of the three result directories.  Projects without points-to data
    are skipped entirely; memory data is optional and, when present, is
    left-joined onto the per-distance means (distances without a memory
    value get ``-1``).

    Args:
        pointsto_dir: Result directory passed to ``pointsto_df``.
        memory_dir: Result directory passed to ``memory_df``.
        parse_dir: Result directory passed to ``parse_df``.
        project_names: Optional iterable of project names; defaults to the
            benchmark projects listed below.

    Returns:
        A list of dicts with the keys ``name``, ``data`` (full points-to
        frame), ``means`` (per-distance summary) and ``parse_mean`` (``None``
        when the parse result file is missing).
    """
    if project_names is None:
        project_names = ["commons-jxpath", "antlr-2.7.2", "weka", "struts", "castor-1.3.3", "fop-0.95", "pmd-4.2.5", "jfreechart-1.0.0", "joda-time"]

    results = []
    for project_name in project_names:
        result_file = f"{project_name}_bfpa.new"

        df, df_mean = pointsto_df(pointsto_dir, result_file, best=False)
        if df is None or df_mean is None:
            # No points-to data: skip before merging, otherwise an existing
            # memory file would make pd.merge fail on a None left operand.
            continue

        df_memory = memory_df(memory_dir, result_file)
        _, parse_mean = parse_df(parse_dir, result_file, best=False)

        if df_memory is not None:
            df_mean = pd.merge(df_mean, df_memory, on='distance', how='left').fillna(-1)

        results.append({
            "name": project_name,
            "data": df,
            "means": df_mean,
            "parse_mean": parse_mean,
        })

    return results

# Result directories holding the memory and parse measurements.
memory = "sde-oberon_memory_1_1_20240522122042"
parse = "sde-oberon_compile_50_10_20240516135401"
# Gather the points-to, memory and parse data of every benchmark project.
oberon_res = collect_project_data("sde-oberon_pointsto_3_3_20240517121924", memory, parse)


def create_avg(result_list):
    """Average the per-distance summaries of several projects.

    Args:
        result_list: Entries that each carry a ``means`` frame with a
            ``distance`` column.

    Returns:
        A frame with one row per ``distance`` holding the mean of every
        other column across all entries.
    """
    stacked = pd.concat([entry['means'] for entry in result_list])
    per_distance = stacked.groupby('distance').mean()
    return per_distance.reset_index()

# Show the cross-project average per distance as the cell output.
create_avg(oberon_res)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import pandas as pd
import random

def inspect(df, title=""):
    """Show a 2x2 grid of per-run time curves for visual inspection.

    The panels plot, for every ``runid``: time by row index, normalized time
    by row index, time by iteration and normalized time by iteration.  Each
    run gets one colour (sampled from matplotlib's ``tab20`` map and then
    shuffled) that is reused across all four panels.

    Args:
        df: Frame with ``runid``, ``time``, ``normalized_time`` and
            ``iteration`` columns.
        title: Title of the whole figure.
    """
    rows = 2
    cols = 2
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=('Time by Index for All Runs', 'Normalized Time by Index for All Runs', 'Time by Iteration for Each ID', 'Normalized Time by Iteration for Each ID'))

    # One entry per panel, in row-major order:
    # (x column or None for the row index, y column, x title, y title, extra x-axis options)
    panels = [
        (None, 'time', "Index", "Time", {}),
        (None, 'normalized_time', "Index", "Normalized time", {'matches': 'x'}),
        ('iteration', 'time', "iteration", "time", {}),
        ('iteration', 'normalized_time', "iteration", "Normalized Time", {}),
    ]

    cmap = plt.get_cmap('tab20')
    unique_runids = df['runid'].unique()
    num_colors = len(unique_runids)

    colors = ['rgba(' + ','.join([str(int(255 * x)) for x in cmap(i / num_colors)[:3]]) + ',1.0)' for i in range(num_colors)]
    # Shuffling is unseeded, so the colour assignment differs between runs.
    random.shuffle(colors)
    color_of_run = dict(zip(unique_runids, colors))

    # Group once and reuse the groups for all four panels.
    runs = list(df.groupby('runid'))

    for panel_index, (x_col, y_col, x_title, y_title, x_axis_opts) in enumerate(panels):
        row, col = (value + 1 for value in divmod(panel_index, cols))
        for runid, group in runs:
            x_values = group.index if x_col is None else group[x_col]
            fig.add_trace(
                go.Scatter(x=x_values, y=group[y_col], mode='lines', name=f'ID {runid}', line=dict(color=color_of_run[runid])),
                row=row, col=col,
            )
        # Axis titles do not depend on the run, so set them once per panel.
        fig.update_xaxes(title_text=x_title, row=row, col=col, **x_axis_opts)
        fig.update_yaxes(title_text=y_title, row=row, col=col)

    panel_size = 500
    fig.update_layout(height=panel_size * rows, width=panel_size * cols, title_text=title, showlegend=True)
    fig.show()
    
#inspect(steady_state[1]['data'], "antlr")

In [None]:
# Titles for the four metric plots, in the order of plot_metrics' `metrics` list.
subplot_titles = ['Distance vs Mean Time', 'Distance vs Alloc Count', 'Distance vs Type Count', 'Distance vs Memory Usage']
# Line colours and dash patterns; plot_metrics picks them by project index
# modulo the respective list length.
colors = ['red', 'blue', 'green', 'orange', 'purple']
dash_styles = ['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot']


def plot_metrics(results_dfs, average_df=None):
    """Write one PDF per metric comparing all projects over the distance.

    For each of ``mean_time``, ``mean_alloc_count``, ``mean_type_count`` and
    ``min_memory`` a figure with one line per project is written to
    ``figures/``.  Line colour and dash pattern come from the module-level
    ``colors`` and ``dash_styles`` lists.  A project (or the average) whose
    frame lacks a metric column, e.g. ``min_memory`` when no memory data was
    found, is left out of that figure instead of raising ``KeyError``.

    Args:
        results_dfs: Entries with a ``name`` and a ``means`` frame that has a
            ``distance`` column plus the metric columns.
        average_df: Optional frame with the same columns; drawn as a gray
            'All Projects' line.
    """
    metrics = ['mean_time', 'mean_alloc_count', 'mean_type_count', 'min_memory']
    file_names = ['mean_time_evaluation.pdf', 'alloc_count_evaluation.pdf', 'type_count_evaluation.pdf', 'memory_usage_evaluation.pdf']
    # Metrics not listed here get a y-axis title derived from the column name.
    fixed_y_titles = {"mean_time": "Mean Time (s)", "min_memory": "Memory Usage (MB)"}

    # Ensure the figures directory exists
    os.makedirs('figures', exist_ok=True)

    for metric, file_name in zip(metrics, file_names):
        fig = go.Figure()
        if average_df is not None and metric in average_df.columns:
            fig.add_trace(go.Scatter(
                x=average_df['distance'],
                y=average_df[metric],
                mode='lines+markers',
                name='All Projects',
                opacity=0.5,
                line=dict(color='gray', dash='solid', width=2),
                marker=dict(symbol='diamond', size=8)
            ))

        for i, res in enumerate(results_dfs):
            df = res["means"]
            if metric not in df.columns:
                # Keep `i` advancing so the other projects keep their styles.
                continue
            fig.add_trace(go.Scatter(
                x=df['distance'],
                y=df[metric],
                mode='lines+markers',
                name=res["name"],
                line=dict(color=colors[i % len(colors)], dash=dash_styles[i % len(dash_styles)])
            ))

        yaxis_title = fixed_y_titles.get(metric, metric.replace('_', ' ').capitalize())

        fig.update_layout(
            xaxis_title="Distance",
            yaxis_title=yaxis_title,
            showlegend=True,
            margin=dict(l=0, r=0, t=0, b=100),
            legend=dict(
                x=0.5,  # Center the legend
                y=-0.15,  # Position the legend below the x-axis
                xanchor='center',  # Anchor the legend's center at x position
                orientation='h',  # Horizontal orientation of the legend items
            ),
            font=dict(
                size=14
            ),
            template='plotly_white',
        )

        fig.write_image(f'figures/{file_name}')

# Write the four metric figures, including the cross-project average line.
plot_metrics(oberon_res, create_avg(oberon_res))

In [None]:
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from itertools import cycle


def plot_projects_separately(results_df, subplot_titles, colors, dash_styles, show80=False):
    """Write one 2x2 PDF per project with its four metrics over the distance.

    The panels show mean time, allocation count, type count and memory
    usage.  Each project uses the next colour / dash pattern from the given
    lists (cycled).  A metric column missing from a project's ``means``
    frame (e.g. ``min_memory`` without memory data) leaves that panel empty
    instead of raising ``KeyError``.

    Args:
        results_df: Entries with a ``name`` and a ``means`` frame.
        subplot_titles: Panel titles passed on to ``make_subplots``.
        colors: Line colours to cycle through, one per project.
        dash_styles: Dash patterns to cycle through, one per project.
        show80: If true, add a dotted red line at 80% of the project's
            largest mean type count to the type-count panel.
    """
    os.makedirs('figures', exist_ok=True)

    # (metric column, y-axis title, row, col) for each panel.
    panels = [
        ('mean_time', "Mean Time (s)", 1, 1),
        ('mean_alloc_count', "Allocations", 1, 2),
        ('mean_type_count', "Types", 2, 1),
        ('min_memory', "Memory Usage (MB)", 2, 2),
    ]

    color_cycle = cycle(colors)
    dash_cycle = cycle(dash_styles)

    for res in results_df:
        means = res["means"]
        fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles,
                            vertical_spacing=0.2, horizontal_spacing=0.15)  # Adjust spacing here
        current_color = next(color_cycle)
        current_dash = next(dash_cycle)

        for metric, y_title, row, col in panels:
            if metric in means.columns:
                # Only the first panel's trace carries the project name.
                extra = {'name': res["name"]} if metric == 'mean_time' else {}
                fig.add_trace(go.Scatter(x=means['distance'], y=means[metric], mode='lines+markers',
                                         line=dict(color=current_color, dash=current_dash),
                                         showlegend=False, **extra),
                              row=row, col=col)

            if show80 and metric == 'mean_type_count':
                # Series.max() skips NaN, unlike the builtin max().
                eighty_percent_line = 0.8 * means['mean_type_count'].max()
                fig.add_trace(go.Scatter(x=means['distance'], y=[eighty_percent_line] * len(means['distance']),
                                         mode='lines', line=dict(color='red', dash='dot'),
                                         name='80% of Max Type Count', showlegend=True),
                              row=row, col=col)

            fig.update_xaxes(title_text="Distance", row=row, col=col)
            fig.update_yaxes(title_text=y_title, row=row, col=col)

        fig.update_layout(margin=dict(l=0, r=0, t=0, b=0), legend=dict(x=0.5, xanchor='center'))

        # Save plot as PDF
        pdf_path = f'figures/{res["name"]}_distance_evaluation.pdf'
        fig.write_image(pdf_path)
        
# Write one figure per project; an empty title list is passed, so the panels
# are created without subplot titles.
plot_projects_separately(oberon_res, subplot_titles=[], colors=colors, dash_styles=dash_styles, show80=False)

In [None]:
import os

def gen_latex_singles(results_df):
    """Write ``generated/singles_results.tex``: one figure with a subfigure per project.

    Subfigures are laid out two per row: the left one is followed by
    ``\\hfill``, the right one by a line break.  A trailing left subfigure
    (odd number of projects) gets no separator.

    Args:
        results_df: Entries with a ``name``; the subfigure includes
            ``figures/{name}_distance_evaluation.pdf`` and is captioned with
            the name.
    """
    individual_caption_template = "{name}"
    whole_figure_caption = "The metrics measured for each dataset. This is the same graphs as shown in Figure~\\ref{fig:mean_time}-\\ref{fig:memory_usage}, but plotted separately for each benchmark project."

    pieces = ["\\begin{figure}[htbp]\n\\centering\n"]
    last_idx = len(results_df) - 1
    for idx, res in enumerate(results_df):
        file_path = f'figures/{res["name"]}_distance_evaluation.pdf'
        pieces.append("\\begin{subfigure}[b]{0.45\\textwidth}\n")
        pieces.append("\\captionsetup{margin=0cm}  % Remove caption margin for this subfigure\n")
        pieces.append(f"\\includegraphics[width=\\textwidth]{{{file_path}}}\n")
        pieces.append(f"\\caption{{{individual_caption_template.format(name=res['name'])}}}\n")
        pieces.append("\\end{subfigure}\n")
        pieces.append("\\vspace{0.5cm}\n")

        if idx % 2 == 1:
            pieces.append("\\\\\n")
        elif idx != last_idx:
            # Emit the separator only when a right-hand subfigure follows.
            # (Appending it and removing it with str.rstrip would strip a
            # character set and also eat the preceding newline.)
            pieces.append("\\hfill\n")

    pieces.append("\\caption{" + whole_figure_caption + "}\n")
    pieces.append("\\label{fig:singles_result}\n")
    pieces.append("\\end{figure}\n")

    os.makedirs('generated', exist_ok=True)
    with open('generated/singles_results.tex', 'w') as file:
        file.write("".join(pieces))

# Generate the LaTeX figure that includes the per-project plots.
gen_latex_singles(oberon_res)


In [None]:
import os

def gen_latex_multi(results_df):
    """Write ``generated/multi_figures.tex`` with one figure per metric plot.

    Each figure includes one of the four PDFs in ``figures/`` listed below,
    together with its caption and label.

    Args:
        results_df: Not used; kept so existing callers keep working.
    """
    # Configuration variables
    captions = [
    "Mean time for calculating the points-to information for a method across the benchmarks for different values of $k$. The gray line shows the mean for all projects. The numbers together with parse data can be seen in Table \\ref{tab:parse_times}.",
    "Mean allocation count for a method across the benchmarks for different values of $k$. The gray line shows the mean for all projects.",
    "Mean type count for a method across the benchmarks for different values of $k$. The gray line shows the mean for all projects.",
    "Mean memory usage for calculating the points-to information for a method across the benchmarks for different values of $k$. The gray line shows the mean for all projects."
    ]

    figure_labels = ['fig:mean_time', 'fig:alloc_count', 'fig:type_count', 'fig:memory_usage']
    file_names = ['mean_time_evaluation.pdf', 'alloc_count_evaluation.pdf', 'type_count_evaluation.pdf', 'memory_usage_evaluation.pdf']

    os.makedirs('generated', exist_ok=True)

    figures = []
    for caption, label, file_name in zip(captions, figure_labels, file_names):
        figures.append(
            "\\begin{figure}[htbp]\n\\centering\n"
            f"\\includegraphics[width=\\FigureWidth]{{figures/{file_name}}}\n"
            f"\\caption{{{caption}}}\n"
            f"\\label{{{label}}}\n"
            "\\end{figure}\n\n"
        )

    with open('generated/multi_figures.tex', 'w') as file:
        file.write("".join(figures))

# Generate the LaTeX figures for the four cross-project metric plots.
gen_latex_multi(oberon_res)

In [None]:
def plot_culmulative(results_df):
    """Plot the cross-project average of types and time as a percentage.

    Averages the projects' per-distance summaries with ``create_avg`` and
    draws ``percent_types`` and ``percent_time`` over the distance into
    ``figures/culumative.pdf``.

    Args:
        results_df: Entries with a ``means`` frame that contains
            ``distance``, ``percent_types`` and ``percent_time``.
    """
    avg = create_avg(results_df)

    metrics = ['percent_types', 'percent_time']
    desc = ['Types', 'Time']
    colors = ['red', 'blue', 'green', 'orange', 'purple']
    dash_styles = ['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot']

    fig = go.Figure()
    for i, (metric, label) in enumerate(zip(metrics, desc)):
        fig.add_trace(go.Scatter(
                    x=avg['distance'],
                    y=avg[metric],
                    mode='lines+markers',
                    name=label,
                    line=dict(color=colors[i % len(colors)], dash=dash_styles[i % len(dash_styles)])
                ))

    fig.update_layout(
                xaxis_title="Distance",
                yaxis_title="Percent",
                margin=dict(l=0, r=0, t=0, b=100),
                legend=dict(
                    x=0.5,  # Center the legend
                    y=-0.15,  # Position the legend below the x-axis
                    xanchor='center',  # Anchor the legend's center at x position
                    orientation='h',  # Horizontal orientation of the legend items
                ),
                font=dict(
                    size=14
                ),
                template='plotly_white',
            )

    # Like the other plotting functions, make sure the output directory exists.
    os.makedirs('figures', exist_ok=True)
    pdf_path = 'figures/culumative.pdf'
    fig.write_image(pdf_path)

# Write the averaged percent-of-types / percent-of-time figure.
plot_culmulative(oberon_res)

In [None]:
def gen_latex_parse_table(results_df, include_dist_times=False):
    """Write ``generated/parse_table.tex``: parse time (and mean times) per project.

    Args:
        results_df: Entries with ``name``, ``parse_mean`` and a ``means``
            frame containing ``mean_time``.  A ``parse_mean`` of ``None``
            (parse result file missing) is rendered as ``--``.
        include_dist_times: If true, add one column per mean-time value,
            headed ``$k$ = 0``, ``$k$ = 1``, ...  The number of columns is
            taken from the first entry.  Ignored when there are no
            mean-time values to show.
    """
    caption = "Parse times and time to retrieve the points-to set for an average method as plotted in Figure~\\ref{fig:mean_time} for the projects used in the benchmarks programs."
    data = [(res['name'], res['parse_mean'], res['means']['mean_time']) for res in results_df]
    num_k_values = len(data[0][2]) if data else 0
    with_k_columns = include_dist_times and num_k_values > 0

    def _cell(value):
        # Two decimals, or a dash when no measurement is available.
        return "--" if value is None else f"{value:.2f}"

    latex_code = [
        "\\begin{table}[ht]",
        "\\centering",
        "\\footnotesize",
    ]
    if with_k_columns:
        k_header = " & ".join(f"$k$ = {i}" for i in range(num_k_values))
        latex_code += [
            # One column for the project, one for the parse time, one per k.
            "\\begin{tabular}{l" + ("c" * (num_k_values + 1)) + "}",
            "\\toprule",
            "\\multicolumn{2}{l}{} & \\multicolumn{" + str(num_k_values) + "}{c}{Mean times for different values of $k$ (s)} \\\\",
            "\\cmidrule(lr){3-" + str(2 + num_k_values) + "}",
            f"Project & Parse time (s) & {k_header} \\\\",
        ]
    else:
        latex_code += [
            "\\begin{tabular}{lc}",
            "\\toprule",
            "Project & Parse time (s) \\\\",
        ]
    latex_code.append("\\midrule")

    for project, parse_time, mean_times in data:
        cells = [f"\\tool{{{project}}}", _cell(parse_time)]
        if with_k_columns:
            cells.extend(_cell(value) for value in mean_times)
        latex_code.append(" & ".join(cells) + " \\\\")

    latex_code.append("\\bottomrule")
    latex_code.append("\\end{tabular}")
    latex_code.append(f"\\caption{{{caption}}}")
    latex_code.append("\\label{tab:parse_times}")
    latex_code.append("\\end{table}")

    latex_output = "\n".join(latex_code)

    os.makedirs('generated', exist_ok=True)

    with open('generated/parse_table.tex', 'w') as file:
        file.write(latex_output)

# Generate the parse-time table including the per-distance mean times.
gen_latex_parse_table(oberon_res, include_dist_times=True)

In [None]:
import plotly.graph_objects as go
import time
from plotly.subplots import make_subplots

# Hard-coded results per distance k, with "∞" as the last x-axis label.
# The origin of these numbers is not shown in this notebook.
# TODO: document where they come from.
# NOTE(review): the values look consistent with
#   recall    = correct_types / qilin_correct_types
#   precision = correct_types / (correct_types + overapproximated_typed)
# confirm before relying on it.
k_values = ["0", "1", "2", "3", "4", "5", "∞"]
recall = [0.00, 0.17, 0.43, 0.71, 0.85, 0.94, 0.95]
precision = [1.00, 0.92, 0.90, 0.78, 0.74, 0.67, 0.67]
correct_types = [2, 108, 271, 450, 537, 599, 603]  # not used below
overapproximated_typed = [0, 10, 31, 125, 190, 294, 301]  # not used below
times = [0.13, 0.31, 0.41, 0.85, 1.94, 2.64, 2.94]

# Reference values for Qilin; only recall and precision are plotted below.
qilin_overapproximated = 489
qilin_correct_types = 631
qilin_recall = 1.0
qilin_precision = 0.56
qilin_time = 8.52


# Colours per plotted series; culm_color is not used below.
recall_color = "#636efb"
precision_color = "#f0553b"
time_color = "#00cc96"
culm_color = "grey"

# Single plot with a secondary y-axis: precision/recall on the primary axis,
# time on the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=k_values, y=recall, mode='lines+markers', name='Type Recall', line=dict(dash='dot', color=recall_color)), secondary_y=False)
fig.add_trace(go.Scatter(x=k_values, y=precision, mode='lines+markers', name='Type Precision', line=dict(dash='dash', color=precision_color)), secondary_y=False)

fig.add_trace(go.Scatter(x=k_values, y=times, mode='lines+markers', name='Time (s)', line=dict(dash='solid', color=time_color)), secondary_y=True)

# Horizontal reference lines for the Qilin recall and precision.
fig.add_hline(y=qilin_recall, line_dash="dash", line_color=recall_color, annotation_text="Qilin Recall", annotation_position="top right", secondary_y=False)
fig.add_hline(y=qilin_precision, line_dash="dash", line_color=precision_color, annotation_text="Qilin Precision", annotation_position="bottom right", secondary_y=False)

fig.update_layout(
    title="",
    xaxis_title="Distance",
    yaxis_title="Precision/Recall",
    yaxis2_title="Time (s)",
    legend_title="Metric",
    template='plotly_white',
    margin=dict(l=0, r=0, t=30, b=100),
    # Horizontal legend centred below the plot.
    legend=dict(
        x=0.5,
        y=-0.2,
        xanchor='center',
        orientation='h'
    )
)

# Fixed axis ranges: slightly above 1 for precision/recall, and half a second
# above the largest time for the secondary axis.
fig.update_yaxes(range=[0, 1.05], secondary_y=False)
fig.update_yaxes(range=[0, max(times) + 0.5], secondary_y=True)

# The figure is written twice on purpose: the first PDF can contain a
# "Loading [MathJax]/extensions/MathMenu.js" box, so it is overwritten after a
# short pause. Assumes the figures/ directory already exists.
fig.write_image("figures/qilin_comparison.pdf")
time.sleep(1) # for some reason the first plot generates the box with "Loading [MathJax]/extensions/MathMenu.js"
fig.write_image("figures/qilin_comparison.pdf")