In [1]:
#!pip install -e ../Maccabee > /dev/null

In [2]:
from maccabee.data_generation.utils import CompiledExpression
from sympy.abc import x, y
import sympy as sp
from itertools import combinations
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
def original_eval(expression, data):
    """Evaluates the Sympy expression in `expression` using the :class:`pandas.DataFrame` in `data` to fill in the value of all the variables in the expression. The expression is evaluated once for each row of the DataFrame.

    Args:
        expression (Sympy Expression): A Sympy expression with variables that are a subset of the variables in columns data.
        data (:class:`~pandas.DataFrame`): A DataFrame containing observations of the variables in the expression. The names of the columns must match the names of the symbols in the expression.

    Returns:
        :class:`~numpy.ndarray`: An array of expression values corresponding to the rows of the `data`.
    """
    if isinstance(expression, CompiledExpression):
        return expression.eval_expr(data)
    else:
        free_symbols = getattr(expression, "free_symbols", None)
        if free_symbols is not None:
            free_symbols = list(free_symbols)

            expr_func = sp.lambdify(
                    free_symbols,
                    expression,
                    modules=[
                        {
                            "amax": lambda x: np.maximum(*x),
                            "amin": lambda x: np.minimum(*x)
                        },
                        "numpy"
                    ],
                    dummify=True)

            column_data = [data[str(sym)] for sym in free_symbols]
            return expr_func(*column_data)
        else:
            # No free symbols, return expression itself.
            return expression

In [4]:
def optimized_eval(expression, data, symbols):
    """Evaluates the Sympy expression in `expression` using the :class:`pandas.DataFrame` in `data` to fill in the value of all the variables in the expression. The expression is evaluated once for each row of the DataFrame.

    Args:
        expression (Sympy Expression): A Sympy expression with variables that are a subset of the variables in columns data.
        data (:class:`~pandas.DataFrame`): A DataFrame containing observations of the variables in the expression. The names of the columns must match the names of the symbols in the expression.

    Returns:
        :class:`~numpy.ndarray`: An array of expression values corresponding to the rows of the `data`.
    """
    if isinstance(expression, CompiledExpression):
        return expression.eval_expr(data)
    else:
        free_symbols = getattr(expression, "free_symbols", None)
        if free_symbols is not None:
            expr_func = sp.lambdify(
                    list(data.columns),
                    expression,
                    modules=[
                        {
                            "amax": lambda x: np.maximum(*x),
                            "amin": lambda x: np.minimum(*x)
                        },
                        "numpy"
                    ],
                    dummify=False)
            
            return expr_func(*np.hsplit(data.values, data.shape[1]))
        else:
            # No free symbols, return expression itself.
            return expression

In [5]:
from sympy.utilities.autowrap import ufuncify, CodeWrapper
import pathlib
import sys
C_PATH = "./_maccabee_compiled_code/"

class OptimizedCompiledExpression():

    def __init__(self, expression, symbols):
        self.symbols = symbols
        self.expression = expression
        self.constant_expression = False

        self.compiled_module_name = None
        self.compiled_ordered_args = None
        self._compile()

        self.expression_func = None

    def __getstate__(self):
        return (
            self.expression,
            self.constant_expression,
            self.compiled_module_name,
            self.compiled_ordered_args
        )

    def __setstate__(self, state):
        (
            self.expression,
            self.constant_expression,
            self.compiled_module_name,
            self.compiled_ordered_args
        ) = state

        self.expression_func = None

    def _compile(self):
        free_symbols = getattr(self.expression, "free_symbols", None)
        if free_symbols is not None:
            # Args
#             expr_func_ordered_symbols = list(free_symbols)
#             self.compiled_ordered_args = [
#                 str(symbol)
#                 for symbol in expr_func_ordered_symbols
#             ]

            try:
                # Module
                self.compiled_module_name = \
                    f"mod_{abs(hash(self.expression))}_{np.random.randint(1e8)}"
                mod_path = C_PATH+self.compiled_module_name

                pathlib.Path(C_PATH).mkdir(parents=True, exist_ok=True)
                CodeWrapper.module_name = self.compiled_module_name

                # print("Compiling")
                # Compile
                ufuncify(
                    self.symbols,
                    self.expression,
                    backend="cython",
                    tempdir=mod_path)
                # print("Done compiling")
            except Exception as e:
                raise Exception(f"Failure in compilation of compiled expression. {e}")
        else:
            # No free symbols, expression is constant.
            self.constant_expression = True

    def eval_expr(self, data):
        if self.constant_expression:
            return self.expression

        try:
            if self.expression_func is None:
                if self.compiled_module_name not in sys.modules:
                    mod_path = C_PATH + self.compiled_module_name

                    if mod_path not in sys.path:
                        sys.path.append(mod_path)

                    # print("Importing compiled module.")
                    mod = importlib.import_module(self.compiled_module_name)
                else:
                    # print("Loading existing compiled module.")
                    mod = sys.modules[self.compiled_module_name]

                # compiled_func_prefix = "wrapped_"
                compiled_func_prefix = "autofunc"
                func_name = next(filter(lambda x: x.startswith(compiled_func_prefix), dir(mod)))
                self.expression_func = getattr(mod, func_name)

            # print("Executing compiled code")
            data = map(lambda x: x.flatten(), np.hsplit(data.values, data.shape[1]))
            expr_result = self.expression_func(*data)
            # print("Done executing compiled code")
            res = pd.Series(expr_result)
            return res
        except Exception as e:
            print(f"failure in compiled expression eval. {e}")
            return evaluate_expression(self.expression, data)

In [6]:
def run_trial(n_obs, n_covars, strategy, n_evals=40):
    
    covar_names = [f"x{i}" for i in range(n_covars)]
    covar_symbols = sp.symbols(covar_names)
    interactions = [a*b for a, b in combinations(covar_symbols, 2)]
    expression = np.sum(covar_symbols) + np.sum(interactions)
    
    n_eff_covars = n_covars + len(interactions)
    
    data = np.random.normal(size=(n_obs, n_covars))
    df = pd.DataFrame(data, columns=covar_names)
    
    if strategy == "compile" or strategy=="optimized_compile":
        compile_times = []
        for _ in range(1):
            start = time()
            if strategy == "compile":
                c = CompiledExpression(expression)
            else:
                c = OptimizedCompiledExpression(expression, covar_symbols)
            end = time()
            compile_times.append(end - start)

        compile_time = np.mean(compile_times)
    else:
        compile_time = 0
    
    eval_times = []
    start = time()
    
    for _ in range(n_evals):
        if strategy == "compile" or strategy=="optimized_compile":
            val = c.eval_expr(df)
        elif strategy == "lambda":
            val = original_eval(expression, df)
        elif strategy == "optimized_lambda":
            val = optimized_eval(expression, df, covar_symbols)
        assert(len(val) == n_obs)
        
    end = time()
    eval_time = end - start
    
    return compile_time, eval_time, n_eff_covars

In [7]:
n_covars = 10
n_obs = 1000
covar_names = [f"x{i}" for i in range(n_covars)]
covar_symbols = sp.symbols(covar_names)
interactions = [a*b for a, b in combinations(covar_symbols, 2)]
expression = np.sum(covar_symbols) + np.sum(interactions)

n_eff_covars = n_covars + len(interactions)

data = np.random.normal(size=(n_obs, n_covars))
df = pd.DataFrame(data, columns=covar_names)

%timeit sp.symbols(list(df.columns))

93.2 µs ± 3.42 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [8]:
N = 1000
n_covars = range(2, 25, 5)
n_evals = 20

In [9]:
compile_data = []
for n_covar in n_covars:
    print("Running at ", n_covar)
    compile_data.append(run_trial(N, n_covar,
                                  n_evals=n_evals, 
                                  strategy="compile"))

Running at  2


TypeError: __init__() missing 1 required positional argument: 'symbols'

In [None]:
optimized_compile_data = []
for n_covar in n_covars:
    print("Running at ", n_covar)
    optimized_compile_data.append(run_trial(N, n_covar,
                                            n_evals=n_evals,
                                            strategy="optimized_compile"))

In [None]:
no_compile_data = []
for n_covar in n_covars:
    print("Running at ", n_covar)
    no_compile_data.append(run_trial(N, n_covar,
                                     n_evals=n_evals,
                                     strategy="lambda"))

In [None]:
optimized_no_compile_data = []
for n_covar in n_covars:
    print("Running at ", n_covar)
    optimized_no_compile_data.append(run_trial(N, n_covar,
                                               n_evals=n_evals,
                                               strategy="optimized_lambda"))

In [None]:
compile_data = np.array(compile_data)
optimized_compile_data = np.array(optimized_compile_data)
no_compile_data = np.array(no_compile_data)
optimized_no_compile_data = np.array(optimized_no_compile_data)

In [None]:
plt.figure(figsize=(12, 6))

plt.subplot(131)
plt.title("Compile time")
plt.scatter(n_covars, compile_data[:, 0], label="Compiled", c="r")
plt.scatter(n_covars, optimized_compile_data[:, 0], label="Optimized Compiled", c="y")
plt.scatter(n_covars, no_compile_data[:, 0], label="Non Compiled", c="b")
plt.scatter(n_covars, optimized_no_compile_data[:, 0], label="Optimized Non Compiled", c="g")

plt.subplot(132)
plt.title("Eval time")
plt.scatter(n_covars, compile_data[:, 1], label="Compiled", c="r")
plt.scatter(n_covars, optimized_compile_data[:, 1], label="Optimized Compiled", c="y")
plt.scatter(n_covars, no_compile_data[:, 1], label="Non Compiled", c="b")
plt.scatter(n_covars, optimized_no_compile_data[:, 1], label="Non Compiled improved", c="g")

plt.subplot(133)
plt.title("Total time")
plt.scatter(n_covars, compile_data[:, 1] + compile_data[:, 0], label="Compiled", c="r")
plt.scatter(n_covars, optimized_compile_data[:, 1] + optimized_compile_data[:, 0], label="Optimized Compiled", c="y")
plt.scatter(n_covars, no_compile_data[:, 1] + no_compile_data[:, 0], label="Non Compiled", c="b")
plt.scatter(n_covars, optimized_no_compile_data[:, 1] + optimized_no_compile_data[:, 0], label="Non Compiled improved", c="g")


plt.legend()
plt.show()

In [None]:
improvements = 100*(no_compile_data[:, 1] - optimized_no_compile_data[:, 1])/no_compile_data[:, 1]
improvements, np.mean(improvements)

In [None]:
compiled_totals = compile_data[:, 1] + compile_data[:, 0]
optimized_compiled_totals = optimized_compile_data[:, 1] + optimized_compile_data[:, 0]

improvements = 100*(compiled_totals - optimized_compiled_totals)/compiled_totals
improvements, np.mean(improvements)

In [None]:
plt.figure(figsize=(12, 6))

n_eff_covars = compile_data[:, 2]

plt.subplot(131)
plt.title("Compile time")
plt.scatter(n_eff_covars, compile_data[:, 0], label="Compiled", c="r")
plt.scatter(n_eff_covars, optimized_compile_data[:, 0], label="Optimized Compiled", c="y")
plt.scatter(n_eff_covars, no_compile_data[:, 0], label="Non Compiled", c="b")
plt.scatter(n_eff_covars, optimized_no_compile_data[:, 0], label="Optimized Non Compiled", c="g")

plt.subplot(132)
plt.title("Eval time")
plt.scatter(n_eff_covars, compile_data[:, 1], label="Compiled", c="r")
plt.scatter(n_eff_covars, optimized_compile_data[:, 1], label="Optimized Compiled", c="y")
plt.scatter(n_eff_covars, no_compile_data[:, 1], label="Non Compiled", c="b")
plt.scatter(n_eff_covars, optimized_no_compile_data[:, 1], label="Non Compiled improved", c="g")

plt.subplot(133)
plt.title("Total time")
plt.scatter(n_eff_covars, compile_data[:, 1] + compile_data[:, 0], label="Compiled", c="r")
plt.scatter(n_eff_covars, optimized_compile_data[:, 1] + optimized_compile_data[:, 0], label="Optimized Compiled", c="y")
plt.scatter(n_eff_covars, no_compile_data[:, 1] + no_compile_data[:, 0], label="Non Compiled", c="b")
plt.scatter(n_eff_covars, optimized_no_compile_data[:, 1] + optimized_no_compile_data[:, 0], label="Non Compiled improved", c="g")


plt.legend()
plt.show()