Merge 4771a1b into 8bb020d

JamesYang007 · Apr 21, 2020 · 5b58fb8 · 5b58fb8
2 parents 8bb020d + 4771a1b
commit 5b58fb8
Show file tree

Hide file tree

Showing 43 changed files with 2,515 additions and 48 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -38,7 +38,7 @@ jobs:
                 coveralls 
                 --root ../../ 
                 --build-root ./ 
-                --include autoppl/include 
+                --include include 
                 --exclude lib
                 --gcov 'gcov-7' 
                 --gcov-options '\-lp'

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -10,7 +10,6 @@ option(AUTOPPL_ENABLE_BENCHMARK "Enable benchmarks to be built." OFF)
 option(AUTOPPL_ENABLE_TEST_COVERAGE "Build with test coverage (AUTOPPL_ENABLE_TEST must be ON)" OFF)
 
 # Automate the choosing of config
-# if CMAKE_BUILD_TYPE not defined
 if (NOT CMAKE_BUILD_TYPE)
     # if binary directory ends with "release", use release mode
     if (${PROJECT_BINARY_DIR} MATCHES "release$")
@@ -22,6 +21,16 @@ if (NOT CMAKE_BUILD_TYPE)
 endif()
 message(STATUS "Compiling in ${CMAKE_BUILD_TYPE} mode")
 
+# Add this library as interface (header-only)
+add_library(${PROJECT_NAME} INTERFACE)
+
+target_include_directories(${PROJECT_NAME}
+    INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+# Set C++17 standard for project target
+target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
+
 # Configure tests
 if (AUTOPPL_ENABLE_TEST)
     include(CTest) # enable memcheck
@@ -35,6 +44,11 @@ if (AUTOPPL_ENABLE_TEST)
     add_subdirectory(${PROJECT_SOURCE_DIR}/test ${PROJECT_BINARY_DIR}/test)
 endif()
 
+# TODO: add src dir if needed
+#set(AUTOPPL_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/src)
+#file(GLOB_RECURSE AUTOPPL_SOURCE_FILES RELATIVE src LIST_DIRECTORIES false *.cpp)
+#set(AUTOPPL_SOURCE_FILES ${AUTOPPL_SOURCE_DIR}/autoppl.cpp)
+#set(AUTOPPL_HEADER_FILES ${AUTOPPL_INCLUDE_DIR}/autoppl.h)
+
 # Add subdirectories
-add_subdirectory(${PROJECT_SOURCE_DIR}/autoppl ${PROJECT_BINARY_DIR}/autoppl)
 add_subdirectory(${PROJECT_SOURCE_DIR}/lib ${PROJECT_BINARY_DIR}/lib)
diff --git a/autoppl/CMakeLists.txt b/autoppl/CMakeLists.txt
diff --git a/autoppl/include/autoppl.hpp b/autoppl/include/autoppl.hpp
diff --git a/doc/design/model_design2.cpp b/doc/design/model_design2.cpp
@@ -0,0 +1,32 @@
+Y ~ W.x + epsilon
+Y ~ N(W.x, sigma^2)
+
+Parameter<double> X {4.0};  // observed
+Parameter<double> Y {5.0}; // observed
+Parameter<double> W; // hidden
+
+Model m1 = Model(  // Model class defines a distribution over existing Parameters.
+	W |= Uniform(-10, 10), // linear regression
+	Y |= Normal(W * X, 3), // overload multiplication to build a graph from W * X
+);
+
+Model m2 = Model(
+	W |= Normal(0, 1), // ridge regression instead
+	Y |= Normal(W * X, 3),
+);
+
+m1.sample(1000);
+
+(3*x).pdf(10) => x.pdf(10 / 3)
+
+X.observe(3); // observe more data
+
+// P(Y, W | X) = P(Y | W, X) P(W | X) which is doable for multiple samples, just need to 
+// assert len(Y) == len(X) and then multiply out over all pairs of (X, Y) values.
+
+// P(Y | X) => this is a fine distribution, but I can't talk about P(Y, X) or P(X | Y) until I put a prior on Y.
+// I don't have a joint distribution yet.
+
+// Some issues:
+// how do we do (x ** 2).pdf(5)? This is pretty damn hard for non-bijective functions, need to integrate?
+// 
diff --git a/doc/design/model_inttest.cpp b/doc/design/model_inttest.cpp
@@ -0,0 +1,106 @@
+#include "gtest/gtest.h"
+#include <autoppl/expression/model.hpp>
+#include <autoppl/expression/rv_tag.hpp>
+#include <autoppl/expression/uniform.hpp>
+
+namespace ppl {
+
+template <class VectorType, class IndexType>
+struct BracketNode
+{
+    VectorType v;
+    IndexType i;
+};
+
+struct myvector
+{
+
+    rv_tag<double> operator[](rv_tag)
+    {
+        return rv
+    }
+    std::vector<rv_tags> v; // 3 things
+};
+
+template <class MuType, class SigType>
+auto normal(const MuType& mu, const SigType& sig)
+{
+    Normal<MuType, SigType>(mu, sig);
+}
+
+TEST(dummy, dummy_test)
+{
+    double x_data = 2.3; // 1-sample data
+
+    std::vector<double> sampled_theta_1(100);
+    std::vector<double> sampled_theta_2(100);
+
+    double* ptr;
+    rv_tag<double, ...> x;
+    rv_tag<double> theta_1(sampled_theta_1.data());
+    rv_tag<double> theta_2(sampled_theta_2.data());
+
+    std::vector<rv_tag<double>> v;
+    std::for_each(..., ... , [](){v[i].set_sample_storage(&mat.row(i));});
+
+    x.observe(x_data);
+
+    x_1.observe(...);
+    x_2.observe(...);
+
+    auto model = (
+        mu |= uniform(-10000, 10000),
+        y |= uniform({1,2,3})   // 
+        x_1 |= normal(mu[y], 1),
+        x_2 |= normal(mu[y], 1),
+    );
+
+    x.observe(...);
+
+    rv_tag<double> var, mu, x;
+    auto normal_model = (
+        var |= normal(0,1),
+        mu |= normal(1,5),
+        x |= normal(mu, var)
+    );
+
+    std::vector<double> var_storage(1000);
+    std::vector<double> mu_storage(1000);
+
+    var.set_storage(var_storage.data());
+    mu.set_storage(mu_storage.data());
+
+    metropolis_hastings(model, 1000, 400);
+
+    auto gmm_model = (
+        mu |=    
+    );
+
+    std::vector<rv_tag<double>> vec(model.param_num);
+    model.bind_storage(vec.begin(), vec.end(), ...);
+    model.pdf();
+
+    metropolis_hastings(model, 100);
+
+    std::vector<double> sampled_theta_1_again(1000);
+    std::vector<double> sampled_theta_2_again(1000);
+
+    theta_1.set_storage(sampled_theta_1_again.data());
+    theta_2.set_storage(sampled_theta_2_again.data());
+
+    metropolis_hastings(model, 1000);
+
+
+
+
+
+
+
+    auto model = (
+        w |= normal(0,1),
+        y |= normal(w*x, 1)
+    )
+    metropolis_hastings(modeli)
+}
+
+} 
diff --git a/include/autoppl/algorithm/mh.hpp b/include/autoppl/algorithm/mh.hpp
@@ -0,0 +1,123 @@
+#pragma once
+#include <chrono>
+#include <random>
+#include <algorithm>
+#include <vector>
+#include <autoppl/util/traits.hpp>
+#include <autoppl/variable.hpp>
+#include <autoppl/expression/distribution/base.hpp>
+
+/*
+ * Assumptions:
+ * - every variable referenced in model is of type Variable<double>
+ */
+
+namespace ppl {
+namespace details {
+
+struct MHData
+{
+    double next;
+    // TODO: maybe keep an array for batch sampling?
+};
+
+} // namespace details
+
+/*
+ * Metropolis-Hastings algorithm to sample from posterior distribution.
+ * The posterior distribution is a constant multiple of model.pdf().
+ * Any variables that model references which are in state "parameter"
+ * is sampled and in state "data" are not.
+ * So, model.pdf() is proportional to p(parameters... | data...).
+ *
+ * User must ensure that they allocated at least as large as n_sample
+ * in the storage associated with every parameter referenced in model.
+ */
+template <class ModelType>
+inline void mh_posterior(ModelType& model,
+                         double n_sample,
+                         double stddev = 1.0,
+                         double seed = std::chrono::duration_cast<
+                                        std::chrono::milliseconds>(
+                                            std::chrono::system_clock::now().time_since_epoch()
+                                            ).count()
+                        )
+{
+    using data_t = details::MHData;
+
+    // set-up auxiliary tools
+    std::mt19937 gen(seed);
+    std::uniform_real_distribution unif_sampler(0., 1.);
+
+    // get number of parameters to sample    
+    size_t n_params = 0.;
+    auto get_n_params = [&](auto& eq_node) {
+        auto& var = eq_node.get_variable();
+        using var_t = std::decay_t<decltype(var)>;
+        using state_t = typename util::var_traits<var_t>::state_t;
+        n_params += (var.get_state() == state_t::parameter);
+    };
+    model.traverse(get_n_params);
+
+    // vector of parameter-related data with candidate
+    std::vector<data_t> params(n_params);
+    double curr_log_pdf = model.log_pdf();
+    auto params_it = params.begin();
+
+    for (size_t iter = 0; iter < n_sample; ++iter) {
+
+        double log_alpha = -curr_log_pdf;
+
+        // generate next candidates and place them in parameter
+        // variables as next values; update log_alpha
+        // The old values are temporary stored in the params vector.
+        auto get_candidate = [=, &gen](auto& eq_node) mutable {
+            auto& var = eq_node.get_variable();
+            using var_t = std::decay_t<decltype(var)>;
+            using state_t = typename util::var_traits<var_t>::state_t;
+
+            if (var.get_state() == state_t::parameter) {
+                auto curr = var.get_value();
+                std::normal_distribution norm_sampler(curr, stddev);
+
+                // sample new candidate, place old value in params, 
+                // fill next candidate in var, and update log_alpha
+                auto cand = norm_sampler(gen); 
+                params_it->next = curr;
+                var.set_value(cand); 
+
+                ++params_it;
+            }
+        };
+        model.traverse(get_candidate);
+
+        // compute next candidate log pdf and update log_alpha
+        double cand_log_pdf = model.log_pdf();
+        log_alpha += cand_log_pdf;
+        bool accept = (std::log(unif_sampler(gen)) <= log_alpha);
+
+        // If accept, "current" sample for next iteration is already in the variables
+        // so simply append to storage.
+        // Otherwise, "current" sample for next iteration must be moved back from 
+        // params vector into variables.
+        auto add_to_storage = [params_it, iter, accept](auto& eq_node) mutable {
+            auto& var = eq_node.get_variable();
+            using var_t = std::decay_t<decltype(var)>;
+            using state_t = typename util::var_traits<var_t>::state_t;
+            if (var.get_state() == state_t::parameter) {
+                if (!accept) {
+                    var.set_value(params_it->next);
+                    ++params_it;
+                }
+                auto storage = var.get_storage();
+                storage[iter] = var.get_value();
+            } 
+        };
+        model.traverse(add_to_storage);
+
+        // update current log pdf for next iteration
+        if (accept) curr_log_pdf = cand_log_pdf;
+    }
+}
+
+} // namespace ppl
diff --git a/include/autoppl/autoppl b/include/autoppl/autoppl
@@ -0,0 +1,2 @@
+#pragma once
+// TODO: export all headers later! 
diff --git a/include/autoppl/distribution/discrete.hpp b/include/autoppl/distribution/discrete.hpp
@@ -0,0 +1,48 @@
+#pragma once
+#include <cassert>
+#include <random>
+#include <cmath>
+#include <numeric>
+
+namespace ppl {
+
+// TODO: change name to DiscreteDist and make class template.
+// Discrete should be a function that creates this kind of object.
+
+template <typename weight_type>
+struct Discrete 
+{
+    using value_t = int;
+    using dist_value_t = double;
+
+    Discrete(std::initializer_list<weight_type> weights)
+        : weights_{weights} { assert(weights.size() > 0); }
+
+    template <class GeneratorType>
+    value_t sample(GeneratorType& gen) const
+    {
+        std::discrete_distribution dist(weights_.begin(), weights_.end()); 
+        return dist(gen);
+    }
+
+    dist_value_t pdf(value_t i) const
+    {
+        assert( i >= 0 && i < (int) weights_.size() );
+        return weights(i) / std::accumulate(weights_.begin(), weights_.end(), 0.0 );
+
+    } 
+
+    dist_value_t log_pdf(value_t i) const
+    {
+        assert( i >= 0 && i < (int) weights_.size() );
+        return log(weights(i) / std::accumulate(weights_.begin(), weights_.end(), 0.0 ));
+    }
+
+    inline dist_value_t weights(value_t i) const { return static_cast<dist_value_t>(weights_[i]); }
+
+   private:
+    std::vector<weight_type> weights_;
+};
+
+} // namespace ppl
+