InfiniTensor · PanZezhong1725 · Oct 31, 2025 · Oct 23, 2025 · Oct 31, 2025 · Oct 31, 2025
diff --git a/include/infinicore.hpp b/include/infinicore.hpp
@@ -1,4 +1,5 @@
 #pragma once
 
+#include "infinicore/nn.hpp"
 #include "infinicore/ops.hpp"
 #include "infinicore/tensor.hpp"
diff --git a/include/infinicore/nn.hpp b/include/infinicore/nn.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "nn/embedding.hpp"
+#include "nn/linear.hpp"
+#include "nn/rmsnorm.hpp"
diff --git a/include/infinicore/nn/embedding.hpp b/include/infinicore/nn/embedding.hpp
@@ -0,0 +1,90 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <string>
+
+namespace infinicore::nn {
+
+/**
+ * @brief Embedding layer that maps indices to dense vectors
+ *
+ * A simple lookup table that stores embeddings of a fixed dictionary and size.
+ * This module is often used to store word embeddings and retrieve them using indices.
+ * The input to the module is a tensor of indices, and the output is the corresponding
+ * embedding vectors.
+ *
+ * Similar to PyTorch's nn.Embedding:
+ * https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
+ *
+ * Example:
+ * @code
+ *   // Create embedding: 10000 words, 300-dimensional embeddings
+ *   auto embedding = Embedding(10000, 300);
+ *
+ *   // Input: tensor of indices [batch_size, seq_len]
+ *   auto indices = Tensor::from_data({2, 5}, {3, 5, 12, 8, 99, 0, 1, 45, 67, 23});
+ *
+ *   // Output: [batch_size, seq_len, embedding_dim] = [2, 5, 300]
+ *   auto embeddings = embedding.forward(indices);
+ * @endcode
+ */
+class Embedding : public Module {
+public:
+    /**
+     * @brief Construct an Embedding layer
+     *
+     * @param num_embeddings Size of the dictionary of embeddings (vocabulary size)
+     * @param embedding_dim The size of each embedding vector
+     * @param padding_idx If specified, the entries at padding_idx do not contribute to gradient
+     *                    and the embedding vector at padding_idx is not updated during training
+     * @param dtype Data type for the embedding weights (default: DataType::F32)
+     * @param device Device to create the embedding weight on
+     */
+    Embedding(size_t num_embeddings,
+              size_t embedding_dim,
+              std::optional<int64_t> padding_idx = std::nullopt,
+              const DataType &dtype = DataType::F32,
+              const Device &device = Device());
+
+    /**
+     * @brief Forward pass: lookup embeddings for given indices
+     *
+     * @param indices Tensor containing indices into the embedding matrix.
+     *                Can be any shape (*), typically [batch_size] or [batch_size, seq_len]
+     * @return Tensor containing the embedding vectors.
+     *         Shape: (*, embedding_dim) where * matches the input shape
+     *
+     * Example:
+     *   Input shape: [2, 3] -> Output shape: [2, 3, embedding_dim]
+     *   Input shape: [10] -> Output shape: [10, embedding_dim]
+     */
+    Tensor forward(const Tensor &indices) const;
+
+    // Configuration accessors (return the stored member values)
+    size_t num_embeddings() const { return num_embeddings_; }
+    size_t embedding_dim() const { return embedding_dim_; }
+    std::optional<int64_t> padding_idx() const { return padding_idx_; }
+    DataType dtype() const { return dtype_; }
+
+    // String representation
+    std::string extra_repr() const;
+
+    // Parameter accessor (returns the weight as a Tensor, by value)
+    Tensor weight() const { return weight_; }
+
+protected:
+    // Parameters
+    INFINICORE_NN_PARAMETER(weight);
+
+private:
+    size_t num_embeddings_;   // Vocabulary size
+    size_t embedding_dim_;    // Embedding dimension
+    std::optional<int64_t> padding_idx_;  // Optional padding index
+    DataType dtype_;           // Data type for embedding weights
+};
+
+} // namespace infinicore::nn
diff --git a/include/infinicore/nn/linear.hpp b/include/infinicore/nn/linear.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+#include <cstddef>
+#include <string>
+
+namespace infinicore::nn {
+
+/**
+ * @brief Linear (fully connected) layer
+ *
+ * Computes output = input @ weight.T + bias, optionally adding a residual tensor.
+ * The bias term is optional (see the `bias` constructor flag and has_bias()).
+ */
+class Linear : public Module {
+public:
+    /**
+     * @brief Construct a Linear layer
+     *
+     * @param in_features Number of input features
+     * @param out_features Number of output features
+     * @param bias Whether the layer uses a bias term (default: true)
+     * @param dtype Data type for the parameters (default: DataType::F32)
+     * @param device Device to create the parameters on
+     */
+    Linear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device());
+
+    // Forward pass: output = input @ weight.T + bias
+    // NOTE(review): input is a non-const lvalue reference, so a temporary Tensor cannot
+    // be passed directly -- confirm whether the implementation needs to modify it.
+    Tensor forward(Tensor &input) const;
+
+    // Forward pass with residual connection (InfiniLM-style)
+    // output = input @ weight.T + bias + residual
+    Tensor forward(Tensor &input, Tensor &residual) const;
+
+    // Configuration accessors (return the stored member values)
+    size_t in_features() const { return in_features_; }
+    size_t out_features() const { return out_features_; }
+    bool has_bias() const { return has_bias_; }
+    DataType dtype() const { return dtype_; }
+
+    // String representation
+    std::string extra_repr() const;
+
+    // Parameter accessors (return the parameter as a Tensor, by value)
+    Tensor weight() const { return weight_; }
+    Tensor bias() const { return bias_; }
+
+protected:
+    // Parameters
+    INFINICORE_NN_PARAMETER(weight);
+    INFINICORE_NN_PARAMETER(bias);
+
+private:
+    // Helper method for common forward computation
+    Tensor compute_linear(Tensor &input) const;
+
+    size_t in_features_;
+    size_t out_features_;
+    bool has_bias_;
+    DataType dtype_;
+};
+
+} // namespace infinicore::nn
diff --git a/include/infinicore/nn/module.hpp b/include/infinicore/nn/module.hpp
@@ -0,0 +1,172 @@
+#pragma once
+
+#include "parameter.hpp"
+#include "../tensor.hpp"
+
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace infinicore::nn {
+
+/**
+ * @brief Base class for modules that own named parameters and named submodules
+ *
+ * Derived classes fill both maps through the protected register_* / add_module
+ * helpers, or through the INFINICORE_NN_* convenience macros defined in this header.
+ *
+ * NOTE(review): the class has no virtual destructor. Submodules are held through
+ * std::shared_ptr, which remembers the concrete deleter, but destroying a derived
+ * module through a Module* or std::unique_ptr<Module> would be undefined behavior.
+ */
+class Module {
+public:
+    Module() = default;
+
+    // Returns a name -> Parameter map.
+    // NOTE(review): presumably also covers submodule parameters (see the private
+    // collect_all_parameters helper) -- confirm against the implementation.
+    const std::unordered_map<std::string, Parameter> &state_dict() const;
+
+    // Loads parameter values from a name -> Tensor map.
+    void load_state_dict(const std::unordered_map<std::string, Tensor> &_state_dict);
+
+    // Loads a single parameter, identified by name, from a Tensor.
+    void load_parameter(const std::string &name, const Tensor &param);
+
+    // Loads a single parameter, identified by name, from a raw memory blob.
+    // NOTE(review): no size is passed, so the blob presumably has to match the
+    // parameter's own byte size -- confirm against Parameter::load_blob.
+    void load_parameter_from_blob(const std::string &name, const void *data);
+
+protected:
+    // Registers a parameter under the given name and returns it as a Tensor.
+    Tensor register_parameter(const std::string &name, Parameter param);
+
+    // Add an existing submodule to this module's hierarchy
+    // Template parameter M must be a type derived from Module
+    // Returns the submodule for convenience (allows method chaining)
+    // An existing entry with the same name is overwritten
+    template <typename M>
+    std::shared_ptr<M> add_module(const std::string &name, std::shared_ptr<M> submodule) {
+        // Ensure M is derived from Module (compile-time check)
+        static_assert(std::is_base_of<Module, M>::value,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        // Store in the submodules map (std::shared_ptr<M> automatically converts to std::shared_ptr<Module>)
+        submodules_[name] = submodule;
+
+        return submodule;
+    }
+
+    // Create and register a new submodule by constructing it with the given arguments
+    // Template parameter M must be a type derived from Module
+    // Args are forwarded to M's constructor
+    template <typename M, typename... Args>
+    std::shared_ptr<M> register_module(const std::string &name, Args &&...args) {
+        // Ensure M is derived from Module (compile-time check)
+        static_assert(std::is_base_of<Module, M>::value,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        // Construct the submodule
+        auto submodule = std::make_shared<M>(std::forward<Args>(args)...);
+
+        return add_module(name, submodule);
+    }
+
+    // Create and register multiple submodules of the same type
+    // Each submodule is named as "name.0", "name.1", etc.
+    // Template parameter M must be a type derived from Module
+    // Every submodule is constructed from the same arguments, so they are passed on
+    // as lvalues (copied into each constructor call) and never moved from
+    template <typename M, typename... Args>
+    std::vector<std::shared_ptr<M>> register_modules(size_t count, const std::string &name, Args &&...args) {
+        static_assert(std::is_base_of<Module, M>::value,
+                      "Template parameter M must be derived from infinicore::nn::Module");
+
+        std::vector<std::shared_ptr<M>> modules;
+        modules.reserve(count);
+        for (size_t i = 0; i < count; i++) {
+            // Do not std::forward here: an rvalue argument would be moved from on the first
+            // iteration, leaving every later module to be built from a moved-from value.
+            modules.push_back(register_module<M>(name + "." + std::to_string(i), args...));
+        }
+        return modules;
+    }
+
+protected:
+    Device device_;
+    std::unordered_map<std::string, std::shared_ptr<Module>> submodules_; // Keyed by submodule name
+    std::unordered_map<std::string, Parameter> parameters_;
+
+private:
+    // Gathers parameters into all_params; prefix defaults to the empty string.
+    // NOTE(review): presumably recurses into submodules_ with a dotted prefix -- confirm.
+    void collect_all_parameters(std::unordered_map<std::string, Parameter> &all_params, const std::string &prefix = "") const;
+};
+
+// ============================================================================
+// PyTorch-like Macros for Convenient Module Registration
+// ============================================================================
+
+/**
+ * @brief Register submodules with automatic name inference from variable name
+ *
+ * Usage:
+ * @code
+ *   class MyModel : public Module {
+ *   protected:
+ *       INFINICORE_NN_MODULE(Linear, layer1);
+ *       INFINICORE_NN_MODULE(Linear, layer2);
+ *       INFINICORE_NN_MODULE_VEC(Linear, layers);
+ *       INFINICORE_NN_PARAMETER(scaling_factor);
+ *
+ *   public:
+ *       MyModel() {
+ *           INFINICORE_NN_MODULE_INIT(layer1, 128, 64);
+ *           INFINICORE_NN_MODULE_INIT(layer2, 64, 32);
+ *           INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 32, 16);
+ *           INFINICORE_NN_PARAMETER_INIT(scaling_factor, ({1}, DataType::F32, Device()));
+ *       }
+ *   };
+ * @endcode
+ */
+
+// Declare a single module member variable
+#define INFINICORE_NN_MODULE(ModuleType, name) \
+    std::shared_ptr<ModuleType> name##_
+
+// Declare a vector of modules member variable
+#define INFINICORE_NN_MODULE_VEC(ModuleType, name) \
+    std::vector<std::shared_ptr<ModuleType>> name##_
+
+// Initialize a module in constructor
+#define INFINICORE_NN_MODULE_INIT(name, ...) \
+    name##_ = this->register_module<std::remove_reference<decltype(*name##_)>::type>(#name, ##__VA_ARGS__)
+
+// Initialize a vector of modules in constructor
+// Usage: INFINICORE_NN_MODULE_VEC_INIT(layers, count, ModuleType, ctor_args...)
+// Example: INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 128, 64)
+#define INFINICORE_NN_MODULE_VEC_INIT(name, count, ModuleType, ...) \
+    name##_ = this->register_modules<ModuleType>(count, #name, ##__VA_ARGS__)
+
+// Declare a parameter member variable
+#define INFINICORE_NN_PARAMETER(name) \
+    infinicore::nn::Parameter name##_
+
+// Initialize a parameter in constructor
+// Usage: INFINICORE_NN_PARAMETER_INIT(name, (shape, dtype, device))
+// Example: INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, DataType::F32, device))
+// Wrapped in do { ... } while (0) so both statements act as one statement, e.g. under
+// an unbraced if/else. The invocation must be followed by a semicolon.
+#define INFINICORE_NN_PARAMETER_INIT(name, args)      \
+    do {                                              \
+        name##_ = infinicore::nn::Parameter args;     \
+        this->register_parameter(#name, name##_);     \
+    } while (0)
+
+} // namespace infinicore::nn
diff --git a/include/infinicore/nn/parameter.hpp b/include/infinicore/nn/parameter.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "../tensor.hpp"
+
+namespace infinicore::nn {
+
+/**
+ * @brief Tensor subtype used by Module to hold named parameters
+ */
+class Parameter : public Tensor {
+public:
+    // Default constructor
+    Parameter();
+
+    // Construct a parameter from a shape, a data type and a device
+    Parameter(const Shape &shape, const DataType &dtype, const Device &device);
+
+    // Load the parameter's contents from a raw memory blob
+    // NOTE(review): no size is passed, so `data` presumably has to cover the whole
+    // tensor -- confirm against the implementation.
+    void load_blob(const void *data);
+};
+
+} // namespace infinicore::nn
diff --git a/include/infinicore/nn/rmsnorm.hpp b/include/infinicore/nn/rmsnorm.hpp
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "module.hpp"
+#include "../ops.hpp"
+#include <cstddef>
+#include <string>
+
+namespace infinicore::nn {
+
+/**
+ * @brief Root Mean Square Layer Normalization (RMSNorm)
+ *
+ * Applies Root Mean Square Layer Normalization over the last dimension.
+ * Unlike LayerNorm, RMSNorm doesn't subtract mean and doesn't use bias.
+ *
+ * Formula: y = (x / RMS(x)) * weight
+ * where RMS(x) = sqrt(mean(x^2) + eps)
+ *
+ * Used in LLaMA, Galactica, and other modern language models as a
+ * simpler and faster alternative to LayerNorm.
+ *
+ * Example:
+ * @code
+ *   // Create RMSNorm for hidden size 4096
+ *   auto norm = RMSNorm(4096);
+ *
+ *   // Input: [batch, seq_len, hidden_size]
+ *   auto input = Tensor::randn({2, 10, 4096});
+ *
+ *   // Output: [batch, seq_len, hidden_size]
+ *   auto output = norm.forward(input);
+ * @endcode
+ */
+class RMSNorm : public Module {
+public:
+    /**
+     * @brief Construct a RMSNorm layer
+     *
+     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
+     * @param eps Small constant for numerical stability (default: 1e-6)
+     * @param dtype Data type for the weight (default: DataType::F32)
+     * @param device Device to create the weight on
+     */
+    RMSNorm(size_t normalized_shape,
+            double eps = 1e-6,
+            const DataType &dtype = DataType::F32,
+            const Device &device = Device());
+
+    /**
+     * @brief Forward pass: apply RMSNorm
+     *
+     * @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions
+     * @return Normalized tensor with same shape as input
+     *
+     * The normalization is applied over the last dimension.
+     * For example:
+     *   Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
+     *   Input: [batch, hidden_size] -> normalize over hidden_size
+     */
+    Tensor forward(const Tensor &x) const;
+
+    // Configuration accessors (return the stored member values)
+    size_t normalized_shape() const { return normalized_shape_; }
+    double eps() const { return eps_; }
+    DataType dtype() const { return dtype_; }
+
+    // String representation
+    std::string extra_repr() const;
+
+    // Parameter accessor (returns the weight as a Tensor, by value)
+    Tensor weight() const { return weight_; }
+
+protected:
+    // Parameters
+    INFINICORE_NN_PARAMETER(weight);
+
+private:
+    size_t normalized_shape_;  // Size of the feature dimension
+    double eps_;               // Epsilon for numerical stability
+    DataType dtype_;           // Data type for weight
+};
+
+} // namespace infinicore::nn