Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/infinicore.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#pragma once

#include "infinicore/nn.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/tensor.hpp"
5 changes: 5 additions & 0 deletions include/infinicore/nn.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

#include "nn/embedding.hpp"
#include "nn/linear.hpp"
#include "nn/rmsnorm.hpp"
87 changes: 87 additions & 0 deletions include/infinicore/nn/embedding.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#pragma once

#include "module.hpp"
#include "../ops.hpp"
#include <optional>

namespace infinicore::nn {

/**
 * @brief Table-lookup layer that converts integer indices into dense vectors.
 *
 * The layer keeps one registered parameter (`weight`) covering a fixed-size
 * vocabulary; forward() returns the vectors selected by an index tensor.
 * The interface follows PyTorch's nn.Embedding:
 * https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
 *
 * Usage sketch:
 * @code
 * Embedding table(10000, 300);          // 10000 entries, 300 values per entry
 * Tensor vectors = table.forward(ids);  // ids: [2, 5] -> vectors: [2, 5, 300]
 * @endcode
 */
class Embedding : public Module {
public:
    /**
     * @brief Build an embedding table.
     *
     * @param num_embeddings Number of entries in the table (vocabulary size)
     * @param embedding_dim  Length of each embedding vector
     * @param padding_idx    Optional index of a padding entry. Intended to mirror
     *                       PyTorch, where that entry receives no gradient and is
     *                       not updated during training; how it is honoured is up
     *                       to the out-of-line implementation.
     * @param dtype          Element type of the weight (defaults to DataType::F32)
     * @param device         Device on which the weight is created
     */
    Embedding(size_t num_embeddings, size_t embedding_dim,
              std::optional<int64_t> padding_idx = std::nullopt,
              const DataType &dtype = DataType::F32,
              const Device &device = Device());

    /**
     * @brief Look up the embedding vector for every index in @p indices.
     *
     * @param indices Index tensor of arbitrary shape (*), commonly
     *                [batch_size] or [batch_size, seq_len]
     * @return Tensor of shape (*, embedding_dim), e.g.
     *         [2, 3] -> [2, 3, embedding_dim] and [10] -> [10, embedding_dim]
     */
    Tensor forward(const Tensor &indices) const;

    /** @brief Extra, module-specific text for a string representation. */
    std::string extra_repr() const;

    // --- Configuration getters -------------------------------------------
    size_t num_embeddings() const {
        return num_embeddings_;
    }
    size_t embedding_dim() const {
        return embedding_dim_;
    }
    std::optional<int64_t> padding_idx() const {
        return padding_idx_;
    }
    DataType dtype() const {
        return dtype_;
    }

    // --- Parameter getter ------------------------------------------------
    // Returned by value as a Tensor (the Parameter is converted to its base type).
    Tensor weight() const {
        return weight_;
    }

protected:
    // The embedding table itself; declares the member `weight_`.
    INFINICORE_NN_PARAMETER(weight);

private:
    size_t num_embeddings_;              // number of table entries
    size_t embedding_dim_;               // length of one embedding vector
    std::optional<int64_t> padding_idx_; // padding entry, if any
    DataType dtype_;                     // element type of the weight
};

} // namespace infinicore::nn
47 changes: 47 additions & 0 deletions include/infinicore/nn/linear.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#pragma once

#include "module.hpp"
#include "../ops.hpp"

namespace infinicore::nn {

/**
 * @brief Fully connected layer: output = input @ weight.T + bias.
 *
 * Owns a `weight` parameter and a `bias` parameter slot. Whether the bias is
 * actually used is recorded in has_bias(); the exact handling of a disabled
 * bias lives in the out-of-line implementation.
 */
class Linear : public Module {
public:
    /**
     * @brief Construct a Linear layer.
     *
     * @param in_features  Size of each input sample's feature dimension
     * @param out_features Size of each output sample's feature dimension
     * @param bias         Whether the layer uses an additive bias (default: true)
     * @param dtype        Data type for the parameters (default: DataType::F32)
     * @param device       Device to create the parameters on
     */
    Linear(size_t in_features, size_t out_features, bool bias = true, const DataType &dtype = DataType::F32, const Device &device = Device());

    /**
     * @brief Forward pass: output = input @ weight.T + bias.
     *
     * NOTE(review): `input` is taken by non-const reference, unlike the
     * `const Tensor &` used by Embedding::forward and RMSNorm::forward --
     * confirm whether the implementation really needs a mutable tensor.
     */
    Tensor forward(Tensor &input) const;

    /**
     * @brief Forward pass with residual connection (InfiniLM-style):
     *        output = input @ weight.T + bias + residual.
     */
    Tensor forward(Tensor &input, Tensor &residual) const;

    // Module information
    size_t in_features() const { return in_features_; }
    size_t out_features() const { return out_features_; }
    bool has_bias() const { return has_bias_; }
    DataType dtype() const { return dtype_; }

    /** @brief Extra, module-specific text for a string representation. */
    std::string extra_repr() const;

    // Accessors for parameters (returned by value as Tensor).
    Tensor weight() const { return weight_; }
    // NOTE(review): what this returns when has_bias() is false depends on how
    // the constructor leaves `bias_` -- presumably a default-constructed
    // Parameter; confirm before relying on it.
    Tensor bias() const { return bias_; }

protected:
    // Parameters; these declare the members `weight_` and `bias_`.
    INFINICORE_NN_PARAMETER(weight);
    INFINICORE_NN_PARAMETER(bias);

private:
    // Helper method for common forward computation shared by both forward() overloads
    Tensor compute_linear(Tensor &input) const;

    size_t in_features_;
    size_t out_features_;
    bool has_bias_;
    DataType dtype_;
};

} // namespace infinicore::nn
137 changes: 137 additions & 0 deletions include/infinicore/nn/module.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#pragma once

#include "parameter.hpp"
#include "../tensor.hpp"

#include <memory>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

namespace infinicore::nn {
/**
 * @brief Base class for neural-network building blocks.
 *
 * A Module keeps two name-keyed registries: its own parameters and its child
 * modules (held through std::shared_ptr<Module>). Derived classes populate
 * them with register_parameter(), add_module(), register_module() and
 * register_modules().
 *
 * NOTE(review): the class has no virtual destructor. Submodules created via
 * std::make_shared<M> are destroyed correctly because shared_ptr remembers the
 * concrete type, but deleting a derived module through a raw `Module *` would
 * be undefined behaviour -- confirm that never happens.
 */
class Module {
public:
    Module() = default;

    // Returns a name -> Parameter map by const reference.
    // NOTE(review): presumably this also covers parameters of submodules (see
    // collect_all_parameters); the implementation is not visible in this header.
    const std::unordered_map<std::string, Parameter> &state_dict() const;

    // Loads several parameters at once from a name -> Tensor map.
    void load_state_dict(const std::unordered_map<std::string, Tensor> &_state_dict);

    // Loads the parameter called `name` from the tensor `param`.
    void load_parameter(const std::string &name, const Tensor &param);

    // Loads the parameter called `name` from raw memory.
    // NOTE(review): the expected size/layout of `data` is not visible here --
    // presumably it must match the parameter's shape and dtype; confirm.
    void load_parameter_from_blob(const std::string &name, const void *data);

protected:
    // Registers `param` under `name` and returns it as a Tensor.
    Tensor register_parameter(const std::string &name, Parameter param);

    // Add an existing submodule to this module's hierarchy.
    // Template parameter M must be a type derived from Module.
    // An entry already stored under `name` is replaced.
    // Returns the submodule for convenience (allows method chaining).
    template <typename M>
    std::shared_ptr<M> add_module(const std::string &name, std::shared_ptr<M> submodule) {
        // Ensure M is derived from Module (compile-time check)
        static_assert(std::is_base_of<Module, M>::value,
                      "Template parameter M must be derived from infinicore::nn::Module");

        // Store in the submodules map (std::shared_ptr<M> automatically converts to std::shared_ptr<Module>)
        submodules_[name] = submodule;

        return submodule;
    }

    // Create and register a new submodule by constructing it with the given arguments.
    // Template parameter M must be a type derived from Module.
    // Args are perfectly forwarded to M's constructor.
    template <typename M, typename... Args>
    std::shared_ptr<M> register_module(const std::string &name, Args &&...args) {
        // Ensure M is derived from Module (compile-time check)
        static_assert(std::is_base_of<Module, M>::value,
                      "Template parameter M must be derived from infinicore::nn::Module");

        // Construct the submodule
        auto submodule = std::make_shared<M>(std::forward<Args>(args)...);

        return add_module(name, submodule);
    }

    // Create and register `count` submodules of the same type.
    // Each submodule is named "name.0", "name.1", etc. and is constructed from
    // the same arguments.
    // Template parameter M must be a type derived from Module.
    template <typename M, typename... Args>
    std::vector<std::shared_ptr<M>> register_modules(size_t count, const std::string &name, Args &&...args) {
        static_assert(std::is_base_of<Module, M>::value,
                      "Template parameter M must be derived from infinicore::nn::Module");

        std::vector<std::shared_ptr<M>> modules;
        modules.reserve(count);
        for (size_t i = 0; i < count; i++) {
            // The arguments are reused on every iteration, so they are passed
            // as lvalues. Forwarding them here would let the first constructor
            // move from rvalue arguments and hand moved-from values to every
            // later submodule.
            modules.push_back(register_module<M>(name + "." + std::to_string(i), args...));
        }
        return modules;
    }

protected:
    Device device_;
    std::unordered_map<std::string, std::shared_ptr<Module>> submodules_; // child modules by name
    std::unordered_map<std::string, Parameter> parameters_;               // this module's own parameters by name

private:
    // Gathers parameters into `all_params`, using `prefix` to build the keys.
    // NOTE(review): presumably recurses into submodules_ -- confirm in the .cpp.
    void collect_all_parameters(std::unordered_map<std::string, Parameter> &all_params, const std::string &prefix = "") const;
};

// ============================================================================
// PyTorch-like Macros for Convenient Module Registration
// ============================================================================

/**
* @brief Register submodules with automatic name inference from variable name
*
* Usage:
* @code
* class MyModel : public Module {
* protected:
* INFINICORE_NN_MODULE(Linear, layer1);
* INFINICORE_NN_MODULE(Linear, layer2);
* INFINICORE_NN_MODULE_VEC(Linear, layers);
* INFINICORE_NN_PARAMETER(scaling_factor);
*
* public:
* MyModel() {
* INFINICORE_NN_MODULE_INIT(layer1, 128, 64);
* INFINICORE_NN_MODULE_INIT(layer2, 64, 32);
* INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 32, 16);
* INFINICORE_NN_PARAMETER_INIT(scaling_factor, ({1}, DataType::F32, Device()));
* }
* };
* @endcode
*/

// Declare a single module member variable.
// Expands to `std::shared_ptr<ModuleType> <name>_` (note the appended
// underscore); the caller writes the terminating semicolon. This only declares
// the pointer member -- nothing is constructed or registered by this macro.
#define INFINICORE_NN_MODULE(ModuleType, name) \
std::shared_ptr<ModuleType> name##_

// Declare a vector of modules member variable.
// Expands to `std::vector<std::shared_ptr<ModuleType>> <name>_` (note the
// appended underscore); the caller writes the terminating semicolon. The vector
// is only declared here, not filled.
#define INFINICORE_NN_MODULE_VEC(ModuleType, name) \
std::vector<std::shared_ptr<ModuleType>> name##_

// Initialize a module in constructor.
// Assigns to `<name>_` the result of
// this->register_module<T>("<name>", ctor_args...), where T is the pointee type
// recovered from the member itself via decltype(*<name>_). The submodule is
// registered under the stringified `name`, i.e. without the trailing underscore.
// `, ##__VA_ARGS__` drops the preceding comma when no constructor arguments are
// given; this is a GNU-style compiler extension, not standard C++17.
// NOTE(review): inside a class template this expansion would presumably need
// `this->template register_module` and `typename` -- confirm before using the
// macro in templated modules.
#define INFINICORE_NN_MODULE_INIT(name, ...) \
name##_ = this->register_module<std::remove_reference<decltype(*name##_)>::type>(#name, ##__VA_ARGS__)

// Initialize a vector of modules in constructor.
// Assigns to `<name>_` the result of
// this->register_modules<ModuleType>(count, "<name>", ctor_args...), so every
// element is built from the same constructor arguments and registered under the
// stringified `name` plus an index suffix.
// Unlike INFINICORE_NN_MODULE_INIT, the module type has to be spelled out.
// `, ##__VA_ARGS__` drops the preceding comma when no constructor arguments are
// given; this is a GNU-style compiler extension, not standard C++17.
// Usage: INFINICORE_NN_MODULE_VEC_INIT(layers, count, ModuleType, ctor_args...)
// Example: INFINICORE_NN_MODULE_VEC_INIT(layers, 3, Linear, 128, 64)
#define INFINICORE_NN_MODULE_VEC_INIT(name, count, ModuleType, ...) \
name##_ = this->register_modules<ModuleType>(count, #name, ##__VA_ARGS__)

Comment thread
PanZezhong1725 marked this conversation as resolved.
// Declare a parameter member variable.
// Expands to `infinicore::nn::Parameter <name>_` (note the appended underscore);
// the caller writes the terminating semicolon. The type is fully qualified, so
// the macro can be used from any namespace. Nothing is registered by this macro.
#define INFINICORE_NN_PARAMETER(name) \
infinicore::nn::Parameter name##_

// Initialize a parameter in constructor.
// Constructs an infinicore::nn::Parameter from `args`, assigns it to `<name>_`
// and registers it with this->register_parameter under the stringified `name`
// (without the trailing underscore).
// `args` must be the parenthesized constructor argument list, so that the commas
// and braces inside it are passed to the macro as a single argument.
// The two statements are wrapped in do { ... } while (0) so the macro behaves as
// one statement: under an unbraced `if`/`for` both the assignment and the
// registration are guarded, not just the assignment.
// Usage: INFINICORE_NN_PARAMETER_INIT(name, (shape, dtype, device));
// Example: INFINICORE_NN_PARAMETER_INIT(weight, ({out_features, in_features}, DataType::F32, device));
#define INFINICORE_NN_PARAMETER_INIT(name, args)      \
    do {                                              \
        name##_ = infinicore::nn::Parameter args;     \
        this->register_parameter(#name, name##_);     \
    } while (0)

} // namespace infinicore::nn
16 changes: 16 additions & 0 deletions include/infinicore/nn/parameter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include "../tensor.hpp"

namespace infinicore::nn {
/**
 * @brief Tensor subclass used for the parameters held by nn modules.
 *
 * Adds construction from (shape, dtype, device) and loading from raw memory
 * on top of the Tensor interface it inherits publicly.
 */
class Parameter : public Tensor {
public:
    /**
     * @brief Default constructor.
     *
     * The resulting state is defined by the out-of-line implementation.
     */
    Parameter();

    /**
     * @brief Construct a parameter with the given shape, element type and device.
     *
     * @param shape  Shape of the parameter tensor
     * @param dtype  Element data type
     * @param device Device the parameter is created on
     */
    Parameter(const Shape &shape, const DataType &dtype, const Device &device);

    /**
     * @brief Load the parameter's contents from a raw memory blob.
     *
     * @param data Pointer to the source bytes.
     *             NOTE(review): the required size and layout are not visible in
     *             this header -- presumably they must match the parameter's
     *             shape and dtype; confirm against the implementation.
     */
    void load_blob(const void *data);
};
} // namespace infinicore::nn
81 changes: 81 additions & 0 deletions include/infinicore/nn/rmsnorm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#pragma once

#include "module.hpp"
#include "../ops.hpp"

namespace infinicore::nn {

/**
 * @brief Root Mean Square Layer Normalization (RMSNorm)
 *
 * Normalizes over the last dimension of the input. Unlike LayerNorm, RMSNorm
 * neither subtracts the mean nor adds a bias; the only parameter is a
 * per-feature scale (`weight`).
 *
 * Formula: y = (x / RMS(x)) * weight
 *          where RMS(x) = sqrt(mean(x^2) + eps)
 *
 * Used in LLaMA, Galactica, and other modern language models as a
 * simpler and faster alternative to LayerNorm.
 *
 * Example:
 * @code
 * // Create RMSNorm for hidden size 4096
 * RMSNorm norm(4096);
 *
 * // input:  [batch, seq_len, hidden_size]
 * // output: [batch, seq_len, hidden_size]
 * Tensor output = norm.forward(input);
 * @endcode
 */
class RMSNorm : public Module {
public:
    /**
     * @brief Construct a RMSNorm layer
     *
     * Declared `explicit` because the constructor is callable with a single
     * argument; without it a bare integer would silently convert to an RMSNorm.
     *
     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
     * @param eps Small constant for numerical stability (default: 1e-6)
     * @param dtype Data type for the weight (default: DataType::F32)
     * @param device Device to create the weight on
     */
    explicit RMSNorm(size_t normalized_shape,
                     double eps = 1e-6,
                     const DataType &dtype = DataType::F32,
                     const Device &device = Device());

    /**
     * @brief Forward pass: apply RMSNorm
     *
     * @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions
     * @return Normalized tensor with same shape as input
     *
     * The normalization is applied over the last dimension.
     * For example:
     * Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
     * Input: [batch, hidden_size] -> normalize over hidden_size
     */
    Tensor forward(const Tensor &x) const;

    // Module information
    size_t normalized_shape() const { return normalized_shape_; }
    double eps() const { return eps_; }
    DataType dtype() const { return dtype_; }

    /** @brief Extra, module-specific text for a string representation. */
    std::string extra_repr() const;

    // Accessors for parameters (returned by value as Tensor)
    Tensor weight() const { return weight_; }

protected:
    // Parameters; declares the member `weight_`
    INFINICORE_NN_PARAMETER(weight);

private:
    size_t normalized_shape_; // Size of the feature dimension
    double eps_;              // Epsilon for numerical stability
    DataType dtype_;          // Data type for weight
};

} // namespace infinicore::nn
Loading