Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@v5

- name: Setup Go
uses: actions/setup-go@v5
uses: actions/setup-go@v6
with:
go-version: '~1.24'

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ jobs:
strategy:
matrix:
# from https://github.com/kubernetes-sigs/controller-tools/blob/main/envtest-releases.yaml
envtest_k8s_version: [1.23.5, 1.33.0]
envtest_k8s_version: [1.23.5, 1.34.0]
steps:
- name: Clone the code
uses: actions/checkout@v5

- name: Setup Go
uses: actions/setup-go@v5
uses: actions/setup-go@v6
with:
go-version: '~1.24'

Expand Down
4 changes: 4 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"clientcmdapi",
"clientgoscheme",
"clientset",
"clientsetfake",
"cloudnative",
"cloudprovider",
"clusterissuers",
Expand All @@ -46,6 +47,7 @@
"envtest",
"essd",
"Eventf",
"featuregate",
"finalizer",
"Finalizers",
"frameworkruntime",
Expand Down Expand Up @@ -78,6 +80,8 @@
"iface",
"imageutils",
"influxdata",
"internalcache",
"internalqueue",
"jsonpatch",
"karpenter",
"karpv",
Expand Down
8 changes: 6 additions & 2 deletions api/v1/gpuresourcequota_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package v1
import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
fwk "k8s.io/kube-scheduler/framework"
)

// GPUResourceQuotaSpec defines the desired state of GPUResourceQuota
Expand Down Expand Up @@ -188,6 +188,10 @@ type AllocRequest struct {
PodMeta metav1.ObjectMeta
}

// Clone returns the receiver itself as fwk.StateData. No copy is made, so the
// returned value aliases the original AllocRequest.
func (r *AllocRequest) Clone() fwk.StateData {
	return r
}

type GPUAllocationInfo struct {
Request Resource `json:"request,omitempty"`
Limit Resource `json:"limit,omitempty"`
Expand All @@ -203,7 +207,7 @@ type AdjustRequest struct {
NewLimit Resource
}

func (ar *AllocRequest) Clone() framework.StateData {
// Clone returns the receiver itself as fwk.StateData. No copy is made, so the
// returned value aliases the original AdjustRequest.
func (r *AdjustRequest) Clone() fwk.StateData {
	return r
}

Expand Down
2 changes: 1 addition & 1 deletion charts/tensor-fusion/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.5.6
version: 1.5.7

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
4 changes: 2 additions & 2 deletions charts/tensor-fusion/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ controller:
image:
repository: tensorfusion/tensor-fusion-operator
# Overrides the image tag whose default is the chart appVersion.
tag: "latest"
tag: "1.43.4"
# This is for setting Kubernetes Annotations to a Pod.
# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/

Expand Down Expand Up @@ -120,7 +120,7 @@ agent:

image:
repository: tensorfusion/tensor-fusion-agent
tag: "latest"
tag: "1.0.0"

resources:
requests:
Expand Down
55 changes: 32 additions & 23 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,6 @@ import (

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.

"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"

"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"k8s.io/kubernetes/cmd/kube-scheduler/app"
"k8s.io/kubernetes/pkg/scheduler"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"

"sigs.k8s.io/yaml"

tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
"github.com/NexusGPU/tensor-fusion/cmd/sched"
"github.com/NexusGPU/tensor-fusion/internal/alert"
Expand All @@ -65,6 +44,25 @@ import (
"github.com/NexusGPU/tensor-fusion/internal/utils"
"github.com/NexusGPU/tensor-fusion/internal/version"
webhookcorev1 "github.com/NexusGPU/tensor-fusion/internal/webhook/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
k8sVer "k8s.io/apimachinery/pkg/util/version"
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/kubernetes"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/klog/v2"
"k8s.io/kubernetes/cmd/kube-scheduler/app"
"k8s.io/kubernetes/pkg/scheduler"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
"sigs.k8s.io/yaml"
// +kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -204,6 +202,14 @@ func main() {
_ = os.Setenv(constants.KubeApiVersionMajorEnv, version.Major)
_ = os.Setenv(constants.KubeApiVersionMinorEnv, version.Minor)

// TODO: a risk remains once a feature has been stable long enough for its FeatureGate to be removed.
// To stay compatible with long-lived Kubernetes versions, the Kubernetes source code would need to be patched.
k8sVersion := k8sVer.MustParseSemantic(version.String())
err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion)
if err != nil {
setupLog.Error(err, "unable to set k8s version for feature gating")
}

alertEvaluatorReady = make(chan struct{})
setupTimeSeriesAndWatchGlobalConfigChanges(ctx, mgr)

Expand All @@ -221,7 +227,7 @@ func main() {
pricingProvider := pricing.NewStaticPricingProvider()
startWebhook(mgr, portAllocator, pricingProvider)

scheduler := startScheduler(ctx, allocator, mgr)
scheduler := startScheduler(ctx, allocator, mgr, k8sVersion)

startCustomResourceController(ctx, mgr, metricsRecorder, allocator, portAllocator)

Expand Down Expand Up @@ -461,6 +467,7 @@ func startScheduler(
ctx context.Context,
allocator *gpuallocator.GpuAllocator,
mgr manager.Manager,
k8sVersion *k8sVer.Version,
) *scheduler.Scheduler {
if os.Getenv(constants.EnableSchedulerEnv) == constants.FalseStringValue {
return nil
Expand All @@ -479,7 +486,9 @@ func startScheduler(
gpuTopoPlugin.NewWithDeps(allocator, mgr.GetClient()),
)

cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, false, gpuResourceFitOpt, gpuTopoOpt)
cc, scheduler, err := sched.SetupScheduler(
ctx, mgr, schedulerConfigPath, false, k8sVersion, gpuResourceFitOpt, gpuTopoOpt,
)
if err != nil {
setupLog.Error(err, "unable to create tensor fusion scheduler")
os.Exit(1)
Expand Down
9 changes: 9 additions & 0 deletions cmd/sched/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"strings"

utilerrors "k8s.io/apimachinery/pkg/util/errors"
k8sVer "k8s.io/apimachinery/pkg/util/version"
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/events"
"k8s.io/component-base/configz"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -50,6 +52,7 @@ func SetupScheduler(
mgr manager.Manager,
schedulerConfigPath string,
disableHttpEndpoint bool,
k8sVersion *k8sVer.Version,
outOfTreeRegistryOptions ...app.Option,
) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
opts := options.NewOptions()
Expand All @@ -73,6 +76,12 @@ func SetupScheduler(
return nil, nil, err
}

// Set the emulation version again since it's overridden by the config
err = feature.DefaultMutableFeatureGate.SetEmulationVersion(k8sVersion)
if err != nil {
return nil, nil, err
}

if cfg, err := latest.Default(); err != nil {
return nil, nil, err
} else {
Expand Down
Loading
Loading