From 66d0dd1f10abb7adfa88cf71f7d16c55dcad0b47 Mon Sep 17 00:00:00 2001 From: naman-msft Date: Tue, 24 Jun 2025 14:02:49 -0700 Subject: [PATCH 1/2] added new batch of exec docs as of june 24, 2025 and tested them ready for PR --- .../aksdns-lookup-fail-error.md | 91 ++ ...luster-service-health-probe-mode-issues.md | 244 ++++ .../node-not-ready-after-being-healthy.md | 181 +++ ...ot-ready-custom-script-extension-errors.md | 150 +++ ...ent-ip-address-cannot-access-api-server.md | 122 ++ ...meouts-dial-tcp-nodeip-10250-io-timeout.md | 50 + ...tl-third-party-tools-connect-api-server.md | 143 ++ ...ot-cluster-connection-issues-api-server.md | 90 ++ .../user-cannot-get-cluster-resources.md | 107 ++ ...le-cluster-autoscaler-enabled-node-pool.md | 72 ++ ...-code-badrequest-or-invalidclientsecret.md | 73 ++ ...code-cnidownloadtimeoutvmextensionerror.md | 131 ++ .../upgrading-or-scaling-does-not-succeed.md | 79 ++ .../aks-cost-analysis-add-on-issues.md | 122 ++ scenarios/UseIGOnAKS/alert-bad-process.yaml | 13 + .../ama-metrics-settings-configmap.yaml | 84 ++ scenarios/UseIGOnAKS/use-ig-on-aks.md | 198 +++ .../aks/access-control-managed-azure-ad.md | 89 ++ .../articles/aks/access-private-cluster.md | 215 ++++ .../articles/aks/aks-migration.md | 308 +++++ .../concepts-network-azure-cni-pod-subnet.md | 133 ++ .../aks/concepts-preview-api-life-cycle.md | 80 ++ .../articles/aks/delete-cluster.md | 74 ++ .../articles/aks/enable-host-encryption.md | 103 ++ .../azure-aks-docs/articles/aks/events.md | 131 ++ .../aks/free-standard-pricing-tiers.md | 296 +++++ .../articles/aks/istio-meshconfig.md | 184 +++ .../articles/aks/istio-scale.md | 167 +++ .../articles/aks/kubelet-logs.md | 111 ++ .../articles/aks/nat-gateway.md | 431 +++++++ .../articles/aks/postgresql-ha-overview.md | 92 -- .../articles/aks/resize-cluster.md | 132 ++ .../azure-aks-docs/articles/aks/use-etags.md | 137 ++ .../azure-aks-docs/articles/aks/use-labels.md | 254 ++++ .../articles/ansible/vm-configure.md | 221 ++-- .../articles/iot-edge/quickstart-linux.md | 429 ++++++ scenarios/metadata.json | 1146 +++++++++++++++-- 37 files changed, 6359 insertions(+), 324 deletions(-) create mode 100644 scenarios/AKSDNSLookupFailError/aksdns-lookup-fail-error.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources.md create mode 100644 
scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed.md create mode 100644 scenarios/SupportArticles-docs/support/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues.md create mode 100644 scenarios/UseIGOnAKS/alert-bad-process.yaml create mode 100644 scenarios/UseIGOnAKS/ama-metrics-settings-configmap.yaml create mode 100644 scenarios/UseIGOnAKS/use-ig-on-aks.md create mode 100644 scenarios/azure-aks-docs/articles/aks/access-control-managed-azure-ad.md create mode 100644 scenarios/azure-aks-docs/articles/aks/access-private-cluster.md create mode 100644 scenarios/azure-aks-docs/articles/aks/aks-migration.md create mode 100644 scenarios/azure-aks-docs/articles/aks/concepts-network-azure-cni-pod-subnet.md create mode 100644 scenarios/azure-aks-docs/articles/aks/concepts-preview-api-life-cycle.md create mode 100644 scenarios/azure-aks-docs/articles/aks/delete-cluster.md create mode 100644 scenarios/azure-aks-docs/articles/aks/enable-host-encryption.md create mode 100644 scenarios/azure-aks-docs/articles/aks/events.md create mode 100644 scenarios/azure-aks-docs/articles/aks/free-standard-pricing-tiers.md create mode 100644 scenarios/azure-aks-docs/articles/aks/istio-meshconfig.md create mode 100644 scenarios/azure-aks-docs/articles/aks/istio-scale.md create mode 100644 scenarios/azure-aks-docs/articles/aks/kubelet-logs.md create mode 100644 scenarios/azure-aks-docs/articles/aks/nat-gateway.md delete mode 100644 scenarios/azure-aks-docs/articles/aks/postgresql-ha-overview.md create mode 100644 scenarios/azure-aks-docs/articles/aks/resize-cluster.md create mode 100644 scenarios/azure-aks-docs/articles/aks/use-etags.md create mode 100644 scenarios/azure-aks-docs/articles/aks/use-labels.md create mode 100644 scenarios/azure-docs/articles/iot-edge/quickstart-linux.md diff --git a/scenarios/AKSDNSLookupFailError/aksdns-lookup-fail-error.md b/scenarios/AKSDNSLookupFailError/aksdns-lookup-fail-error.md new file mode 100644 index 000000000..010a06192 --- /dev/null +++ b/scenarios/AKSDNSLookupFailError/aksdns-lookup-fail-error.md @@ -0,0 +1,91 @@ +--- +title: Troubleshoot the K8SAPIServerDNSLookupFailVMExtensionError error code (52) +description: Learn how to troubleshoot the K8SAPIServerDNSLookupFailVMExtensionError error (52) when you try to start or create and deploy an Azure Kubernetes Service (AKS) cluster. +ms.topic: article +ms.date: 06/14/2024 +author: MicrosoftDocsExec +ms.author: MicrosoftDocsExec +ms.custom: sap:Create, Upgrade, Scale and Delete operations (cluster or nodepool), innovation-engine +--- + +# Troubleshoot the K8SAPIServerDNSLookupFailVMExtensionError error code (52) + +This article discusses how to identify and resolve the `K8SAPIServerDNSLookupFailVMExtensionError` error (also known as error code ERR_K8S_API_SERVER_DNS_LOOKUP_FAIL, error number 52) that occurs when you try to start or create and deploy a Microsoft Azure Kubernetes Service (AKS) cluster. 
+ +## Prerequisites + +- The [nslookup](/windows-server/administration/windows-commands/nslookup) DNS lookup tool for Windows nodes or the [dig](https://linuxize.com/post/how-to-use-dig-command-to-query-dns-in-linux/) tool for Linux nodes. + +- [Azure CLI](/cli/azure/install-azure-cli), version 2.0.59 or a later version. If Azure CLI is already installed, you can find the version number by running `az --version`. + +## Symptoms + +When you try to start or create an AKS cluster, you receive the following error message: + +> Agents are unable to resolve Kubernetes API server name. It's likely custom DNS server is not correctly configured, please see for more information. +> +> Details: Code="VMExtensionProvisioningError" +> +> Message="VM has reported a failure when processing extension 'vmssCSE'. +> +> Error message: "**Enable failed: failed to execute command: command terminated with exit status=52**\n[stdout]\n{ +> +> "ExitCode": "52", +> +> "Output": "Fri Oct 15 10:06:00 UTC 2021,aks- nodepool1-36696444-vmss000000\\nConnection to mcr.microsoft.com 443 port [tcp/https] + +## Cause + +The cluster nodes can't resolve the cluster's fully qualified domain name (FQDN) in Azure DNS. Run the following DNS lookup command on the failed cluster node to find DNS resolutions that are valid. + +| Node OS | Command | +| ------- | ------------------------- | +| Linux | `dig ` | +| Windows | `nslookup ` | + +## Solution + +On your DNS servers and firewall, make sure that nothing blocks the resolution to your cluster's FQDN. Your custom DNS server might be incorrectly configured if something is blocking even after you run the `nslookup` or `dig` command and apply any necessary fixes. For help to configure your custom DNS server, review the following articles: + +- [Create a private AKS cluster](/azure/aks/private-clusters) +- [Private Azure Kubernetes service with custom DNS server](https://github.com/Azure/terraform/tree/00d15e09c54f25fb6387330c36aa4366122c5aaa/quickstart/301-aks-private-cluster) +- [What is IP address 168.63.129.16?](/azure/virtual-network/what-is-ip-address-168-63-129-16) + +When you use a private cluster that has a custom DNS, a DNS zone is created. The DNS zone must be linked to the virtual network. This occurs after the cluster is created. Creating a private cluster that has a custom DNS fails during creation. However, you can restore the creation process to a "success" state by reconciling the cluster. To do this, run the [az resource update](/cli/azure/resource#az-resource-update) command in Azure CLI, as follows: + +Below, set your AKS cluster and resource group names, then run the update command to reconcile the cluster. The environment variables will make your resource names unique and are declared just before use. + +```azurecli-interactive +az resource update --resource-group $RESOURCE_GROUP_NAME \ + --name $CLUSTER_NAME \ + --namespace Microsoft.ContainerService \ + --resource-type ManagedClusters +``` + +Results: + + + +```output +{ + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.ContainerService/ManagedClusters/myAksClusterxxx", + "location": "eastus", + "name": "myAksClusterxxx", + "properties": { + // ...other properties... + }, + "resourceGroup": "myResourceGroupxxx", + "type": "Microsoft.ContainerService/ManagedClusters" +} +``` + +Also verify that your DNS server is configured correctly for your private cluster, as described earlier. + +> [!NOTE] +> Conditional Forwarding doesn't support subdomains. 
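
If you want to confirm that name resolution is working after the reconciliation, the following minimal sketch retrieves the cluster's API server FQDN and resolves it with `dig`. It assumes the same `$RESOURCE_GROUP_NAME` and `$CLUSTER_NAME` variables that are set earlier in this article and that the `dig` tool is installed on the machine you run it from; for a private cluster, run it from a machine on a network that's linked to the cluster's private DNS zone.

```bash
# Sketch: confirm that the API server FQDN resolves after reconciliation.
# Assumes RESOURCE_GROUP_NAME and CLUSTER_NAME are set as shown earlier in this article.
API_FQDN=$(az aks show --resource-group $RESOURCE_GROUP_NAME \
    --name $CLUSTER_NAME \
    --query "fqdn" --output tsv)

# For a private cluster, the public fqdn property is empty, so query the private FQDN instead.
if [ -z "$API_FQDN" ] || [ "$API_FQDN" = "null" ]; then
    API_FQDN=$(az aks show --resource-group $RESOURCE_GROUP_NAME \
        --name $CLUSTER_NAME \
        --query "privateFqdn" --output tsv)
fi

# An empty answer from dig means that DNS resolution is still failing from this host.
dig +short "$API_FQDN"
```

If `dig` returns no addresses, something on your DNS path is still blocking resolution of the cluster FQDN, and you should recheck the custom DNS server and firewall configuration described earlier.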
+ +## More information + +- [General troubleshooting of AKS cluster creation issues](troubleshoot-aks-cluster-creation-issues.md) + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues.md new file mode 100644 index 000000000..3386bf637 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues.md @@ -0,0 +1,244 @@ +--- +title: Troubleshoot the health probe mode for AKS cluster service load balancer +description: Diagnoses and fixes common issues with the health probe mode feature. +ms.date: 06/03/2024 +ms.reviewer: niqi, cssakscic, v-weizhu +ms.service: azure-kubernetes-service +ms.custom: sap:Node/node pool availability and performance, devx-track-azurecli, innovation-engine +--- + +# Troubleshoot issues when enabling the AKS cluster service health probe mode + +The health probe mode feature allows you to configure how Azure Load Balancer probes the health of the nodes in your Azure Kubernetes Service (AKS) cluster. You can choose between two modes: Shared and ServiceNodePort. The Shared mode uses a single health probe for all external traffic policy cluster services that use the same load balancer. In contrast, the ServiceNodePort mode uses a separate health probe for each service. The Shared mode can reduce the number of health probes and improve the performance of the load balancer, but it requires some additional components to work properly. To enable this feature, see [How to enable the health probe mode feature using the Azure CLI](#how-to-enable-the-health-probe-mode-feature-using-the-azure-cli). + +This article describes some common issues about using the health probe mode feature in an AKS cluster and helps you troubleshoot and resolve these issues. + +## Symptoms + +When creating or updating an AKS cluster by using the Azure CLI, if you enable the health probe mode feature using the `--cluster-service-load-balancer-health-probe-mode Shared` flag, the following issues occur: + +- The load balancer doesn't distribute traffic to the nodes as expected. + +- The load balancer reports unhealthy nodes even if they're healthy. + +- The health-probe-proxy sidecar container crashes or doesn't start. + +- The cloud-node-manager pod crashes or doesn't start. + +The following operations also happen: + +1. RP frontend checks if the request is valid and updates the corresponding property in the LoadBalancerProfile. + +2. RP async calls the cloud provider config secret reconciler to update the cloud provider config secret based on the LoadBalancerProfile. + +3. Overlaymgr reconciles the cloud-node-manager chart to enable the health-probe-proxy sidecar. + +## Initial troubleshooting + +To troubleshoot these issues, follow these steps: + +0. First, connect to your AKS cluster using the Azure CLI: + + ```azurecli + export RESOURCE_GROUP="aks-rg" + export AKS_CLUSTER_NAME="aks-cluster" + az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER_NAME --overwrite-existing + ``` + +1. Next, check the RP frontend log to see if the health probe mode in the LoadBalancerProfile is properly configured. You can use the `az aks show` command to view the LoadBalancerProfile property of your cluster. 
+ + ```azurecli + export RESOURCE_GROUP="aks-rg" + export AKS_CLUSTER_NAME="aks-cluster" + az aks show --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER_NAME --query "networkProfile.loadBalancerProfile" + ``` + Results: + + + + ```output + { + "clusterServiceLoadBalancerHealthProbeMode": "Shared", + "managedOutboundIPs": null, + "outboundIPs": null, + "outboundIPPrefixes": null, + "allocatedOutboundPorts": null, + "effectiveOutboundIPs": [ + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/MC_aks-rg_aks-cluster_eastus2/providers/Microsoft.Network/publicIPAddresses/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + } + ], + "idleTimeoutInMinutes": 30, + "loadBalancerSku": "standard", + "managedOutboundIPv6": null + } + ``` + +2. Check the cloud provider configuration. In modern AKS clusters, the cloud provider configuration is managed internally and the `ccp` namespace doesn't exist. Instead, check for cloud provider related resources and verify the cloud-node-manager pods are running properly: + + + ```bash + # Check for cloud provider related ConfigMaps in kube-system + kubectl get configmap -n kube-system | grep -i azure + + # Check if cloud-node-manager pods are running (indicates cloud provider integration is working) + kubectl get pods -n kube-system | grep cloud-node-manager + + # Check the azure-ip-masq-agent-config if it exists + kubectl get configmap azure-ip-masq-agent-config-reconciled -n kube-system -o yaml 2>/dev/null || echo "ConfigMap not found" + ``` + Results: + + + + ```output + configmap/azure-ip-masq-agent-config-reconciled 1 11h + + cloud-node-manager-rfb2w 2/2 Running 0 16m + ``` + +3. Check the chart or overlay daemonset cloud-node-manager to see if the health-probe-proxy sidecar container is enabled. You can use the `kubectl get ds` command to view the daemonset. + + ```shell + kubectl get ds -n kube-system cloud-node-manager -o yaml + ``` + Results: + + + + ```output + apiVersion: apps/v1 + kind: DaemonSet + metadata: + name: cloud-node-manager + namespace: kube-system + ... + spec: + template: + spec: + containers: + - name: cloud-node-manager + image: mcr.microsoft.com/oss/kubernetes/azure-cloud-node-manager:xxxxxxxx + - name: health-probe-proxy + image: mcr.microsoft.com/oss/kubernetes/azure-health-probe-proxy:xxxxxxxx + ... + ``` + +## Cause 1: The health probe mode isn't Shared or ServiceNodePort + +The health probe mode feature only works with these two modes. If you use any other mode, the feature won't work. + +### Solution 1: Use the correct health probe mode + +Make sure you use the Shared or ServiceNodePort mode when creating or updating your cluster. You can use the `--cluster-service-load-balancer-health-probe-mode` flag to specify the mode. + +## Cause 2: The toggle for the health probe mode feature is off + +The health probe mode feature is controlled by a toggle that can be enabled or disabled by the AKS team. If the toggle is off, the feature won't work. + +### Solution 2: Turn on the toggle + +Contact the AKS team to check if the toggle for the health probe mode feature is on or off. If it's off, ask them to turn it on for your subscription. + +## Cause 3: The load balancer SKU is Basic + +The health probe mode feature only works with the Standard Load Balancer SKU. If you use the Basic Load Balancer SKU, the feature won't work. + +### Solution 3: Use the Standard Load Balancer SKU + +Make sure you use the Standard Load Balancer SKU when creating or updating your cluster. 
You can use the `--load-balancer-sku` flag to specify the SKU. + +## Cause 4: The feature isn't registered + +The health probe mode feature requires you to register the feature on your subscription. If the feature isn't registered, it won't work. + +### Solution 4: Register the feature + +Make sure you register the feature for your subscription before creating or updating your cluster. You can use the `az feature register` command to register the feature. + +```azurecli +export FEATURE_NAME="EnableSLBSharedHealthProbePreview" +export PROVIDER_NAMESPACE="Microsoft.ContainerService" +az feature register --name $FEATURE_NAME --namespace $PROVIDER_NAMESPACE +``` +Results: + + + +```output +{ + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/providers/Microsoft.Features/providers/Microsoft.ContainerService/features/EnableAKSClusterServiceLoadBalancerHealthProbeMode", + "name": "Microsoft.ContainerService/EnableAKSClusterServiceLoadBalancerHealthProbeMode", + "properties": { + "state": "Registering" + }, + "type": "Microsoft.Features/providers/features" +} +``` + +## Cause 5: The Kubernetes version is earlier than v1.28.0 + +The health probe mode feature requires a minimum Kubernetes version of v1.28.0. If you use an older version, the feature won't work. + +### Solution 5: Upgrade the Kubernetes version + +Make sure you use Kubernetes v1.28.0 or a later version when creating or updating your cluster. You can use the `--kubernetes-version` flag to specify the version. + +## Known issues + +For Windows, the kube-proxy component doesn't start until you create the first non-HPC pod in a node. This issue affects the health probe mode feature and causes the load balancer to report unhealthy nodes. It will be fixed in a future update. + +## How to enable the health probe mode feature using the Azure CLI + +To enable the health probe mode feature, run one of the following commands: + +Enable `ServiceNodePort` health probe mode (default) for a cluster: + +```shell +export RESOURCE_GROUP="aks-rg" +export AKS_CLUSTER_NAME="aks-cluster" +az aks update --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER_NAME --cluster-service-load-balancer-health-probe-mode ServiceNodePort +``` +Results: + +```output +{ + "name": "aks-cluster", + "location": "eastus2", + "resourceGroup": "aks-rg", + "kubernetesVersion": "1.28.x", + "provisioningState": "Succeeded", + "loadBalancerProfile": { + "clusterServiceLoadBalancerHealthProbeMode": "ServiceNodePort", + ... + }, + ... +} +``` + +Enable `Shared` health probe mode for a cluster: + +```shell +export RESOURCE_GROUP="MyAksResourceGroup" +export AKS_CLUSTER_NAME="MyAksCluster" +az aks update --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER_NAME --cluster-service-load-balancer-health-probe-mode Shared +``` + +Results: + +```output +{ + "name": "MyAksCluster", + "location": "eastus2", + "resourceGroup": "MyAksResourceGroup", + "kubernetesVersion": "1.28.x", + "provisioningState": "Succeeded", + "loadBalancerProfile": { + "clusterServiceLoadBalancerHealthProbeMode": "Shared", + ... + }, + ... 
+} +``` + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy.md new file mode 100644 index 000000000..0ece14b57 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy.md @@ -0,0 +1,181 @@ +--- +title: Node Not Ready status after node is in a healthy state +description: Troubleshoot scenarios in which an Azure Kubernetes Service (AKS) cluster node goes to a Not Ready status after is in a healthy state. +ms.date: 08/27/2024 +ms.reviewer: rissing, chiragpa, momajed, v-leedennis +ms.service: azure-kubernetes-service +#Customer intent: As an Azure Kubernetes user, I want to prevent an Azure Kubernetes Service (AKS) cluster node from regressing to a Not Ready status so that I can continue to use the cluster node successfully. +ms.custom: sap:Node/node pool availability and performance, innovation-engine +--- + +# Troubleshoot a change in a healthy node to Not Ready status + +This article discusses a scenario in which the status of an Azure Kubernetes Service (AKS) cluster node changes to **Not Ready** after the node is in a healthy state for some time. This article outlines the particular cause and provides a possible solution. + +## Prerequisites + +- The Kubernetes [kubectl](https://kubernetes.io/docs/reference/kubectl/overview/) tool. To install kubectl by using Azure CLI, run the [az aks install-cli](/cli/azure/aks#az-aks-install-cli) command. +- The Kubernetes [kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) tool. +- The Kubernetes [containerd](https://kubernetes.io/docs/setup/production-environment/container-runtimes/#containerd) tool. +- The following Linux tools: + - [awk](https://man7.org/linux/man-pages/man1/awk.1p.html) + - [head](https://man7.org/linux/man-pages/man1/head.1.html) + - [journalctl](https://man7.org/linux/man-pages/man1/journalctl.1.html) + - [ps](https://man7.org/linux/man-pages/man1/ps.1.html) + - [sort](https://man7.org/linux/man-pages/man1/sort.1.html) + - [watch](https://man7.org/linux/man-pages/man1/watch.1.html) + +## Connect to the AKS cluster + +Before you can troubleshoot the issue, you must connect to the AKS cluster. To do so, run the following commands: + +```bash +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export RESOURCE_GROUP="my-resource-group$RANDOM_SUFFIX" +export AKS_CLUSTER="my-aks-cluster$RANDOM_SUFFIX" +az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --overwrite-existing +``` + +## Symptoms + +The status of a cluster node that has a healthy state (all services running) unexpectedly changes to **Not Ready**. To view the status of a node, run the following [kubectl describe](https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#describe) command: + +```bash +kubectl describe nodes +``` + +## Cause + +The [kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) stopped posting its **Ready** status. + +Examine the output of the `kubectl describe nodes` command to find the [Conditions](https://kubernetes.io/docs/reference/node/node-status/#condition) field and the [Capacity and Allocatable](https://kubernetes.io/docs/reference/node/node-status/#capacity) blocks. 
Do the content of these fields appear as expected? (For example, in the **Conditions** field, does the `message` property contain the "kubelet is posting ready status" string?) In this case, if you have direct Secure Shell (SSH) access to the node, check the recent events to understand the error. Look within the */var/log/syslog* file instead of */var/log/messages* (not available on all distributions). Or, generate the kubelet and container daemon log files by running the following shell commands: + +```bash +# First, identify the NotReady node +export NODE_NAME=$(kubectl get nodes --no-headers | grep NotReady | awk '{print $1}' | head -1) + +if [ -z "$NODE_NAME" ]; then + echo "No NotReady nodes found" + kubectl get nodes +else + echo "Found NotReady node: $NODE_NAME" + + # Use kubectl debug to access the node + kubectl debug node/$NODE_NAME -it --image=mcr.microsoft.com/dotnet/runtime-deps:6.0 -- chroot /host bash -c " + echo '=== Checking syslog ===' + if [ -f /var/log/syslog ]; then + tail -100 /var/log/syslog + else + echo 'syslog not found' + fi + + echo '=== Checking kubelet logs ===' + journalctl -u kubelet --no-pager | tail -100 + + echo '=== Checking containerd logs ===' + journalctl -u containerd --no-pager | tail -100 + " +fi +``` + +After you run these commands, examine the syslog and daemon log files for more information about the error. + +## Solution + +### Step 1: Check for changes in network-level + +If all cluster nodes regressed to a **Not Ready** status, check whether any changes occurred at the network level. Examples of network-level changes include: + +- Domain name system (DNS) changes +- Firewall rule changes, such as port, fully qualified domain names (FQDNs), and so on. +- Added network security groups (NSGs) +- Applied or changed route table configurations for AKS traffic + +If there were changes at the network level, make any necessary corrections. If you have direct Secure Shell (SSH) access to the node, you can use the `curl` or `telnet` command to check the connectivity to [AKS outbound requirements](/azure/aks/outbound-rules-control-egress). After you've fixed the issues, stop and restart the nodes. If the nodes stay in a healthy state after these fixes, you can safely skip the remaining steps. + +### Step 2: Stop and restart the nodes + +If only a few nodes regressed to a **Not Ready** status, simply stop and restart the nodes. This action alone might return the nodes to a healthy state. Then, check [Azure Kubernetes Service diagnostics overview](/azure/aks/concepts-diagnostics) to determine whether there are any issues, such as the following issues: + +- Node faults +- Source network address translation (SNAT) failures +- Node input/output operations per second (IOPS) performance issues +- Other issues + +If the diagnostics don't discover any underlying issues and the nodes returned to Ready status, you can safely skip the remaining steps. + +### Step 3: Fix SNAT issues for public AKS API clusters + +Did AKS diagnostics uncover any SNAT issues? If so, take some of the following actions, as appropriate: + +- Check whether your connections remain idle for a long time and rely on the default idle time-out to release its port. If the connections exhibit this behavior, you might have to reduce the default time-out of 30 minutes. + +- Determine how your application creates outbound connectivity. For example, does it use code review or packet capture? 
+ +- Determine whether this activity represents the expected behavior or, instead, it shows that the application is misbehaving. Use metrics and logs in Azure Monitor to substantiate your findings. For example, you can use the **Failed** category as a SNAT Connections metric. + +- Evaluate whether appropriate patterns are followed. + +- Evaluate whether you should mitigate SNAT port exhaustion by using extra outbound IP addresses and more allocated outbound ports. For more information, see [Scale the number of managed outbound public IPs](/azure/aks/load-balancer-standard#scale-the-number-of-managed-outbound-public-ips) and [Configure the allocated outbound ports](/azure/aks/load-balancer-standard#configure-the-allocated-outbound-ports). + +For more information about how to troubleshoot SNAT port exhaution, see [Troubleshoot SNAT port exhaustion on AKS nodes](../connectivity/snat-port-exhaustion.md?tabs=for-a-linux-pod). + +### Step 4: Fix IOPS performance issues + +If AKS diagnostics uncover issues that reduce IOPS performance, take some of the following actions, as appropriate: + +- To increase IOPS on virtual machine (VM) scale sets, choose a a larger disk size that offers better IOPS performance by deploying a new node pool. Direct resizing VMSS directly isn't supported. For more information on resizing node pools, see [Resize node pools in Azure Kubernetes Service (AKS)](/azure/aks/resize-node-pool?tabs=azure-cli). + +- Increase the node SKU size for more memory and CPU processing capability. + +- Consider using [Ephemeral OS](/azure/aks/cluster-configuration#ephemeral-os). + +- Limit the CPU and memory usage for pods. These limits help prevent node CPU consumption and out-of-memory situations. + +- Use scheduling topology methods to add more nodes and distribute the load among the nodes. For more information, see [Pod topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/). + +### Step 5: Fix threading issues + +Kubernetes components such as kubelets and [containerd runtimes](https://kubernetes.io/docs/setup/production-environment/container-runtimes/#containerd) rely heavily on threading, and they spawn new threads regularly. If the allocation of new threads is unsuccessful, this failure can affect service readiness, as follows: + +- The node status changes to **Not Ready**, but it's restarted by a remediator, and is able to recover. + +- In the */var/log/messages* and */var/log/syslog* log files, there are repeated occurrences of the following error entries: + + > pthread_create failed: Resource temporarily unavailable by various processes + + The processes that are cited include containerd and possibly kubelet. + +- The node status changes to **Not Ready** soon after the `pthread_create` failure entries are written to the log files. + +Process IDs (PIDs) represent threads. The default number of PIDs that a pod can use might be dependent on the operating system. However, the default number is at least 32,768. This amount is more than enough PIDs for most situations. Are there any known application requirements for higher PID resources? If there aren't, then even an eight-fold increase to 262,144 PIDs might not be enough to accommodate a high-resource application. + +Instead, identify the offending application, and then take the appropriate action. Consider other options, such as increasing the VM size or upgrading AKS. 
These actions can mitigate the issue temporarily, but they aren't a guarantee that the issue won't reappear again. + +To monitor the thread count for each control group (cgroup) and print the top eight cgroups, run the following shell command: + +```bash +# Show current thread count for each cgroup (top 8) +ps -e -w -o "thcount,cgname" --no-headers | awk '{a[$2] += $1} END{for (i in a) print a[i], i}' | sort --numeric-sort --reverse | head --lines=8 +``` + +For more information, see [Process ID limits and reservations](https://kubernetes.io/docs/concepts/policy/pid-limiting/). + +Kubernetes offers two methods to manage PID exhaustion at the node level: + +1. Configure the maximum number of PIDs that are allowed on a pod within a kubelet by using the `--pod-max-pids` parameter. This configuration sets the `pids.max` setting within the cgroup of each pod. You can also use the `--system-reserved` and `--kube-reserved` parameters to configure the system and kubelet limits, respectively. + +1. Configure PID-based eviction. + +> [!NOTE] +> By default, neither of these methods are set up. Additionally, you can't currently configure either method by using [Node configuration for AKS node pools](/azure/aks/custom-node-configuration). + +### Step 6: Use a higher service tier + +You can make sure that the AKS API server has high availability by using a higher service tier. For more information, see the [Azure Kubernetes Service (AKS) Uptime SLA](/azure/aks/uptime-sla). + +## More information + +- To view the health and performance of the AKS API server and kubelets, see [Managed AKS components](/azure/aks/monitor-aks#level-2---managed-aks-components). + +- For general troubleshooting steps, see [Basic troubleshooting of node not ready failures](node-not-ready-basic-troubleshooting.md). diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors.md new file mode 100644 index 000000000..a08f76e3d --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors.md @@ -0,0 +1,150 @@ +--- +title: Node Not Ready because of custom script extension (CSE) errors +description: Troubleshoot scenarios in which custom script extension (CSE) errors cause Node Not Ready states in an Azure Kubernetes Service (AKS) cluster node pool. +ms.date: 06/08/2024 +ms.reviewer: rissing, chiragpa, momajed, v-leedennis +ms.service: azure-kubernetes-service +ms.custom: sap:Node/node pool availability and performance, devx-track-azurecli, innovation-engine +author: MicrosoftDocs +ms.author: MicrosoftDocs +--- + +# Troubleshoot node not ready failures caused by CSE errors + +This article helps you troubleshoot scenarios in which a Microsoft Azure Kubernetes Service (AKS) cluster isn't in the `Succeeded` state and an AKS node isn't ready within a node pool because of custom script extension (CSE) errors. + +## Prerequisites + +- [Azure CLI](/cli/azure/install-azure-cli) + +## Symptoms + +Because of CSE errors, an AKS cluster node isn't ready within a node pool, and the AKS cluster isn't in the `Succeeded` state. + +## Cause + +The node extension deployment fails and returns more than one error code when you provision the [kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) and other components. 
This is the most common cause of errors. To verify that the node extension deployment is failing when you provision the kubelet, follow these steps: + +1. To better understand the current failure on the cluster, run the [az aks show](/cli/azure/aks#az-aks-show) and [az resource update](/cli/azure/resource#az-resource-update) commands to set up debugging: + + Set your environment variables and run the commands to view the cluster's status and debug information. + + ```azurecli + export RG_NAME="my-aks-rg" + export CLUSTER_NAME="myakscluster" + clusterResourceId=$(az aks show \ + --resource-group $RG_NAME --name $CLUSTER_NAME --output tsv --query id) + az resource update --debug --verbose --ids $clusterResourceId + ``` + + Results: + + + + ```output + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/my-aks-rg-xxx/providers/Microsoft.ContainerService/managedClusters/myaksclusterxxx", + "name": "myaksclusterxxx", + "type": "Microsoft.ContainerService/managedClusters", + "location": "eastus2", + "tags": null, + "properties": { + ... + } + } + ``` + +1. Check the debugging output and the error messages that you received from the `az resource update` command against the error list in the [CSE helper](https://github.com/Azure/AgentBaker/blob/1bf9892afd715a34e0c6b7312e712047f10319ce/parts/linux/cloud-init/artifacts/cse_helpers.sh) executable file on GitHub. + +If any of the errors involve the CSE deployment of the kubelet, then you've verified that the scenario that's described here's the cause of the Node Not Ready failure. + +In general, exit codes identify the specific issue that's causing the failure. For example, you see messages such as "Unable to communicate with API server" or "Unable to connect to internet." Or the exit codes might alert you to API network time-outs, or a node fault that needs a replacement. + +## Solution 1: Make sure your custom DNS server is configured correctly + +Set up your custom Domain Name System (DNS) server so that it can do name resolution correctly. Configure the server to meet the following requirements: + +- If you're using custom DNS servers, make sure that the servers are healthy and reachable over the network. + +- Make sure that custom DNS servers have the required [conditional forwarders to the Azure DNS IP address](/azure/private-link/private-endpoint-dns#on-premises-workloads-using-a-dns-forwarder) (or the forwarder to that address). + +- Make sure that your private AKS DNS zone is linked to your custom DNS virtual networks if they're hosted on Azure. + +- Don't use the Azure DNS IP address with the IP addresses of your custom DNS server. Doing this isn't recommended. + +- Avoid using IP addresses instead of the DNS server in DNS settings. You can use Azure CLI commands to check for this situation on a Virtual Machine Scale Set or availability set. + + - For Virtual Machine Scale Set nodes, use the [az vmss run-command invoke](/cli/azure/vmss/run-command#az-vmss-run-command-invoke) command: + + > **Important:** You must specify the `--instance-id` of the VM scale set. Here, we demonstrate querying for a valid instance ID (e.g., 0) and a likely VMSS in an AKS node resource group. Update values appropriately to match your environment. 
+ + ```azurecli + export NODE_RESOURCE_GROUP=$(az aks show --resource-group $RG_NAME --name $CLUSTER_NAME --query nodeResourceGroup -o tsv) + export VMSS_NAME=$(az vmss list --resource-group $NODE_RESOURCE_GROUP --query "[0].name" -o tsv) + export DNS_IP_ADDRESS="10.0.0.10" + export INSTANCE_ID=$(az vmss list-instances --resource-group $NODE_RESOURCE_GROUP --name $VMSS_NAME --query "[0].instanceId" -o tsv) + export API_FQDN=$(az aks show --resource-group $RG_NAME --name $CLUSTER_NAME --query fqdn -o tsv) + + az vmss run-command invoke \ + --resource-group $NODE_RESOURCE_GROUP \ + --name $VMSS_NAME \ + --instance-id $INSTANCE_ID \ + --command-id RunShellScript \ + --output tsv \ + --query "value[0].message" \ + --scripts "telnet $DNS_IP_ADDRESS 53" + az vmss run-command invoke \ + --resource-group $NODE_RESOURCE_GROUP \ + --name $VMSS_NAME \ + --instance-id $INSTANCE_ID \ + --command-id RunShellScript \ + --output tsv \ + --query "value[0].message" \ + --scripts "nslookup $API_FQDN $DNS_IP_ADDRESS" + ``` + + - For VM availability set nodes, use the [az vm run-command invoke](/cli/azure/vm/run-command#az-vm-run-command-invoke) command: + + > **Important:** You must specify the `--name` of a valid VM in an availability set in your resource group. Here is a template for running network checks. + + ```azurecli + az vm run-command invoke \ + --resource-group $RG_NAME \ + --name $AVAILABILITY_SET_VM \ + --command-id RunShellScript \ + --output tsv \ + --query "value[0].message" \ + --scripts "telnet $DNS_IP_ADDRESS 53" + az vm run-command invoke \ + --resource-group $RG_NAME \ + --name $AVAILABILITY_SET_VM \ + --command-id RunShellScript \ + --output tsv \ + --query "value[0].message" \ + --scripts "nslookup $API_FQDN $DNS_IP_ADDRESS" + ``` + +For more information, see [Name resolution for resources in Azure virtual networks](/azure/virtual-network/virtual-networks-name-resolution-for-vms-and-role-instances) and [Hub and spoke with custom DNS](/azure/aks/private-clusters#hub-and-spoke-with-custom-dns). + +## Solution 2: Fix API network time-outs + +Make sure that the API server can be reached and isn't subject to delays. To do this, follow these steps: + +- Check the AKS subnet to see whether the assigned network security group (NSG) is blocking the egress traffic port 443 to the API server. + +- Check the node itself to see whether the node has another NSG that's blocking the traffic. + +- Check the AKS subnet for any assigned route table. If a route table has a network virtual appliance (NVA) or firewall, make sure that port 443 is available for egress traffic. For more information, see [Control egress traffic for cluster nodes in AKS](/azure/aks/limit-egress-traffic). + +- If the DNS resolves names successfully and the API is reachable, but the node CSE failed because of an API time-out, take the appropriate action as shown in the following table. + + | Set type | Action | + | -------- | ------ | + | VM availability set | Delete the node from the Azure portal and the AKS API by using the [kubectl delete](https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#delete) node command, and then scale up the cluster again. | + | Virtual Machine Scale Set | Either reimage the node from the Azure portal, or delete the node, and then scale up the cluster again. To delete the specific node, use [az aks nodepool delete-machines](/cli/azure/aks/nodepool#az-aks-nodepool-delete-machines) command. It will cordon & drain first and then delete the node. 
| + +- If the requests are being throttled by the AKS API server, upgrade to a higher service tier. For more information, see [Pricing tiers for AKS](/azure/aks/free-standard-pricing-tiers). + +## More information + +- For general troubleshooting steps, see [Basic troubleshooting of Node Not Ready failures](node-not-ready-basic-troubleshooting.md). diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server.md new file mode 100644 index 000000000..1c6380502 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server.md @@ -0,0 +1,122 @@ +--- +title: Client IP address can't access the API server +description: Troubleshoot issues caused when the client IP address can't access the API server on an Azure Kubernetes Service (AKS) cluster. +ms.topic: article +ms.date: 06/11/2024 +author: microsoftdocs +ms.author: microsoftdocs +ms.custom: sap:Connectivity, innovation-engine +--- + +# Client IP address can't access the API server + +This article describes how to fix issues that occur when you can't connect to an Azure Kubernetes Service (AKS) cluster because your client IP address can't access the AKS API server. + +## Prerequisites + +- [Azure CLI](/cli/azure/install-azure-cli). +- The client URL ([curl](https://techcommunity.microsoft.com/t5/containers/tar-and-curl-come-to-windows/ba-p/382409)) tool. + +## Symptoms + +### [Azure portal](#tab/azure-portal) + +When you try to access Kubernetes resources such as mamespaces and workloads from the Azure portal, you might encounter the following errors: + +> Network error +> +> Unable to reach the api server 'https://\' or api server is too busy to respond. Check your network settings and refresh to try again. + +:::image type="content" source="media/client-ip-address-cannot-access-api-server/network-error.png" alt-text="Screenshot of mamespaces in the AKS resource." lightbox="media/client-ip-address-cannot-access-api-server/network-error.png"::: + +### [Azure CLI](#tab/azure-cli) + +When you try to connect to a cluster using the Azure CLI, you might see the following errors: + +```output +"Unhandled Error" err="couldn't get current server API group list: Get \"https://:443/api?timeout=32s\": dial tcp :443: i/o timeout" + +Unable to connect to the server: dial tcp :443: i/o timeout + +Unable to connect to the server: dial tcp :443: connectex: A connection attempt failed because the connected party did not properly respond after a period, or established connection failed because connected host has failed to respond. +``` + +--- + +## Cause + +[API server-authorized IP ranges](/azure/aks/api-server-authorized-ip-ranges) may have been enabled on the cluster's API server, but the client's IP address wasn't included in the IP ranges. To check whether this feature has been enabled, see if the following [az aks show](/cli/azure/aks#az-aks-show) command in Azure CLI produces a list of IP ranges: + +```azurecli +az aks show --resource-group ${RG_NAME} \ + --name ${CLUSTER_NAME} \ + --query apiServerAccessProfile.authorizedIpRanges +``` + +## Solution + +Look at the cluster's API server-authorized ranges, and add your client's IP address within that range. + +> [!NOTE] +> +> 1. Do you access the API server from a corporate network where traffic is routed through a proxy server or firewall? 
Then ask your network administrator before you add your client IP address to the list of authorized ranges for the API server. +> +> 1. Also ask your cluster administrator before you add your client IP address, because there might be security concerns with adding a temporary IP address to the list of authorized ranges. + +### [Azure portal](#tab/azure-portal) + +1. Navigate to the cluster from the Azure portal. +2. In the left menu, locate **Settings** and then select **Networking**. +3. On the **Networking** page, select the **Overview** tab. +4. Select **Manage** under **Resource settings**. +5. In the **Authorized IP ranges** pane, add your client IP address as shown in the following screenshot: + + :::image type="content" source="media/client-ip-address-cannot-access-api-server/authorized-ip-ranges.png" alt-text="Screenshot of Authorized-ip-ranges pane." lightbox="media/client-ip-address-cannot-access-api-server/authorized-ip-ranges.png"::: + +### [Azure CLI](#tab/azure-cli) + +1. Get your client IP address by running this [curl](https://curl.se/docs/manpage.html) command: + + ```azurecli + export CLIENT_IP=$(curl --silent https://ipinfo.io/ip | tr -d '\n') + echo $CLIENT_IP + ``` + + Results: + + + + ```output + 0.255.127.63 + ``` + +2. Update the API server-authorized range with the [az aks update](/cli/azure/aks#az-aks-update) command in Azure CLI, using your client IP address: + + ```azurecli + az aks update --resource-group $RG_NAME \ + --name $CLUSTER_NAME \ + --api-server-authorized-ip-ranges $CLIENT_IP + ``` + + Results: + + + + ```output + { + "apiServerAccessProfile": { + "authorizedIpRanges": [ + "0.255.127.63/32" + ], + ... + }, + ... + "name": "aks-cluster-xxx", + "resourceGroup": "aks-rg-xxx", + ... + } + ``` + +--- + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md new file mode 100644 index 000000000..9c1406b85 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md @@ -0,0 +1,50 @@ +--- +title: TCP 10250 I/O timeout errors when connecting to a node's Kubelet for log retrieval +description: Learn how to troubleshoot TCP 10250 I/O timeout errors that occur when retrieving kubectl logs from a pod in an Azure Kubernetes Service (AKS) cluster. +ms.topic: article +ms.date: 06/03/2025 +author: '' +ms.author: '' +ms.custom: sap:Connectivity, innovation-engine +ms.reviewer: chiragpa, nickoman, v-leedennis +ms.service: azure-kubernetes-service +keywords: +#Customer intent: As an Azure Kubernetes user, I want to troubleshoot why I'm receiving TCP timeouts (such as 'dial tcp :10250: i/o timeout') so that I can use my Azure Kubernetes Service (AKS) cluster successfully. +--- + +# 10250 I/O timeouts error when running kubectl log command + +TCP timeouts can be caused by blockages of internal traffic that runs between nodes. To investigate TCP time-outs, verify that this traffic isn't being blocked, for example, by [network security groups](/azure/aks/concepts-security#azure-network-security-groups) (NSGs) on the subnet for your cluster nodes. 
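
After you set the resource group and cluster name variables (see the next section), a sketch like the following can help you spot deny rules on the network security groups in the cluster's node resource group. This is illustrative only: it assumes the `$RESOURCE_GROUP` and `$CLUSTER_NAME` variables from the next step, and it checks only the NSGs in the managed node resource group, so any NSGs applied to a custom node subnet in another resource group also need to be reviewed.

```bash
# Sketch: list NSG rules in the node resource group that deny traffic,
# which can block kubelet traffic on port 10250 between nodes.
# Assumes RESOURCE_GROUP and CLUSTER_NAME are set as shown in the next section.
NODE_RESOURCE_GROUP=$(az aks show --resource-group $RESOURCE_GROUP \
    --name $CLUSTER_NAME --query nodeResourceGroup --output tsv)

for NSG_NAME in $(az network nsg list --resource-group $NODE_RESOURCE_GROUP \
    --query "[].name" --output tsv); do
    echo "Deny rules in NSG: $NSG_NAME"
    az network nsg rule list --resource-group $NODE_RESOURCE_GROUP \
        --nsg-name $NSG_NAME \
        --query "[?access=='Deny'].{name:name, priority:priority, ports:destinationPortRange, direction:direction}" \
        --output table
done
```

Any deny rule that covers TCP port 10250 between node subnets is a likely cause of the time-out described in this article.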
+ +## Connect to the cluster + +First, connect to your Azure Kubernetes Service (AKS) cluster by running the following command: + +```bash +export RESOURCE_GROUP= +export CLUSTER_NAME= + +az aks get-credentials --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME +``` + +## Symptoms + +Tunnel functionalities, such as `kubectl logs` and code execution, work only for pods that are hosted on nodes on which tunnel service pods are deployed. Pods on other nodes that have no tunnel service pods cannot reach to the tunnel. When viewing the logs of these pods, you receive the following error message: + +```bash +kubectl logs $POD_NAME +``` + +Results: + + + +```output +Error from server: Get "https://aks-agentpool-xxxxxxxxx-vmssxxxxxxxxx:10250/containerLogs/vsm-mba-prod/mba-api-app-xxxxxxxxxx/technosvc": dial tcp :10250: i/o timeout +``` + +## Solution + +To resolve this issue, allow traffic on port 10250 as described in this [article](tunnel-connectivity-issues.md). + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md new file mode 100644 index 000000000..82cea04cc --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md @@ -0,0 +1,143 @@ +--- +title: TCP time-outs when kubectl or other 3rd-party tools connect to API +description: Troubleshoot TCP time-outs that occur when kubectl or other third-party tools connect to the API server in Azure Kubernetes Service (AKS). +ms.topic: article +ms.date: 06/03/2024 +author: azureuser +ms.author: azureuser +ms.custom: sap:Connectivity,innovation-engine +--- + +# TCP time-outs when kubectl or other third-party tools connect to the API server + +This article discusses how to troubleshoot TCP time-outs that occur when [kubectl](https://kubernetes.io/docs/reference/kubectl/) or other third-party tools are used to connect to the API server in Microsoft Azure Kubernetes Service (AKS). To ensure its service-level objectives (SLOs) and service-level agreements (SLAs), AKS uses high-availability (HA) control planes that scale vertically and horizontally, based on the number of cores. + +## Symptoms + +You experience repeated connection time-outs. + +## Cause 1: Pods that are responsible for node-to-control plane communication aren't running + +If only a few of your API commands are timing out consistently, the following pods might not be in a running state: + +- `konnectivity-agent` +- `tunnelfront` +- `aks-link` + +> [!NOTE] +> In newer AKS versions, `tunnelfront` and `aks-link` are replaced with `konnectivity-agent`, so you'll only see `konnectivity-agent`. + +These pods are responsible for communication between a node and the control plane. + +### Solution: Reduce the utilization or stress of the node hosts + +Make sure the nodes that host these pods aren't overly utilized or under stress. Consider moving the nodes to their own [system node pool](/azure/aks/use-system-pools). + +To check which node the `konnectivity-agent` pod is hosted on and the usage of the node, run the following commands: + +Set access to the AKS cluster. Replace the values of `ResourceGroupName` and `AKSClusterName` with your own. 
+ +```bash +az aks get-credentials --resource-group ${ResourceGroupName} --name ${AKSClusterName} --overwrite-existing +``` + +Check the running pods in the kube-system namespace and which node each one is assigned to: + +```bash +kubectl get pod -n kube-system -o wide +``` + +Results: + + + +```output +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +konnectivity-agent-xxxxx 1/1 Running 0 22h 10.xxx.xx.xxx aks-nodepool1-xxxxx-vmss000000 +coredns-xxxxx 1/1 Running 0 22h 10.xxx.xx.xxx aks-nodepool1-xxxxx-vmss000001 +# ...other pods... +``` + +Check the usage of the nodes and see resource utilization for each node: + +```bash +kubectl top node +``` + +Results: + + + +```output +NAME CPU(cores) CPU% MEMORY(bytes) MEMORY% +aks-nodepool1-xxxxx-vmss000000 125m 12% 1510Mi 37% +aks-nodepool1-xxxxx-vmss000001 106m 10% 1203Mi 42% +# ...other nodes... +``` + +## Cause 2: Access is blocked on some required ports, FQDNs, and IP addresses + +If the required ports, fully qualified domain names (FQDNs), and IP addresses aren't all opened, several command calls might fail. Secure, tunneled communication on AKS between the API server and the [kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) (through the `konnectivity-agent` pod) requires some of those items to work successfully. + +### Solution: Open the necessary ports, FQDNs, and IP addresses + +For more information about what ports, FQDNs, and IP addresses need to be opened, see [Outbound network and FQDN rules for Azure Kubernetes Service (AKS) clusters](/azure/aks/outbound-rules-control-egress). + +## Cause 3: The Application-Layer Protocol Negotiation TLS extension is blocked + +To establish a connection between the control plane and nodes, the `konnectivity-agent` pod requires the [Transport Layer Security (TLS) extension for Application-Layer Protocol Negotiation (ALPN)](https://datatracker.ietf.org/doc/html/rfc7301). You might have previously blocked this extension. + +### Solution: Enable the ALPN extension + +Enable the ALPN extension on the `konnectivity-agent` pod to prevent TCP time-outs. + +## Cause 4: The API server's IP authorized ranges doesn't cover your current IP address + +If you use authorized IP address ranges on your API server, your API calls will be blocked if your IP isn't included in the authorized ranges. + +### Solution: Modify the authorized IP address ranges so that it covers your IP address + +Change the authorized IP address ranges so that your IP address is covered. For more information, see [Update a cluster's API server authorized IP ranges](/azure/aks/api-server-authorized-ip-ranges#update-a-clusters-api-server-authorized-ip-ranges). + +## Cause 5: A client or application leaks calls to the API server + +Frequent GET calls can accumulate and overload the API server. + +### Solution: Use watches instead of GET calls, but make sure the application doesn't leak those calls + +Make sure that you use watches instead of frequent GET calls to the API server. You also have to make sure that your third-party applications don't leak any watch connections or GET calls. For example, in the [Istio microservice architecture](https://istio-releases.github.io/v0.1/docs/concepts/what-is-istio/overview.html), a [bug in the mixer application](https://github.com/istio/istio/issues/19481) creates a new API server watch connection whenever a secret is read internally. Because this behavior happens at a regular interval, the watch connections quickly accumulate. 
These connections eventually cause the API server to become overloaded no matter the scaling pattern. + +## Cause 6: Too many releases in your Helm deployments + +If you use too many releases in your deployments of [Helm](https://helm.sh/) (the Kubernetes package manager), the nodes start to consume too much memory. It also results in a large amount of `ConfigMap` (configuration data) objects, which might cause unnecessary usage spikes on the API server. + +### Solution: Limit the maximum number of revisions for each release + +Because the maximum number of revisions for each release is infinite by default, you need to run a command to set this maximum number to a reasonable value. For Helm 2, the command is [helm init](https://v2.helm.sh/docs/helm/#helm-init). For Helm 3, the command is [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/). Set the `--history-max ` parameter when you run the command. + +| Version | Command | +|---------|--------------------------------------------------------------------------------| +| Helm 2 | `helm init --history-max ...` | +| Helm 3 | `helm upgrade ... --history-max ...` | + +## Cause 7: Internal traffic between nodes is being blocked + +There might be internal traffic blockages between nodes in your AKS cluster. + +### Solution: Troubleshoot the "dial tcp :10250: i/o timeout" error + +See [Troubleshoot TCP timeouts, such as "dial tcp :10250: i/o timeout"](tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md). + +## Cause 8: Your cluster is private + +Your cluster is a private cluster, but the client from which you're trying to access the API server is in a public or different network that can't connect to the subnet used by AKS. + +### Solution: Use a client that can access the AKS subnet + +Since your cluster is private and its control plane is in the AKS subnet, it can't be connected to the API server unless it's in a network that can connect to the AKS subnet. It's an expected behavior. + +In this case, try to access the API server from a client in a network that can communicate with the AKS subnet. Additionally, verify network security groups (NSGs) or other appliances between networks aren't blocking packets. + +[!INCLUDE [Third-party disclaimer](../../../includes/third-party-disclaimer.md)] + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server.md new file mode 100644 index 000000000..cf8f9023d --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server.md @@ -0,0 +1,90 @@ +--- +title: Troubleshoot cluster connection issues with the API server +description: Troubleshoot issues that occur when you attempt to connect to the API server of an Azure Kubernetes Service (AKS) cluster. +ms.date: 08/30/2024 +ms.reviewer: rissing chiragpa, beleite, v-leedennis, v-weizhu +ms.service: azure-kubernetes-service +#Customer intent: As an Azure Kubernetes user, I want to take basic troubleshooting measures so that I can avoid cluster connectivity issues with the API server. 
+ms.custom: sap:Connectivity,innovation-engine +--- + +# Basic troubleshooting of cluster connection issues with the API server + +This article discusses connection issues to an Azure Kubernetes Service (AKS) cluster when you can't reach the cluster's API server through the Kubernetes cluster command-line tool ([kubectl](https://kubernetes.io/docs/reference/kubectl/overview/)) or any other tool, such as using REST API through a programming language. + +## Prerequisites + +- [Azure CLI](/cli/azure/install-azure-cli). + +## Root cause and solutions + +Connection issues to the API server can occur for many reasons, but the root cause is often related to an error with one of these items: + +- Network +- Authentication +- Authorization + +You can take these common troubleshooting steps to check the connectivity to the AKS cluster's API server: + +1. Enter the following [az aks show](/cli/azure/aks#az-aks-show) command in Azure CLI. This command gets the fully qualified domain name (FQDN) of your AKS cluster. + + First, export your resource names to environment variables and add a random suffix to the resource group and cluster names for unique testing. + + ```azurecli + export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) + export RESOURCE_GROUP="my-aks-rg$RANDOM_SUFFIX" + export AKS_CLUSTER="myakscluster$RANDOM_SUFFIX" + az aks show --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --query fqdn + ``` + + Results: + + + + ```output + "xxxxxx-xxxxxxxx.hcp.eastus2.azmk8s.io" + ``` + +2. With the FQDN, check whether the API server is reachable from the client machine by using the name server lookup ([nslookup](/windows-server/administration/windows-commands/nslookup)), client URL ([curl](https://curl.se/docs/manpage.html)), and [telnet](/windows-server/administration/windows-commands/telnet) commands: + + Replace `` with the actual FQDN returned from the previous step. For demonstration, we use a variable. + + ```bash + export CLUSTER_FQDN=$(az aks show --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --query fqdn -o tsv) + + # Check if the DNS Resolution is working: + nslookup $CLUSTER_FQDN + + # Then check if the API Server is reachable: + curl -k -Iv https://$CLUSTER_FQDN + + # Test raw TCP connectivity (output will vary depending on environment) + timeout 5 telnet $CLUSTER_FQDN 443 || echo "Connection test completed" + ``` + +3. If the AKS cluster is private, make sure you run the command from a virtual machine (VM) that can access the AKS cluster's Azure Virtual Network. See [Options for connecting to the private cluster](/azure/aks/private-clusters#options-for-connecting-to-the-private-cluster). + +4. If necessary, follow the steps in the troubleshooting article [Client IP address can't access the API server](client-ip-address-cannot-access-api-server.md), so the API server adds your client IP address to the IP ranges it authorizes. + +5. Make sure the version of kubectl on your client machine isn't two or more minor versions behind the AKS cluster's version of that tool. To install the latest version of kubectl, run the [az aks install-cli](/cli/azure/aks#az-aks-install-cli) command in Azure CLI. You can then run [kubectl version](https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#version) command to check the version number of the new installation. 
+ + For example, on Linux you would run these commands: + + ```shell + sudo az aks install-cli + kubectl version --client + ``` + + For other client operating systems, use these [kubectl installation instructions](https://kubernetes.io/docs/tasks/tools/). + +6. If necessary, follow the steps in the troubleshooting article [Config file isn't available when connecting](config-file-is-not-available-when-connecting.md), so your Kubernetes configuration file (*config*) is valid and can be found at connection time. + +7. If necessary, follow the steps in the troubleshooting article [User can't get cluster resources](user-cannot-get-cluster-resources.md), so you can list the details of your cluster nodes. + +8. If you're using a firewall to control egress traffic from AKS worker nodes, make sure the firewall allows the [minimum required egress rules for AKS](/azure/aks/limit-egress-traffic). + +9. Make sure the [network security group that's associated with AKS nodes](/azure/aks/concepts-security#azure-network-security-groups) allows communication on TCP port 10250 within the AKS nodes. + +For other common troubleshooting steps, see [TCP time-outs when kubectl or other third-party tools connect to the API server](tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md). + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources.md new file mode 100644 index 000000000..35096c287 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources.md @@ -0,0 +1,107 @@ +--- +title: Troubleshoot "Forbidden" error when trying to access AKS cluster resources +description: Troubleshoot "Error from server (Forbidden)" RBAC-related errors that occur when you try to view Kubernetes resources in an AKS cluster. +ms.date: 08/26/2024 +ms.reviewer: rissing chiragpa, v-leedennis +ms.service: azure-kubernetes-service +#Customer intent: As an Azure Kubernetes administrator, I want to fix RBAC-related errors so that users can access their cluster resources. +ms.custom: sap:Connectivity,innovation-engine +--- + +# Troubleshoot "Forbidden" error when trying to access AKS cluster resources + +This article explains how to troubleshoot and resolve "Error from server (Forbidden)" errors that are related to Role-Based Access Control (RBAC) when you try to view Kubernetes resources in an Azure Kubernetes Service (AKS) cluster. + +## Prerequisites + +The Kubernetes cluster command-line tool ([kubectl](https://kubernetes.io/docs/tasks/tools/)) + +> [!NOTE] +> If you use [Azure Cloud Shell](/azure/cloud-shell/overview) to run shell commands, kubectl is already installed. If you use a local shell and already have [Azure CLI](/cli/azure/install-azure-cli) installed, you can alternatively install kubectl by running the [az aks install-cli](/cli/azure/aks#az-aks-install-cli) command. 
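
If you want to confirm that this prerequisite is met before continuing, the following optional check is one way to do it. This is only a convenience sketch; in Azure Cloud Shell you can skip it because kubectl is preinstalled.

```bash
# Optional: verify that kubectl is available; if it isn't, install it through the Azure CLI.
# On some systems the installation step may require elevated permissions (for example, sudo).
kubectl version --client || az aks install-cli
```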
+ +## Symptoms + +When you run `kubectl` commands to view details of a Kubernetes resource type, such as a deployment, pod, or worker node, you receive the following error message: + +```output +$ kubectl get nodes +Error from server (Forbidden): nodes is forbidden: User "aaaa11111-11aa-aa11-a1a1-111111aaaaa" cannot list resource "nodes" in API group "" at the cluster scope +``` + +## Cause + +This error indicates that you're trying to access Kubernetes resources by using a Microsoft Entra ID account that doesn’t have the required role-based access control (RBAC) permissions. + +## Solution + +Depending on the RBAC type that's configured for the cluster ([Kubernetes RBAC](/azure/aks/azure-ad-rbac) or [Azure RBAC](/azure/aks/manage-azure-rbac)), different solutions might apply. Run the following command to determine which RBAC type the cluster is using: + +Run the following command to determine which RBAC type your AKS cluster is using: + +```bash +az aks show -g $RESOURCE_GROUP -n $CLUSTER_NAME --query aadProfile.enableAzureRbac +``` + +Results: + +```output +false +``` + +- If the result is **null** or empty, the cluster doesn't have Azure AD integration enabled. See [Solving permission issues in local Kubernetes RBAC clusters](#solving-permissions-issues-in-local-kubernetes-rbac-clusters). +- If the result is **false**, the cluster uses Kubernetes RBAC. See [Solving permission issues in Kubernetes RBAC-based AKS clusters](#solving-permissions-issues-in-kubernetes-rbac-based-aks-clusters). +- If the result is **true**, the cluster uses Azure RBAC. See [Solving permission issues in Azure RBAC-based AKS clusters](#solving-permissions-issues-in-azure-rbac-based-aks-clusters). + +### Solving permissions issues in local Kubernetes RBAC clusters + +If your cluster doesn't have Azure AD integration (result was null), it uses cluster admin credentials: + +```bash +# Get admin credentials for full access +az aks get-credentials --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --admin + +# Verify access +kubectl get nodes +``` + +**Warning**: Admin credentials provide full cluster access. Use carefully and consider enabling Azure AD integration for better security. + +### Solving permissions issues in Kubernetes RBAC-based AKS clusters + +If the cluster uses Kubernetes RBAC, permissions for the user account are configured through the creation of RoleBinding or ClusterRoleBinding Kubernetes resources. For more information, see [Kubernetes RBAC documentation](https://kubernetes.io/docs/reference/access-authn-authz/rbac/). + +Additionally, in Microsoft Entra ID integrated clusters, a ClusterRoleBinding resource is automatically created to grant the administrator access to the cluster to members of a pre-designated Microsoft Entra ID group. + +To resolve the "Error from server (Forbidden)" error for a specific user, use one of the following methods. + +#### Method 1: Create a custom RoleBinding or ClusterRoleBinding resource + +You can create a custom RoleBinding or ClusterRoleBinding resource to grant the necessary permissions to the user (or a group of which the user is a member). For detailed steps, see [Use Kubernetes role-based access control with Microsoft Entra ID in Azure Kubernetes Service](/azure/aks/azure-ad-rbac). + +#### Method 2: Add the user to the pre-designated Microsoft Entra ID admin group + +1. Retrieve the ID of the pre-designated Microsoft Entra ID admin group. 
To do this, run the following command: + + ```bash + az aks show -g $RESOURCE_GROUP -n $CLUSTER_NAME --query aadProfile.adminGroupObjectIDs + ``` + + Results: + + ```output + [ + "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + ] + ``` + +2. Add the user to the pre-designated Microsoft Entra ID admin group by using the group ID that you retrieved in the previous step. For more detailed steps, see [Add members or owners of a group](/entra/fundamentals/how-to-manage-groups#add-members-or-owners-of-a-group). + +### Solving permissions issues in Azure RBAC-based AKS clusters + +If the cluster uses Azure RBAC, permissions for users are configured through the creation of [Azure role assignments](/azure/role-based-access-control/role-assignments). + +AKS provides a set of built-in roles that can be used to create role assignments for the Microsoft Entra ID users or groups to give them access to Kubernetes objects in a specific namespace or at cluster scope. For detailed steps to assign built-in roles to users or groups in Azure RBAC-based clusters, see [AKS built-in roles](/azure/aks/manage-azure-rbac#aks-built-in-roles). + +Alternatively, you can create your own custom Azure role definitions to provide a more granular management of permissions over specific types of Kubernetes objects and operations. For detailed guidance to create and assign custom roles to users and groups in Azure RBAC-based clusters, see [Create custom roles definitions](/azure/aks/manage-azure-rbac#create-custom-roles-definitions). + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool.md new file mode 100644 index 000000000..a2ca2e94f --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool.md @@ -0,0 +1,72 @@ +--- +title: Cluster autoscaler fails to scale with cannot scale cluster autoscaler enabled node pool error +description: Learn how to troubleshoot the cannot scale cluster autoscaler enabled node pool error when your autoscaler isn't scaling up or down. +author: sgeannina +ms.author: ninasegares +ms.date: 06/09/2024 +ms.reviewer: aritraghosh, chiragpa +ms.service: azure-kubernetes-service +ms.custom: sap:Create, Upgrade, Scale and Delete operations (cluster or nodepool), innovation-engine +--- + +# Cluster autoscaler fails to scale with "cannot scale cluster autoscaler enabled node pool" error + +This article discusses how to resolve the "cannot scale cluster autoscaler enabled node pool" error that appears when scaling a cluster with an autoscaler enabled node pool. + +## Symptoms + +You receive an error message that resembles the following message: + +> `kubectl get nodes` outputs "No resources found" +> All pods state is `Pending` +> Scale operations are failing with "Cannot scale cluster autoscaler enabled node pool" error + +## Troubleshooting checklist + +Azure Kubernetes Service (AKS) uses virtual machine scale sets-based agent pools, which contain cluster nodes and [cluster autoscaling capabilities](/azure/aks/cluster-autoscaler) if enabled. + +### Check that the cluster virtual machine scale set exists + +1. Sign in to [Azure portal](https://portal.azure.com). +1. 
Find the node resource group by searching the following names: + + - The default name `MC_{AksResourceGroupName}_{YourAksClusterName}_{AksResourceLocation}`. + - The custom name (if it was provided at creation). + + > [!NOTE] + > When you create a new cluster, AKS automatically creates a second resource group to store the AKS resources. For more information, see [Why are two resource groups created with AKS?](/azure/aks/faq#why-are-two-resource-groups-created-with-aks) + +1. Check the list of resources and make sure that there's a virtual machine scale set. + +## Cause 1: The cluster virtual machine scale set was deleted + +Deleting the virtual machine scale set attached to the cluster causes the cluster autoscaler to fail. It also causes issues when provisioning resources such as nodes and pods. + +> [!NOTE] +> Modifying any resource under the node resource group in the AKS cluster is an unsupported action and will cause cluster operation failures. You can prevent changes from being made to the node resource group by [blocking users from modifying resources](/azure/aks/cluster-configuration#fully-managed-resource-group-preview) managed by the AKS cluster. + +## Cause 2: Tags or any other properties were modified from the node resource group + +You may receive scaling errors if you modify or delete Azure-created tags and other resource properties in the node resource group. For more information, see [Can I modify tags and other properties of the AKS resources in the node resource group?](/azure/aks/faq#can-i-modify-tags-and-other-properties-of-the-aks-resources-in-the-node-resource-group) + +## Cause 3: The cluster node resource group was deleted + +Deleting the cluster node resource group causes issues when provisioning the infrastructure resources required by the cluster, which causes the cluster autoscaler to fail. + +## Solution: Update the cluster to the goal state without changing the configuration + +To resolve this issue, you can run the following command to recover the deleted virtual machine scale set or any tags (missing or modified): + +> [!NOTE] +> It might take a few minutes until the operation completes. + +Set your environment variables for the AKS cluster resource group and cluster name before running the command. A random suffix is included to prevent name collisions during repeatable executions, but you must ensure the resource group and cluster exist. + +```azurecli +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export AKS_RG_NAME="MyAksResourceGroup$RANDOM_SUFFIX" +export AKS_CLUSTER_NAME="MyAksCluster$RANDOM_SUFFIX" +az aks update --resource-group $AKS_RG_NAME --name $AKS_CLUSTER_NAME --no-wait +``` + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret.md new file mode 100644 index 000000000..5d0bbf509 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret.md @@ -0,0 +1,73 @@ +--- +title: AADSTS7000222 - BadRequest or InvalidClientSecret error +description: Learn how to troubleshoot the BadRequest or InvalidClientSecret error when you try to create or upgrade an Azure Kubernetes Service (AKS) cluster. 
+ms.topic: article +ms.date: 06/13/2024 +author: axelgMS +ms.author: axelg +ms.custom: sap:Create, Upgrade, Scale and Delete operations (cluster or nodepool), innovation-engine +--- + +# AADSTS7000222 - BadRequest or InvalidClientSecret error + +This article discusses how to identify and resolve the `AADSTS7000222` error (`BadRequest` or `InvalidClientSecret`) that occurs when you try to create or upgrade a Microsoft Azure Kubernetes Service (AKS) cluster. + +## Prerequisites + +- [Azure CLI](/cli/azure/install-azure-cli) + +## Symptoms + +When you try to create or upgrade an AKS cluster, you receive one of the following error messages. + +| Error code | Message | +|--|--| +| `BadRequest` | **The credentials in ServicePrincipalProfile were invalid.** Please see for more details. (Details: adal: Refresh request failed. Status Code = '401'. Response body: {"error": "invalid_client", "error_description": "**AADSTS7000222: The provided client secret keys for app '\' are expired.** Visit the Azure portal to create new keys for your app: , or consider using certificate credentials for added security: ." | +| `InvalidClientSecret` | **Customer auth is not valid for tenant: \**: adal: Refresh request failed. Status Code = '401'. Response body: {"error": "invalid_client", "error_description": "**AADSTS7000222: The provided client secret keys for app '\' are expired.** Visit the Azure portal to create new keys for your app: , or consider using certificate credentials for added security: ." | + +## Cause + +The issue that generates this service principal alert usually occurs for one of the following reasons: + +- The client secret expired. + +- Incorrect credentials were provided. + +- The service principal doesn't exist within the Microsoft Entra ID tenant of the subscription. + +#### Verify the cause + +Use the following commands to retrieve the service principal profile for your AKS cluster and check the expiration date of the service principal. Make sure to set the appropriate variables for your AKS resource group and cluster name. + +```azurecli +SP_ID=$(az aks show --resource-group $RESOURCE_GROUP_NAME \ + --name $AKS_CLUSTER_NAME \ + --query servicePrincipalProfile.clientId \ + --output tsv) +az ad app credential list --id "$SP_ID" +``` + +Alternatively, you can verify that the service principal name and secret are correct and aren't expired. To do this, follow these steps: + +1. In the [Azure portal](https://portal.azure.com), search for and select **Microsoft Entra ID**. + +1. In the navigation pane of Microsoft Entra ID, select **App registrations**. + +1. On the **Owned applications** tab, select the affected application. + +1. Find the service principal name and secret information, and verify that the information is correct and current. + +## Solution + +1. In the [Update or rotate the credentials for an AKS cluster](/azure/aks/update-credentials) article, follow the instructions in one of the following article sections, as appropriate: + + - [Reset the existing service principal credentials](/azure/aks/update-credentials#reset-the-existing-service-principal-credentials) + - [Create a new service principal](/azure/aks/update-credentials#create-a-new-service-principal) + +1. Using your new service principal credentials, follow the instructions in the [Update AKS cluster with service principal credentials](/azure/aks/update-credentials#update-aks-cluster-with-service-principal-credentials) section of that article. 
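
For reference, the following sketch combines those two steps by using the Azure CLI. It reuses the `$RESOURCE_GROUP_NAME`, `$AKS_CLUSTER_NAME`, and `$SP_ID` variables from the earlier verification step and follows the approach described in the linked article; it isn't a substitute for reviewing that article.

```azurecli
# Sketch: generate a new secret for the existing service principal, then update the cluster with it.
# Resetting the credential invalidates the old secret.
NEW_SECRET=$(az ad sp credential reset --id "$SP_ID" --query password --output tsv)

az aks update-credentials \
    --resource-group $RESOURCE_GROUP_NAME \
    --name $AKS_CLUSTER_NAME \
    --reset-service-principal \
    --service-principal "$SP_ID" \
    --client-secret "$NEW_SECRET"
```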
+ +## More information + +- [Use a service principal with Azure Kubernetes Service (AKS)](/azure/aks/kubernetes-service-principal) (especially the [Troubleshoot](/azure/aks/kubernetes-service-principal#troubleshoot) section) + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror.md new file mode 100644 index 000000000..62fe39a22 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror.md @@ -0,0 +1,131 @@ +--- +title: Troubleshoot Container Network Interface download failures +description: Learn how to resolve Container Network Interface download failures when you try to create and deploy an Azure Kubernetes Service (AKS) cluster. +ms.topic: article +ms.date: 06/12/2024 +author: v-jsitser +ms.author: v-jsitser +ms.custom: sap:Create, Upgrade, Scale and Delete operations (cluster or nodepool), innovation-engine +editor: v-jsitser +ms.reviewer: axelg, chiragpa, mariochaves, v-weizhu, v-leedennis +#Customer intent: As an Azure Kubernetes user, I want to troubleshoot the container network interface download failures so that I can successfully create and deploy an Azure Kubernetes Service (AKS) cluster. +--- + +# Troubleshoot Container Network Interface download failures + +This article discusses how to identify and resolve the `CniDownloadTimeoutVMExtensionError` error code (also known as error code `ERR_CNI_DOWNLOAD_TIMEOUT`, error number 41) or the `WINDOWS_CSE_ERROR_DOWNLOAD_CNI_PACKAGE` error code (error number 35) that occurs when you try to create and deploy a Microsoft Azure Kubernetes Service (AKS) cluster. + +## Prerequisites + +- The [Curl](https://curl.se/download.html) command-line tool +- Network access from the same environment where AKS nodes will be deployed (same VNet, firewall rules, etc.) + +## Symptoms + +When you try to create a Linux-based AKS cluster, you receive the following error message: + +```output +Message: We are unable to serve this request due to an internal error +SubCode: CniDownloadTimeoutVMExtensionError; +Message="VM has reported a failure when processing extension 'vmssCSE'. +Error message: "Enable failed: failed to execute command: command terminated with exit status=41\n[stdout]\n{ +"ExitCode": "41", +``` + +When you try to create a Windows-based AKS cluster, you receive the following error message: + +```output +Message="VM has reported a failure when processing extension 'vmssCSE' (publisher 'Microsoft.Compute' and type 'CustomScriptExtension'). +Error message: 'Command execution finished, but failed because it returned a non-zero exit code of: '1'. The command had an error output of: 'ExitCode: |35|, +Output: |WINDOWS_CSE_ERROR_DOWNLOAD_CNI_PACKAGE|, Error: |Failed in downloading \r\nhttps://acs-mirror.azureedge.net/azure-cni/v1.4.56/binaries/azure-vnet-cni-overlay-windows-amd64-v1.4.56.zip. +Error: \r\nUnable to connect to the r|\r\nAt line:1 ...' +For more information, check the instance view by executing Get-AzVmssVm or Get-AzVm (https://aka.ms/GetAzVm). These commands can be executed using CloudShell (https://aka.ms/CloudShell)'. More information on troubleshooting is available at https://aka.ms/VMExtensionCSEWindowsTroubleshoot. 
+``` + +## Cause + +Your cluster nodes can't connect to the endpoint that's used to download the Container Network Interface (CNI) libraries. In most cases, this issue occurs because a network virtual appliance is blocking Secure Sockets Layer (SSL) communication or an SSL certificate. + +## Solution + +Run a Curl command to verify that your nodes can download the binaries: + +First, attempt a test download of the Azure CNI package for Linux from the official mirror endpoint. + +```bash +curl -I https://acs-mirror.azureedge.net/cni/azure-vnet-cni-linux-amd64-v1.0.25.tgz +``` + +Results: + + + +```output +HTTP/2 200 +content-length: 970752 +content-type: application/x-gzip +last-modified: Wed, 22 Jun 2022 00:00:00 GMT +etag: "0x8DA53F1234567" +server: ECAcc (dab/4B9E) +x-cache: HIT +cache-control: public, max-age=86400 +accept-ranges: bytes +date: Thu, 05 Jun 2025 00:00:00 GMT +``` + +This command checks if the endpoint is reachable and returns the HTTP headers. If you see a `200 OK` response, it indicates that the endpoint is accessible. + +Next, attempt a download with validation and save the file locally for further troubleshooting. This will help determine if SSL or outbound connectivity is correctly configured. + +```bash +# Create a temporary directory for testing +mkdir -p /tmp/cni-test + +# Download the CNI package to the temp directory +curl -L --fail https://acs-mirror.azureedge.net/cni/azure-vnet-cni-linux-amd64-v1.0.25.tgz --output /tmp/cni-test/azure-vnet-cni-linux-amd64-v1.0.25.tgz && echo "Download successful" || echo "Download failed" +``` + +Results: + + + +```output + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed +100 6495k 100 6495k 0 0 8234k 0 --:--:-- --:--:-- --:--:-- 8230k +Download successful +``` + +Verify the downloaded file: + +```bash +ls -la /tmp/cni-test/ +file /tmp/cni-test/azure-vnet-cni-linux-amd64-v1.0.25.tgz +``` + +Results: + + + +```output +total 6500 +drwxr-xr-x 2 user user 4096 Jun 20 10:30 . +drwxrwxrwt 8 root root 4096 Jun 20 10:30 .. +-rw-r--r-- 1 user user 6651392 Jun 20 10:30 azure-vnet-cni-linux-amd64-v1.0.25.tgz + +/tmp/cni-test/azure-vnet-cni-linux-amd64-v1.0.25.tgz: gzip compressed data, from Unix, original size modulo 2^32 20070400 +``` + +Clean up the test files: + +```bash +rm -rf /tmp/cni-test/ +``` + +If you can't download these files, make sure that traffic is allowed to the downloading endpoint. For more information, see [Azure Global required FQDN/application rules](/azure/aks/outbound-rules-control-egress#azure-global-required-fqdn--application-rules). + +## References + +- [General troubleshooting of AKS cluster creation issues](troubleshoot-aks-cluster-creation-issues.md) + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed.md new file mode 100644 index 000000000..4d57eca7d --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed.md @@ -0,0 +1,79 @@ +--- +title: Troubleshoot cluster upgrading and scaling errors +description: Troubleshoot errors that occur when you try to upgrade or scale an Azure Kubernetes Service (AKS) cluster. 
+ms.topic: article +ms.date: 06/12/2024 +author: v-jsitser +ms.author: v-jsitser +ms.custom: sap:Create, Upgrade, Scale and Delete operations (cluster or nodepool), innovation-engine +--- + +# Troubleshoot cluster upgrading and scaling errors + +This article discusses how to troubleshoot errors that occur when you try to upgrade or scale a Microsoft Azure Kubernetes Service (AKS) cluster. + +Some causes of failure when you try to upgrade or scale an AKS cluster are as follows. + +## Cause 1: Cluster is in a failed state + +If a cluster is in a `failed` state, `upgrade` or `scale` operations won't succeed. A cluster can enter a failed state for many reasons. + +Here are the most common reasons and corresponding solutions: + +- Scaling while having an insufficient Compute Resource Provider (CRP) quota. + + To resolve this issue, increase your resource quota before you scale by following these steps: + + 1. Scale your cluster back to a stable goal state within the quota. + + 2. [Request an increase in your resource quota](/azure/azure-resource-manager/troubleshooting/error-resource-quota#solution). + + 3. Try to scale up again beyond the initial quota limits. + + 4. Retry the original operation. This second operation should bring your cluster to a successful state. + +- Scaling a cluster that uses advanced networking such as Azure Container Networking Interface (CNI), Azure CNI for dynamic IP allocation but has insufficient subnet (networking) resources. + + To resolve this issue, see [Troubleshoot the SubnetIsFull error code](error-code-subnetisfull.md). + +- Upgrading a cluster that has Pod Disruption Budgets (PDBs) which may cause eviction failures. + + To resolve this issue, remove or adjust the PDB so that the pod can be drained. For more information, see [Troubleshoot UpgradeFailed errors due to eviction failures caused by PDBs](error-code-poddrainfailure.md). + +- Upgrading a cluster that uses deprecated APIs. + + For Kubernetes versions upgrading to 1.26 or later, AKS checks whether deprecated APIs are used before starting the cluster upgrade. To resolve this issue and start to upgrade, see [How to mitigate stopped upgrade operations due to deprecated APIs](/azure/aks/stop-cluster-upgrade-api-breaking-changes#mitigate-stopped-upgrade-operations). + +## Cause 2: You're trying to upgrade and scale at the same time + +A cluster or node pool can't simultaneously upgrade and scale. Instead, each operation type must finish on the target resource before the next request runs on that same resource. Therefore, operations are limited when active upgrade or scale operations are occurring or attempted. + +To resolve this issue, follow these steps: + +1. Determine the current status of your cluster before you try an operation. + + To retrieve detailed status about your cluster, run the following [az aks show](/cli/azure/aks#az-aks-show) command: + + ```azurecli + az aks show --resource-group $RESOURCE_GROUP_NAME --name $CLUSTER_NAME --output table + ``` + + Results: + + + + ```output + Name Location ResourceGroup KubernetesVersion ProvisioningState Fqdn + ------------- ----------- ------------------- ------------------- ------------------- --------------- + myAKSClusterx eastus2 myResourceGroupx 1.27.x Succeeded xxxxx.xxxxxx.x + ``` + +2. 
Refer to the following table to take the appropriate action based on the cluster's status: + + | ProvisioningState | Action | + |-------------------------------|-----------------------------------------------------------------------------------------| + | Upgrading | Wait until the operation finishes. | + | Failed | Follow the solutions that are outlined in [Cause 1](#cause-1-cluster-is-in-a-failed-state). | + | Succeeded | Retry the scale or other previously failed operation. | + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] \ No newline at end of file diff --git a/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues.md b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues.md new file mode 100644 index 000000000..779b5aaf6 --- /dev/null +++ b/scenarios/SupportArticles-docs/support/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues.md @@ -0,0 +1,122 @@ +--- +title: Azure Kubernetes Service Cost Analysis add-on issues +description: Learn how to resolve issues that occur when you try to enable the Azure Kubernetes Service (AKS) Cost Analysis add-on. +ms.date: 06/25/2024 +author: kaysieyu +ms.author: kaysieyu +ms.reviewer: pram, chiragpa, joharder, cssakscic, dafell, v-leedennis, v-weizhu +editor: v-jsitser +ms.service: azure-kubernetes-service +ms.custom: sap:Extensions, Policies and Add-Ons, references_regions, innovation-engine +--- + +# AKS Cost Analysis add-on issues + +This article discusses how to troubleshoot problems that you might experience when you enable the Microsoft Azure Kubernetes Service (AKS) Cost Analysis add-on during cluster creation or a cluster update. + +## Prerequisites + +- [Azure CLI](/cli/azure/install-azure-cli) + +## Symptoms + +After you create or update an AKS cluster, you receive an error message in the following format: + +| Error code | Cause | +|--|--| +| `InvalidDiskCSISettingForCostAnalysis` | [Cause 1: Azure Disk CSI driver is disabled](#cause-1-azure-disk-csi-driver-is-disabled) | +| `InvalidManagedIdentitySettingForCostAnalysis` | [Cause 2: Managed identity is disabled](#cause-2-managed-identity-is-disabled) | +| `CostAnalysisNotEnabledInRegion` | [Cause 3: The add-on is unavailable in your region](#cause-3-the-add-on-is-unavailable-in-your-region) | +| `InvalidManagedClusterSKUForFeature` | [Cause 4: The add-on is unavailable on the free pricing tier](#cause-4-the-add-on-is-unavailable-on-the-free-pricing-tier) | +| Pod `OOMKilled` | [Cause 5: The cost-analysis-agent pod gets the OOMKilled error](#cause-5-the-cost-analysis-agent-pod-gets-the-oomkilled-error) | +| Pod `Pending` | [Cause 6:The cost-analysis-agent pod is stuck in the Pending state](#cause-6-the-cost-analysis-agent-pod-is-stuck-in-the-pending-state) | + +## Cause 1: Azure Disk CSI driver is disabled + +You can't enable the Cost Analysis add-on on a cluster in which the [Azure Disk Container Storage Interface (CSI) driver](/azure/aks/azure-disk-csi) is disabled. + +### Solution: Update the cluster to enable the Azure Disk CSI driver + +Run the [az aks update][aks-update] command, and specify the `--enable-disk-driver` parameter. This parameter enables the Azure Disk CSI driver in AKS. 
+ +First, define the environment variables for your resource group and AKS cluster, using unique values for repeated runs: + +```azurecli +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export RESOURCE_GROUP="my-aks-resource-group$RANDOM_SUFFIX" +export AKS_CLUSTER="my-aks-cluster$RANDOM_SUFFIX" +az aks update --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --enable-disk-driver +``` + +For more information, see [CSI drivers on AKS](/azure/aks/csi-storage-drivers). + +## Cause 2: Managed identity is disabled + +You can enable the Cost Analysis add-on only on a cluster that has a system-assigned or user-assigned managed identity. + +### Solution: Update the cluster to enable managed identity + +Run the [az aks update][aks-update] command, and specify the `--enable-managed-identity` parameter: + +```azurecli +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export RESOURCE_GROUP="my-aks-resource-group$RANDOM_SUFFIX" +export AKS_CLUSTER="my-aks-cluster$RANDOM_SUFFIX" +az aks update --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --enable-managed-identity +``` + +For more information, see [Use a managed identity in AKS](/azure/aks/use-managed-identity). + +## Cause 3: The add-on is unavailable in your region + +The Cost Analysis add-on isn't currently enabled in your region. + +> [!NOTE] +> The AKS Cost Analysis add-on is currently unavailable in the following regions: +> +> - `usnateast` +> - `usnatwest` +> - `usseceast` +> - `ussecwest` + + +## Cause 4: The add-on is unavailable on the free pricing tier + +You can't enable the Cost Analysis add-on on AKS clusters that are on the free pricing tier. + +### Solution: Update the cluster to use the Standard or Premium pricing tier + +Upgrade the AKS cluster to the Standard or Premium pricing tier. To do this, run the below [az aks update][aks-update] command that specify the `--tier` parameter. The `--tier` parameter can be set to either `standard` or `premium` (example below shows `standard`): + +```azurecli +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export RESOURCE_GROUP="my-aks-resource-group$RANDOM_SUFFIX" +export AKS_CLUSTER="my-aks-cluster$RANDOM_SUFFIX" +az aks update --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --tier standard +``` + +For more information, see [Free and Standard pricing tiers for AKS cluster management](/azure/aks/free-standard-pricing-tiers). + +## Cause 5: The cost-analysis-agent pod gets the OOMKilled error + +The current memory limit for the cost-analysis-agent pod is set to 4 GB. + +The pod's usage depends on the number of deployed containers, which can be roughly 200 MB + 0.5 MB per container. The current memory limit supports approximately 7000 containers per cluster. + +When the pod's usage exceeds the allocated 4 GB limit, large clusters may experience the `OOMKill` error. + +### Solution: Disable the add-on + +Currently, customizing or manually increasing memory limits for the add-on isn't supported. To resolve this issue, disable the add-on. + +## Cause 6: The cost-analysis-agent pod is stuck in the Pending state + +If the pod is stuck in the Pending state with the FailedScheduling error, the nodes in the cluster have exhausted memory capacity. + +### Solution: Ensure there's sufficient allocatable memory + +The current memory request of the cost-analysis-agent pod is set to 500 MB. 
Ensure that there's sufficient allocatable memory for the pod to be scheduled + +[!INCLUDE [Azure Help Support](../../../includes/azure-help-support.md)] + +[aks-update]: /cli/azure/aks#az-aks-update \ No newline at end of file diff --git a/scenarios/UseIGOnAKS/alert-bad-process.yaml b/scenarios/UseIGOnAKS/alert-bad-process.yaml new file mode 100644 index 000000000..c7f812a2f --- /dev/null +++ b/scenarios/UseIGOnAKS/alert-bad-process.yaml @@ -0,0 +1,13 @@ +apiVersion: 1 +kind: instance-spec +image: trace_exec:v0.38.0 +name: alert-bad-process +paramValues: + # monitor all namespaces + operator.KubeManager.all-namespaces: true + # monitor shell executions (only bash on this example) + operator.filter.filter: proc.comm==bash + # name of the metric to export + operator.otel-metrics.otel-metrics-name: 'exec:shell_executions' + # annotate gadget to enable metrics collection + operator.oci.annotate: exec:metrics.collect=true,exec:metrics.implicit-counter.name=shell_executions,exec.k8s.namespace:metrics.type=key,exec.k8s.podname:metrics.type=key,exec.k8s.containername:metrics.type=key diff --git a/scenarios/UseIGOnAKS/ama-metrics-settings-configmap.yaml b/scenarios/UseIGOnAKS/ama-metrics-settings-configmap.yaml new file mode 100644 index 000000000..73481cb01 --- /dev/null +++ b/scenarios/UseIGOnAKS/ama-metrics-settings-configmap.yaml @@ -0,0 +1,84 @@ +kind: ConfigMap +apiVersion: v1 +data: + schema-version: + #string.used by agent to parse config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by customer to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + prometheus-collector-settings: |- + cluster_alias = "" + default-scrape-settings-enabled: |- + kubelet = true + coredns = false + cadvisor = true + kubeproxy = false + apiserver = false + kubestate = true + nodeexporter = true + windowsexporter = false + windowskubeproxy = false + kappiebasic = true + networkobservabilityRetina = true + networkobservabilityHubble = true + networkobservabilityCilium = true + prometheuscollectorhealth = false + controlplane-apiserver = true + controlplane-cluster-autoscaler = false + controlplane-kube-scheduler = false + controlplane-kube-controller-manager = false + controlplane-etcd = true + acstor-capacity-provisioner = true + acstor-metrics-exporter = true + # Regex for which namespaces to scrape through pod annotation based scraping. + # This is none by default. + # Ex: Use 'namespace1|namespace2' to scrape the pods in the namespaces 'namespace1' and 'namespace2'. 
+ pod-annotation-based-scraping: |- + podannotationnamespaceregex = "" + default-targets-metrics-keep-list: |- + kubelet = "" + coredns = "" + cadvisor = "" + kubeproxy = "" + apiserver = "" + kubestate = "" + nodeexporter = "" + windowsexporter = "" + windowskubeproxy = "" + podannotations = "" + kappiebasic = "" + networkobservabilityRetina = "" + networkobservabilityHubble = "" + networkobservabilityCilium = "" + controlplane-apiserver = "" + controlplane-cluster-autoscaler = "" + controlplane-kube-scheduler = "" + controlplane-kube-controller-manager = "" + controlplane-etcd = "" + acstor-capacity-provisioner = "" + acstor-metrics-exporter = "" + minimalingestionprofile = true + default-targets-scrape-interval-settings: |- + kubelet = "30s" + coredns = "30s" + cadvisor = "30s" + kubeproxy = "30s" + apiserver = "30s" + kubestate = "30s" + nodeexporter = "30s" + windowsexporter = "30s" + windowskubeproxy = "30s" + kappiebasic = "30s" + networkobservabilityRetina = "30s" + networkobservabilityHubble = "30s" + networkobservabilityCilium = "30s" + prometheuscollectorhealth = "30s" + acstor-capacity-provisioner = "30s" + acstor-metrics-exporter = "30s" + podannotations = "30s" + debug-mode: |- + enabled = false +metadata: + name: ama-metrics-settings-configmap + namespace: kube-system \ No newline at end of file diff --git a/scenarios/UseIGOnAKS/use-ig-on-aks.md b/scenarios/UseIGOnAKS/use-ig-on-aks.md new file mode 100644 index 000000000..38acf8803 --- /dev/null +++ b/scenarios/UseIGOnAKS/use-ig-on-aks.md @@ -0,0 +1,198 @@ +--- +title: Comprehensive Guide to Using Inspektor Gadget in Kubernetes +description: This Exec Doc provides a detailed walkthrough of a shell script that demonstrates various operations with the Inspektor Gadget in a Kubernetes environment. It explains each functional block, how the gadget plugin is installed, deployed, and used to run examples, export metrics, and verify configurations. +ms.topic: article +ms.date: 03/19/2025 +author: yourgithubusername +ms.author: yourmsalias +ms.custom: innovation-engine, kubernetes, gadget, monitoring +--- + +# Detailed Walkthrough: Inspektor Gadget Shell Script + +This document provides a step-by-step explanation of the provided shell script. The script demonstrates several operations related to the Inspektor Gadget in a Kubernetes environment. Each section below explains the purpose and the functionality of the code blocks that follow. The commands remain unchanged; only the documentation around them has been added for clarity. + +--- + +## Connecting to Your AKS Cluster + +Before running any commands, ensure that your local environment is connected to the desired AKS (Azure Kubernetes Service) cluster. Use the following command to retrieve the cluster credentials and configure `kubectl` to interact with the cluster: + +```bash +# Retrieve AKS cluster credentials: +az aks get-credentials --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME +``` + +After executing this command, `kubectl` will be configured to communicate with the specified AKS cluster. + +--- + +## Viewing AKS Cluster Nodes + +In this section, the script lists the nodes of the current AKS (Azure Kubernetes Service) cluster using the Kubernetes CLI (`kubectl`). This allows you to verify that your cluster is up and running and view the status of the nodes. + +```bash +# Show AKS cluster: + +kubectl get nodes +``` + +After executing this block, the output will display the current nodes in the cluster along with their status, roles, and version information. 
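
For reference, the output of this command looks similar to the following example; the node names, count, age, and version values will differ in your cluster.

```output
NAME                             STATUS   ROLES   AGE   VERSION
aks-nodepool1-xxxxx-vmss000000   Ready    agent   3d    v1.xx.x
aks-nodepool1-xxxxx-vmss000001   Ready    agent   3d    v1.xx.x
```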
+ +--- + +## Installing the Inspektor Gadget Plugin + +This section installs the Inspektor Gadget plugin using `kubectl krew`. The gadget plugin extends kubectl with additional functionalities, enabling more effective monitoring and tracing within the cluster. + +```bash +# Install kubectl plugin: + +kubectl krew install gadget +``` + +Once installed, the gadget plugin is available for subsequent commands in the script. + +--- + +## Verifying Gadget Plugin Version + +Here, the script verifies the version and server status of the gadget plugin. It checks that the plugin is correctly installed and provides details about its client and server versions. The expected output is a client version (e.g., vX.Y.Z) and a note that the server version is not available. + +```bash +# Verify version and server status: + +kubectl gadget version +# Expected output: +# Client version: vX.Y.Z +# Server version: not available +``` + +This output helps determine that the gadget plugin is operational on your local client. You may compare the shown version with the expected output. + +--- + +## Deploying Inspektor Gadget and Re-Verification + +In this section, the script deploys the Inspektor Gadget in the Kubernetes environment. The command includes options to enable the OpenTelemetry (OTEL) metrics listener on the specified address (0.0.0.0:2223). After deploying, the version command is run again to verify that the gadget deployment is correctly configured, even though the server version remains "not available". + +```bash +# Deploy Inspektor Gadget: + +kubectl gadget deploy --otel-metrics-listen --otel-metrics-listen-address 0.0.0.0:2223 + +# Verify version and server status: + +kubectl gadget version +# Expected output: +# Client version: vX.Y.Z +# Server version: not available +``` + +This deployment sets up the gadget to collect the required metrics, and the follow-up version check confirms that the plugin is still active. + +--- + +## Demonstrating Gadget Usage with trace_exec + +This section illustrates different methods to run the gadget plugin using the `trace_exec` example. The commands include: + +1. Running the gadget with a specific trace_exec version. +2. Creating a test pod running Ubuntu in an interactive session, which is automatically removed after exit. +3. Running the gadget with JSON formatted output. +4. Running the gadget with filtering to display only processes with the command matching "bash". + +These examples show various ways to leverage the gadget for tracing executions in the cluster. + +```bash +# Run simple example with trace_exec with a 10-second timeout to prevent indefinite execution: +timeout 5s kubectl gadget run trace_exec || true + +kubectl delete pod demo-pod + +# Create a background pod that will generate events for us to trace: +kubectl run demo-pod --image=ubuntu -- /bin/bash -c "for i in {1..11}; do echo Running commands...; ls -la /; sleep 1; done" + +# Wait briefly for the pod to start generating events +sleep 5 + +# Run gadget with JSON output and timeout +timeout 5s kubectl gadget run trace_exec --output jsonpretty || true + +# Run gadget with filtering and timeout +timeout 5s kubectl gadget run trace_exec --all-namespaces --filter proc.comm=bash || echo "Attachment timed out, continuing with demo" +``` + +Each command demonstrates a different facet of the gadget's capabilities, from initiating traces to filtering outputs based on process names. 
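
The demo pod created for these examples isn't removed automatically. If you want to clean it up before moving on, one way to do so is shown in the following sketch.

```bash
# Optional cleanup: remove the demo pod created earlier.
# --ignore-not-found avoids an error if the pod has already been deleted.
kubectl delete pod demo-pod --ignore-not-found
```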
+ +--- + +## Creating Metrics Configuration for Alerting + +In this part of the script, a metrics configuration file is edited. The file (alert-bad-process.yaml) is intended to define rules to generate a metric based on certain events in the cluster. The metric, in this context, is used to track shell executions. + +```bash +# Generate a metric based on these events: + +cat alert-bad-process.yaml +``` + +--- + +## Exporting Metrics and Managing Gadget Lifecycle + +This section deploys the gadget manifest using the YAML file created in the previous section. The command includes several annotations to instruct the gadget to collect metrics. The process is detached so that it runs in the background. Subsequently, the script lists the running gadget instances. + +```bash +# Clean up any existing instance of the same name +kubectl gadget delete alert-bad-process + +# Run gadget manifest to export metrics: +kubectl gadget run -f alert-bad-process.yaml --annotate exec:metrics.collect=true,exec:metrics.implicit-counter.name=shell_executions,exec.k8s.namespace:metrics.type=key,exec.k8s.podname:metrics.type=key,exec.k8s.containername:metrics.type=key --detach +``` + +These commands ensure that metrics are being collected as defined in the YAML manifest and verify that the gadget is running correctly in headless mode. + +--- + +## Verifying Prometheus Configuration for Metrics Collection + +This section checks the managed Prometheus configuration to ensure that it is set up to scrape metrics from the OTEL listener endpoint exposed on each Inspektor Gadget pod. The first command retrieves the relevant configmap, and the second command displays its full YAML definition with a pager for detailed inspection. Review the output to confirm that the configuration contains the expected annotation for pod-based scraping related to the gadget. + +```bash +# Configure managed Prometheus to collect data from the OTEL listener endpoint we expose on each IG pod? +# Documentation: https://learn.microsoft.com/en-us/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#configmaps + +kubectl get configmaps -n kube-system ama-metrics-settings-configmap + +# It should contain: pod-annotation-based-scraping: podannotationnamespaceregex = "gadget" +kubectl get configmaps -n kube-system ama-metrics-settings-configmap -o yaml | grep -A 5 "pod-annotation-based-scraping" +``` + +--- + +## Monitoring, Alerting, and Cleanup + +In the final part of the script, the focus shifts to monitoring and alerting: + +1. It provides guidance for viewing the `shell_executions_total` metric in the Grafana dashboard. +2. It suggests creating a Prometheus group alert with a rule that triggers when `shell_executions_total` exceeds 0. +3. Finally, the script undeploys the Inspektor Gadget to clean up resources. + +```bash +# Show shell_executions_total metric in Grafana dashboard: shell_executions_total +# Documentation: https://learn.microsoft.com/en-us/azure/managed-grafana/overview + +# Create a prometheus group alert with the rule "shell_executions_total > 0" +# Documentation: https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/prometheus-rule-groups + +# Undeploy IG +kubectl gadget undeploy +``` + +These steps ensure that your metrics are visually accessible via Grafana and that alerts are configured for proactive monitoring. 
The final undeploy command removes the deployed gadget from the cluster, wrapping up the execution workflow. + +## Next Steps +- [Real-world scenarios where Inspektor Gadget can help you](https://go.microsoft.com/fwlink/p/?linkid=2260402#use-cases) +- [Explore the available gadgets](https://go.microsoft.com/fwlink/p/?linkid=2260070) +- [Run your own eBPF program](https://go.microsoft.com/fwlink/p/?linkid=2259865) \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/access-control-managed-azure-ad.md b/scenarios/azure-aks-docs/articles/aks/access-control-managed-azure-ad.md new file mode 100644 index 000000000..6ff342dcd --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/access-control-managed-azure-ad.md @@ -0,0 +1,89 @@ +--- +title: Control cluster access using Conditional Access with AKS-managed Microsoft Entra integration +description: Learn how to access clusters using Conditional Access when integrating Microsoft Entra ID in your Azure Kubernetes Service (AKS) clusters. +ms.topic: concept-article +ms.subservice: aks-integration +ms.date: 06/25/2024 +author: nickomang +ms.author: nickoman +ms.custom: devx-track-azurecli, innovation-engine +--- + +# Control cluster access using Conditional Access with AKS-managed Microsoft Entra integration + +When you integrate Microsoft Entra ID with your AKS cluster, you can use [Conditional Access][aad-conditional-access] for just-in-time requests to control access to your cluster. This article shows you how to enable Conditional Access on your AKS clusters. + +> [!NOTE] +> Microsoft Entra Conditional Access has Microsoft Entra ID P1, P2, or Governance capabilities requiring a Premium P2 SKU. For more on Microsoft Entra ID licenses and SKUs, see [Microsoft Entra ID Governance licensing fundamentals][licensing-fundamentals] and [pricing guide][aad-pricing]. + +## Before you begin + +* See [AKS-managed Microsoft Entra integration](./managed-azure-ad.md) for an overview and setup instructions. + +## Use Conditional Access with Microsoft Entra ID and AKS + +1. In the Azure portal, go to the **Microsoft Entra ID** page and select **Enterprise applications**. +1. Select **Conditional Access** > **Policies** > **New policy**. +1. Enter a name for the policy, such as *aks-policy*. +1. Under **Assignments**, select **Users and groups**. Choose the users and groups you want to apply the policy to. In this example, choose the same Microsoft Entra group that has administrator access to your cluster. +1. Under **Cloud apps or actions** > **Include**, select **Select apps**. Search for **Azure Kubernetes Service** and select **Azure Kubernetes Service Microsoft Entra Server**. +1. Under **Access controls** > **Grant**, select **Grant access**, **Require device to be marked as compliant**, and **Require all the selected controls**. +1. Confirm your settings, set **Enable policy** to **On**, and then select **Create**. + +## Verify your Conditional Access policy has been successfully listed + +After implementing your Conditional Access policy, verify that it works as expected by accessing the AKS cluster and checking the sign-in activity. + +1. Get the user credentials to access the cluster using the [`az aks get-credentials`][az-aks-get-credentials] command. + + Assign values to the required environment variables. The AKS cluster and resource group must exist. 
+ + ```azurecli-interactive + export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) + export RESOURCE_GROUP="myResourceGroup$RANDOM_SUFFIX" + export AKS_CLUSTER="myManagedCluster$RANDOM_SUFFIX" + ``` + + Download credentials required to access your AKS cluster. + + ```azurecli-interactive + az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --overwrite-existing + ``` + +1. Follow the instructions to sign in. + +1. View the nodes in the cluster using the `kubectl get nodes` command. + + ```azurecli-interactive + kubectl get nodes + ``` + + Results: + + + + ```output + NAME STATUS ROLES AGE VERSION + aks-nodepool1-xxxxx-vmss000000 Ready agent 3d2h v1.xx.x + aks-nodepool1-xxxxx-vmss000001 Ready agent 3d2h v1.xx.x + ``` + +1. In the Azure portal, navigate to **Microsoft Entra ID** and select **Enterprise applications** > **Activity** > **Sign-ins**. + +1. Under the **Conditional Access** column you should see a status of *Success*. Select the event and then select the **Conditional Access** tab. Your Conditional Access policy will be listed. + +## Next steps + +For more information, see the following articles: + +* Use [kubelogin](https://github.com/Azure/kubelogin) to access features for Azure authentication that aren't available in kubectl. +* [Use Privileged Identity Management (PIM) to control access to your Azure Kubernetes Service (AKS) clusters][pim-aks]. + + +[aad-pricing]: https://azure.microsoft.com/pricing/details/active-directory/ + + +[aad-conditional-access]: /azure/active-directory/conditional-access/overview +[licensing-fundamentals]: /entra/id-governance/licensing-fundamentals +[az-aks-get-credentials]: /cli/azure/aks#az_aks_get_credentials +[pim-aks]: ./privileged-identity-management.md diff --git a/scenarios/azure-aks-docs/articles/aks/access-private-cluster.md b/scenarios/azure-aks-docs/articles/aks/access-private-cluster.md new file mode 100644 index 000000000..f03aa86d9 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/access-private-cluster.md @@ -0,0 +1,215 @@ +--- +title: 'Access a private Azure Kubernetes Service (AKS) cluster using the command invoke or Run command feature' +description: Learn how to access a private Azure Kubernetes Service (AKS) cluster using the Azure CLI command invoke feature or the Azure portal Run command feature. +ms.topic: concept-article +ms.subservice: aks-security +ms.custom: devx-track-azurecli,innovation-engine +ms.date: 06/13/2024 +author: schaffererin +ms.author: schaffererin +--- + +# Access a private Azure Kubernetes Service (AKS) cluster using the command invoke or Run command feature + +When you access a private AKS cluster, you need to connect to the cluster from the cluster virtual network, a peered network, or a configured private endpoint. These approaches require configuring a VPN, Express Route, deploying a *jumpbox* within the cluster virtual network, or creating a private endpoint inside of another virtual network. + +With the Azure CLI, you can use `command invoke` to access private clusters without the need to configure a VPN or Express Route. `command invoke` allows you to remotely invoke commands, like `kubectl` and `helm`, on your private cluster through the Azure API without directly connecting to the cluster. The `Microsoft.ContainerService/managedClusters/runcommand/action` and `Microsoft.ContainerService/managedclusters/commandResults/read` actions control the permissions for using `command invoke`. 
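
As an illustration of how narrowly that access can be scoped, the following sketch defines a hypothetical custom role that grants only those two actions. The role name is an example and `<subscription-id>` is a placeholder; built-in roles that already include these actions may be a better fit for your environment.

```azurecli
# Hypothetical custom role that grants only the permissions needed for command invoke / Run command.
az role definition create --role-definition '{
  "Name": "AKS Run Command User (example)",
  "Description": "Can invoke commands on AKS clusters through command invoke or the Run command feature.",
  "Actions": [
    "Microsoft.ContainerService/managedClusters/runcommand/action",
    "Microsoft.ContainerService/managedclusters/commandResults/read"
  ],
  "AssignableScopes": [ "/subscriptions/<subscription-id>" ]
}'
```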
+ +With the Azure portal, you can use the `Run command` feature to run commands on your private cluster. The `Run command` feature uses the same `command invoke` functionality to run commands on your cluster. + +The pod created by the `Run command` provides `kubectl` and `helm` for operating your cluster. `jq`, `xargs`, `grep`, and `awk` are available for Bash support. + +## Before you begin + +Before you begin, make sure you have the following resources and permissions: + +* An existing private cluster. If you don't have one, see [Create a private AKS cluster](./private-clusters.md). +* The Azure CLI version 2.24.0 or later. Run `az --version` to find the version. If you need to install or upgrade, see [Install Azure CLI](/cli/azure/install-azure-cli). +* Access to the `Microsoft.ContainerService/managedClusters/runcommand/action` and `Microsoft.ContainerService/managedclusters/commandResults/read` roles on the cluster. + +### Limitations + +This feature is designed to simplify cluster access and is ***not designed for programmatic access***. If you have a program invoke Kubernetes using `Run command`, the following disadvantages apply: + +* You only get *exitCode* and *text output*, and you lose API level details. +* One extra hop introduces extra failure points. + +The pod created by the `Run command` is hard coded with a `200m CPU` and `500Mi memory` request, and a `500m CPU` and `1Gi memory` limit. In rare cases where all your node is packed, the pod can't be scheduled within the ARM API limitation of 60 seconds. This means that the `Run command` would fail, even if it's configured to autoscale. + +`command invoke` runs the commands from your cluster, so any commands run in this manner are subject to your configured networking restrictions and any other configured restrictions. Make sure there are enough nodes and resources in your cluster to schedule this command pod. + +> [!NOTE] +> The output for `command invoke` is limited to 512kB in size. + +## Run commands on your AKS cluster + +### [Azure CLI - `command invoke`](#tab/azure-cli) + +Below are examples of how to use `az aks command invoke` to execute commands against a private AKS cluster. These examples assume you have an existing resource group and AKS cluster. + +#### Use `command invoke` to run a single command + +You can run a command on your cluster using the `az aks command invoke --command` command. The following example command runs the `kubectl get pods -n kube-system` command on the *myPrivateCluster* cluster in *myResourceGroup*. + +First, set environment variables for your resource group and cluster name to use in subsequent commands. + +```bash +export AKS_RESOURCE_GROUP="myResourceGroup" +export AKS_CLUSTER_NAME="myPrivateCluster" +``` + +The environment variables above will allow you to run AKS commands in the next sections without having to rewrite their names. + +To run a single kubectl command on your AKS cluster: + +```azurecli +az aks command invoke \ + --resource-group $AKS_RESOURCE_GROUP \ + --name $AKS_CLUSTER_NAME \ + --command "kubectl get pods -n kube-system" +``` + +#### Use `command invoke` to run multiple commands + +You can also run multiple commands. The following example executes three `helm` commands in sequence on the cluster. 
```azurecli
az aks command invoke \
  --resource-group $AKS_RESOURCE_GROUP \
  --name $AKS_CLUSTER_NAME \
  --command "helm repo add bitnami https://charts.bitnami.com/bitnami && helm repo update && helm install my-release bitnami/nginx"
```

#### Use `command invoke` to run commands with an attached file

When using the `--file` parameter with `az aks command invoke`, the file must exist and be accessible in your current working directory. Below, we create a minimal deployment file for demonstration.

To run a command with a file attached, first create a Kubernetes manifest file named `deployment.yaml`. The following example creates a small nginx deployment and applies it with `command invoke`:

```bash
cat <<EOF > deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx-demo
  template:
    metadata:
      labels:
        app: nginx-demo
    spec:
      containers:
      - name: nginx
        image: nginx:1.21.6
        ports:
        - containerPort: 80
EOF

az aks command invoke \
  --resource-group $AKS_RESOURCE_GROUP \
  --name $AKS_CLUSTER_NAME \
  --command "kubectl apply -f deployment.yaml -n default" \
  --file deployment.yaml
```

#### Use `command invoke` to run commands with all files in the current directory attached

Use only small, necessary files to avoid exceeding system size limits. Below, two minimal YAML files are created before attaching them.

```bash
cat <<EOF > deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx-demo
  template:
    metadata:
      labels:
        app: nginx-demo
    spec:
      containers:
      - name: nginx
        image: nginx:1.21.6
        ports:
        - containerPort: 80
EOF

cat <<EOF > configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: nginx-config
data:
  welcome-message: "Hello from configmap"
EOF

az aks command invoke \
  --resource-group $AKS_RESOURCE_GROUP \
  --name $AKS_CLUSTER_NAME \
  --command "kubectl apply -f deployment.yaml -f configmap.yaml -n default" \
  --file deployment.yaml \
  --file configmap.yaml
```

### [Azure portal - `Run command`](#tab/azure-portal)

To get started with `Run command`, navigate to your private cluster in the Azure portal. In the service menu, under **Kubernetes resources**, select **Run command**.

### `Run command` commands

You can use the following kubectl commands with the `Run command` feature:

* `kubectl get nodes`
* `kubectl get deployments`
* `kubectl get pods`
* `kubectl describe nodes`
* `kubectl describe pod <pod-name>`
* `kubectl describe deployment <deployment-name>`
* `kubectl apply -f <file-name>`

### Use `Run command` to run a single command

1. In the Azure portal, navigate to your private cluster.
2. In the service menu, under **Kubernetes resources**, select **Run command**.
3. Enter the command you want to run and select **Run**.

### Use `Run command` to run commands with attached files

1. In the Azure portal, navigate to your private cluster.
2. In the service menu, under **Kubernetes resources**, select **Run command**.
3. Select **Attach files** > **Browse for files**.

   :::image type="content" source="media/access-private-cluster/azure-portal-run-command-attach-files.png" alt-text="Screenshot of attaching files to the Azure portal Run command.":::

4. Select the file(s) you want to attach and then select **Attach**.
5. Enter the command you want to run and select **Run**.
+ +## Disable `Run command` + +Currently, the only way you can disable the `Run command` feature is by setting [`.properties.apiServerAccessProfile.disableRunCommand` to `true`](/rest/api/aks/managed-clusters/create-or-update). + +--- + +## Troubleshooting + +For information on the most common issues with `az aks command invoke` and how to fix them, see [Resolve `az aks command invoke` failures][command-invoke-troubleshoot]. + +## Next steps + +In this article, you learned how to access a private cluster and run commands on that cluster. For more information on AKS clusters, see the following articles: + +* [Use a private endpoint connection in AKS](./private-clusters.md#use-a-private-endpoint-connection) +* [Virtual networking peering in AKS](./private-clusters.md#virtual-network-peering) +* [Hub and spoke with custom DNS in AKS](./private-clusters.md#hub-and-spoke-with-custom-dns) + + +[command-invoke-troubleshoot]: /troubleshoot/azure/azure-kubernetes/resolve-az-aks-command-invoke-failures \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/aks-migration.md b/scenarios/azure-aks-docs/articles/aks/aks-migration.md new file mode 100644 index 000000000..c2873003a --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/aks-migration.md @@ -0,0 +1,308 @@ +--- +title: Migrate to Azure Kubernetes Service (AKS) +description: This article shows you how to migrate to Azure Kubernetes Service (AKS). +ms.topic: concept-article +ms.date: 06/12/2024 +author: your-github-username +ms.author: your-alias +ms.custom: mvc, devx-track-azurecli, innovation-engine +ms.collection: + - migration +--- + +# Migrate to Azure Kubernetes Service (AKS) + +To help you plan and execute a successful migration to Azure Kubernetes Service (AKS), this guide provides details for the current recommended AKS configuration. While this article doesn't cover every scenario, it contains links to more detailed information for planning a successful migration. + +In this article, we summarize migration details for: + +> [!div class="checklist"] +> +> * Containerizing applications through Azure Migrate +> * AKS with Azure Load Balancer (Standard) and Virtual Machine Scale Sets +> * Existing attached Azure services +> * Ensure valid quotas +> * High availability and business continuity +> * Considerations for stateless applications +> * Considerations for stateful applications +> * Deployment of your cluster configuration + +> [!NOTE] +> Depending on your scenario, the following open-source tools might help with your migration: +> +> * [Velero](https://velero.io/) (Requires Kubernetes 1.7+) +> * [Azure Kube CLI extension](https://github.com/yaron2/azure-kube-cli) + +## Before you begin + +* Ensure your target Kubernetes version is within the supported window for AKS. Older versions may not be within the supported range and require a version upgrade for AKS support. For more information, see [AKS supported Kubernetes versions](./supported-kubernetes-versions.md). +* If you're migrating to a newer version of Kubernetes, review the [Kubernetes version and version skew support policy](https://kubernetes.io/docs/setup/release/version-skew-policy/#supported-versions). + +An important practice that you should include as part of your migration process is remembering to follow commonly used deployment and testing patterns. Testing your application before deployment is an important step to ensure its quality, functionality, and compatibility with the target environment. 
It can help you identify and fix any errors, bugs, or issues that might affect the performance, security, or usability of the application or underlying infrastructure. + +## Use Azure Migrate to migrate your applications to AKS + +Azure Migrate offers a unified platform to assess and migrate to Azure on-premises servers, infrastructure, applications, and data. For AKS, you can use Azure Migrate for the following tasks: + +* [Containerizing ASP.NET applications and migrating to AKS](/azure/migrate/tutorial-app-containerization-aspnet-kubernetes). +* [Containerizing Java web applications and migrating to AKS](/azure/migrate/tutorial-app-containerization-java-kubernetes). + +## AKS with Standard Load Balancer and Virtual Machine Scale Sets + +AKS is a managed service offering unique capabilities with lower management overhead. Since AKS is a managed service, you must select from a set of AKS-supported [regions](./quotas-skus-regions.md). You may need to modify your existing applications to keep them healthy on the AKS-managed control plane during the transition from your existing cluster to AKS. + +We recommend using AKS clusters backed by [Virtual Machine Scale Sets](/azure/virtual-machine-scale-sets/) and [Load Balancer (Standard)](./load-balancer-standard.md) to ensure you get the following features: + +* [Multiple node pools](./create-node-pools.md), +* [Availability zones](/azure/reliability/availability-zones-overview), +* [Authorized IP ranges](./api-server-authorized-ip-ranges.md), +* [Cluster autoscaler](./cluster-autoscaler.md), +* [Azure Policy for AKS](/azure/governance/policy/concepts/policy-for-kubernetes), and +* Other new features as they're released. + +AKS clusters backed by [virtual machine (VM) availability sets](/azure/virtual-machines/availability#availability-sets) lack support for many of these features. + +### Create an AKS cluster with Load Balancer (Standard) and Virtual Machine Scale Sets + +### [Azure CLI](#tab/azure-cli) + +The following example creates an AKS cluster with single node pool backed by a Virtual Machine Scale Set. It enables the cluster autoscaler on the node pool for the cluster and sets a minimum of *one* and a maximum of *three* nodes. + +1. Create a resource group using the [`az group create`][az-group-create] command. + + First, export variables and add a random suffix to ensure resource names are unique. A reliable VM size is also specified for broad subscription compatibility. + + ```azurecli-interactive + export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) + export RESOURCE_GROUP="myResourceGroup$RANDOM_SUFFIX" + export REGION="eastus2" + az group create --name $RESOURCE_GROUP --location $REGION + ``` + + Results: + + + + ```output + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx", + "location": "eastus2", + "managedBy": null, + "name": "myResourceGroupxxx", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" + } + ``` + +2. Create an AKS cluster using the [`az aks create`][az-aks-create] command. + + Set the cluster name and create an AKS cluster with autoscaler and standard load balancer. The VM size is set to Standard_DS2_v2 for reliability in most subscriptions. 
+ + ```azurecli-interactive + export CLUSTER_NAME="myAKSCluster$RANDOM_SUFFIX" + az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $CLUSTER_NAME \ + --node-count 1 \ + --node-vm-size Standard_DS2_v2 \ + --vm-set-type VirtualMachineScaleSets \ + --load-balancer-sku standard \ + --enable-cluster-autoscaler \ + --min-count 1 \ + --max-count 3 \ + --generate-ssh-keys + ``` + + Results: + + + + ```output + { + "aadProfile": null, + "addonProfiles": {}, + "agentPoolProfiles": [ + { + "count": 1, + "enableAutoScaling": true, + "maxCount": 3, + "minCount": 1, + "name": "nodepool1", + "orchestratorVersion": "x.xx.x", + "osType": "Linux", + "provisioningState": "Succeeded", + "vmSize": "Standard_DS2_v2", + "type": "VirtualMachineScaleSets" + } + ], + "dnsPrefix": "myaksclusterxxx-dns-xxxxxxxx", + "enableRBAC": true, + "fqdn": "myaksclusterxxx-dns-xxxxxxxx.eastus2.cloudapp.azure.com", + ... + "provisioningState": "Succeeded", + ... + } + ``` + +### [Terraform](#tab/terraform) + +The following code creates a resource group and a Kubernetes cluster in Azure, with auto-scaling enabled and specific network settings, using Terraform. + +> [!NOTE] +> The sample code for this article is located in the [Azure Terraform GitHub repo](https://github.com/Azure/terraform/tree/master/quickstart/101-aks-standard-lb-and-vmss). You can view the log file containing the [test results from current and previous versions of Terraform](https://github.com/Azure/terraform/tree/master/quickstart/101-aks-standard-lb-and-vmss/TestRecord.md). +> +> See more [articles and sample code showing how to use Terraform to manage Azure resources](/azure/terraform) + +1. Create a directory in which to test and run the sample Terraform code, and make it the current directory. + +1. Create a file named `main.tf`, and insert the following code: + :::code language="Terraform" source="~/terraform_samples/quickstart/101-aks-standard-lb-and-vmss/main.tf"::: + +1. Create a file named `outputs.tf`, and insert the following code: + :::code language="Terraform" source="~/terraform_samples/quickstart/101-aks-standard-lb-and-vmss/outputs.tf"::: + +1. Create a file named `providers.tf`, and insert the following code: + :::code language="Terraform" source="~/terraform_samples/quickstart/101-aks-standard-lb-and-vmss/providers.tf"::: + +1. Create a file named `variables.tf`, and insert the following code: + :::code language="Terraform" source="~/terraform_samples/quickstart/101-aks-standard-lb-and-vmss/variables.tf"::: + +1. Initialize Terraform. + + [!INCLUDE [terraform-init.md](~/azure-dev-docs-pr/articles/terraform/includes/terraform-init.md)] + +1. Create a Terraform execution plan. + + [!INCLUDE [terraform-plan.md](~/azure-dev-docs-pr/articles/terraform/includes/terraform-plan.md)] + +1. Apply a Terraform execution plan. + + [!INCLUDE [terraform-apply-plan.md](~/azure-dev-docs-pr/articles/terraform/includes/terraform-apply-plan.md)] + +--- + +## Existing attached Azure Services + +When migrating clusters, you may have attached external Azure services. While the following services don't require resource recreation, they require updating connections from previous to new clusters to maintain functionality: + +* Azure Container Registry +* Azure Log Analytics +* Azure Application Insights +* Azure Traffic Manager +* Azure Storage account +* External databases + +## Ensure valid quotas + +Since other VMs are deployed into your subscription during migration, you should verify your quotas and limits are sufficient for these resources. 
If necessary, request an increase in [vCPU quota](/azure/azure-portal/supportability/per-vm-quota-requests). + +You may need to request an increase for [network quotas](/azure/azure-portal/supportability/networking-quota-requests) to ensure you don't exhaust IPs. For more information, see [networking and IP ranges for AKS](./configure-kubenet.md). + +For more information, see [Azure subscription and service limits](/azure/azure-resource-manager/management/azure-subscription-service-limits). To check your current quotas, in the Azure portal, go to the [subscriptions blade](https://portal.azure.com/#blade/Microsoft_Azure_Billing/SubscriptionsBlade), select your subscription, and then select **Usage + quotas**. + +## High availability and business continuity + +If your application can't handle downtime, you need to follow best practices for high availability migration scenarios. Read more about [Best practices for complex business continuity planning, disaster recovery, and maximizing uptime in Azure Kubernetes Service (AKS)](./operator-best-practices-multi-region.md). + +For complex applications, you typically migrate over time rather than all at once, meaning the old and new environments might need to communicate over the network. Applications previously using `ClusterIP` services to communicate might need to be exposed as type `LoadBalancer` and secured appropriately. + +To complete the migration, you want to point clients to the new services that run on AKS. We recommend you redirect traffic by updating DNS to point to the load balancer sitting in front of your AKS cluster. + +[Azure Traffic Manager](/azure/traffic-manager/) can direct customers to the desired Kubernetes cluster and application instance. Traffic Manager is a DNS-based traffic load balancer that can distribute network traffic across regions. For the best performance and redundancy, direct all application traffic through Traffic Manager before it goes to your AKS cluster. + +In a multi-cluster deployment, customers should connect to a Traffic Manager DNS name that points to the services on each AKS cluster. Define these services by using Traffic Manager endpoints. Each endpoint is the *service load balancer IP*. Use this configuration to direct network traffic from the Traffic Manager endpoint in one region to the endpoint in a different region. + +![AKS with Traffic Manager](media/operator-best-practices-bc-dr/aks-azure-traffic-manager.png) + +[Azure Front Door](/azure/frontdoor/front-door-overview) is another option for routing traffic for AKS clusters. With Azure Front Door, you can define, manage, and monitor the global routing for your web traffic by optimizing for best performance and instant global failover for high availability. + +### Considerations for stateless applications + +Stateless application migration involves the following steps: + +1. Apply your resource definitions (YAML or Helm) to the new cluster. +2. Ensure everything works as expected. +3. Redirect traffic to activate your new cluster. + +### Considerations for stateful applications + +Carefully plan your migration of stateful applications to avoid data loss or unexpected downtime. + +* If you use Azure Files, you can mount the file share as a volume into the new cluster. See [Mount Static Azure Files as a Volume](./azure-csi-files-storage-provision.md#mount-file-share-as-a-persistent-volume). +* If you use Azure Managed Disks, you can only mount the disk if unattached to any VM. 
See [Mount Static Azure Disk as a Volume](./azure-csi-disk-storage-provision.md#mount-disk-as-a-volume). +* If neither of those approaches work, you can use a backup and restore options. See [Velero on Azure](https://github.com/vmware-tanzu/velero-plugin-for-microsoft-azure/blob/master/README.md). + +#### Azure Files + +Unlike disks, Azure Files can be mounted to multiple hosts concurrently. In your AKS cluster, Azure and Kubernetes don't prevent you from creating a pod that your AKS cluster still uses. To prevent data loss and unexpected behavior, ensure the clusters don't simultaneously write to the same files. + +If your application can host multiple replicas that point to the same file share, follow the stateless migration steps and deploy your YAML definitions to your new cluster. + +If not, a possible migration approach involves the following steps: + +1. Validate your application is working correctly. +2. Point your live traffic to your new AKS cluster. +3. Disconnect the old cluster. + +If you want to start with an empty share and make a copy of the source data, you can use the [`az storage file copy`](/cli/azure/storage/file/copy) command to migrate your data. + +#### Migrating persistent volumes + +If you're migrating existing persistent volumes to AKS, you generally follow these steps: + +1. Quiesce writes to the application. + * This step is optional and requires downtime. +1. Take snapshots of the disks. +1. Create new managed disks from the snapshots. +1. Create persistent volumes in AKS. +1. Update pod specifications to [use existing volumes](./azure-disk-csi.md) rather than PersistentVolumeClaims (static provisioning). +1. Deploy your application to AKS. +1. Validate your application is working correctly. +1. Point your live traffic to your new AKS cluster. + +> [!IMPORTANT] +> If you choose not to quiesce writes, you need to replicate data to the new deployment. Otherwise, you miss the data that was written after you took the disk snapshots. + +The following open-source tools can help you create managed disks and migrate volumes between Kubernetes clusters: + +* [Azure CLI Disk Copy extension](https://github.com/noelbundick/azure-cli-disk-copy-extension) copies and converts disks across resource groups and Azure regions. +* [Azure Kube CLI extension](https://github.com/yaron2/azure-kube-cli) enumerates ACS Kubernetes volumes and migrates them to an AKS cluster. + +### Deployment of your cluster configuration + +We recommend you use your existing continuous integration and continuous delivery pipeline to deploy a known-good configuration to AKS. You can use Azure Pipelines to [build and deploy your applications to AKS](/azure/devops/pipelines/ecosystems/kubernetes/aks-template). Clone your existing deployment tasks and ensure `kubeconfig` points to the new AKS cluster. + +If that's not possible, export resource definitions from your existing Kubernetes cluster, and then apply them to AKS. You can use `kubectl` to export objects. For example: + +```console +kubectl get deployment -o yaml > deployments.yaml +``` + +Be sure to examine the output and remove any unnecessary live data fields. + +### Moving existing resources to another region + +You might want to move your AKS cluster to a [different region supported by AKS][region-availability]. We recommend you create a new cluster in the other region and then deploy your resources and applications to your new cluster. 
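As a rough sketch (the resource types, output file name, and cluster context here are only examples), you can export the objects you plan to move in the same way as the export example above and apply them to the cluster in the new region:

```console
kubectl get deployment,service,configmap --all-namespaces -o yaml > cluster-resources.yaml
kubectl --context <new-region-cluster-context> apply -f cluster-resources.yaml
```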
+ +If you have any services running on your AKS cluster, you need to install and configure those services on your cluster in the new region. + +In this article, we summarized migration details for: + +> [!div class="checklist"] +> +> * Containerizing applications through Azure Migrate +> * AKS with Load Balancer (Standard) and Virtual Machine Scale Sets +> * Existing attached Azure services +> * Ensuring valid quotas +> * High availability and business continuity +> * Considerations for stateless applications +> * Considerations for stateful applications +> * Deploying your cluster configuration + + +[region-availability]: https://azure.microsoft.com/global-infrastructure/services/?products=kubernetes-service +[az-group-create]: /cli/azure/group#az_group_create +[az-aks-create]: /cli/azure/aks#az_aks_create \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/concepts-network-azure-cni-pod-subnet.md b/scenarios/azure-aks-docs/articles/aks/concepts-network-azure-cni-pod-subnet.md new file mode 100644 index 000000000..ce27025a1 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/concepts-network-azure-cni-pod-subnet.md @@ -0,0 +1,133 @@ +--- +title: Concepts - Azure CNI Pod Subnet networking in AKS +description: Learn about Azure CNI Pod Subnet, dynamic IP allocation mode, and static block allocation mode in Azure Kubernetes Service (AKS). +ms.topic: concept-article +ms.date: 05/21/2024 +author: schaffererin +ms.author: schaffererin +ms.custom: references_regions, innovation-engine +--- + +# Azure Container Networking Interface (CNI) Pod Subnet + +Azure CNI Pod Subnet assigns IP addresses to pods from a separate subnet from your cluster Nodes. This feature is available in two modes: Dynamic IP Allocation and Static Block Allocation (Preview). + +## Prerequisites + +> [!NOTE] +> When using static block allocation of CIDRs, exposing an application as a Private Link Service using a Kubernetes Load Balancer Service isn't supported. + +- Review the [prerequisites][azure-cni-prereq] for configuring basic Azure CNI networking in AKS, as the same prerequisites apply to this article. +- Review the [deployment parameters][azure-cni-deployment-parameters] for configuring basic Azure CNI networking in AKS, as the same parameters apply. +- AKS Engine and DIY clusters aren't supported. +- Azure CLI version `2.37.0` or later and the `aks-preview` extension version `2.0.0b2` or later. +- Register the subscription-level feature flag for your subscription: 'Microsoft.ContainerService/AzureVnetScalePreview'. + +## Enable Container Insights (AKS monitoring) + +If you have an existing cluster, you can enable Container Insights (AKS monitoring) using the following command **only if your cluster was created with monitoring enabled or is associated with a valid Log Analytics Workspace in the same region**. Otherwise, refer to Microsoft Docs for additional workspace setup requirements. + +```azurecli-interactive +az aks enable-addons --addons monitoring --name $CLUSTER_NAME --resource-group $RESOURCE_GROUP_NAME +``` + +Results: + + + +```output +{ + "addons": [ + { + "addonType": "Monitoring", + "enabled": true, + "identity": { + "clientId": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "objectId": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "resourceId": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/xxxxxxxx/providers/Microsoft.ManagedIdentity/userAssignedIdentities/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + }, + "name": "omsagent", + "config": { + ... + } + }, + ... 
+ ], + "name": "my-aks-cluster", + "resourceGroup": "my-aks-rg", + ... +} +``` + +## Dynamic IP allocation mode + +Dynamic IP allocation helps mitigate pod IP address exhaustion issues by allocating pod IPs from a subnet that's separate from the subnet hosting the AKS cluster. + +The dynamic IP allocation mode offers the following benefits: + +- **Better IP utilization**: IPs are dynamically allocated to cluster Pods from the Pod subnet. This leads to better utilization of IPs in the cluster compared to the traditional CNI solution, which does static allocation of IPs for every node. +- **Scalable and flexible**: Node and pod subnets can be scaled independently. A single pod subnet can be shared across multiple node pools of a cluster or across multiple AKS clusters deployed in the same VNet. You can also configure a separate pod subnet for a node pool. +- **High performance**: Since pods are assigned VNet IPs, they have direct connectivity to other cluster pods and resources in the VNet. The solution supports very large clusters without any degradation in performance. +- **Separate VNet policies for pods**: Since pods have a separate subnet, you can configure separate VNet policies for them that are different from node policies. This enables many useful scenarios, such as allowing internet connectivity only for pods and not for nodes, fixing the source IP for pod in a node pool using an Azure NAT Gateway, and using network security groups (NSGs) to filter traffic between node pools. +- **Kubernetes network policies**: Both the Azure Network Policies and Calico work with this mode. + +### Plan IP addressing + +With dynamic IP allocation, nodes and pods scale independently, so you can plan their address spaces separately. Since pod subnets can be configured to the granularity of a node pool, you can always add a new subnet when you add a node pool. The system pods in a cluster/node pool also receive IPs from the pod subnet, so this behavior needs to be accounted for. + +IPs are allocated to nodes in batches of 16. Pod subnet IP allocation should be planned with a minimum of 16 IPs per node in the cluster, as the nodes request 16 IPs on startup and request another batch of 16 anytime there are <8 IPs unallocated in their allotment. + +IP address planning for Kubernetes services and Docker Bridge remain unchanged. + +## Static block allocation mode (Preview) + +Static block allocation helps mitigate potential pod subnet sizing and Azure address mapping limitations by assigning CIDR blocks to nodes rather than individual IPs. + +The static block allocation mode offers the following benefits: + +- **Better IP scalability**: CIDR blocks are statically allocated to the cluster nodes and are present for the lifetime of the node, as opposed to the traditional dynamic allocation of individual IPs with traditional CNI. This enables routing based on CIDR blocks and helps scale the cluster limit up to 1 million pods from the traditional 65K pods per cluster. Your Azure Virtual Network must be large enough to accommodate the scale of your cluster. +- **Flexibility**: Node and pod subnets can be scaled independently. A single pod subnet can be shared across multiple node pools of a cluster or across multiple AKS clusters deployed in the same VNet. You can also configure a separate pod subnet for a node pool. +- **High performance**: Since pods are assigned virtual network IPs, they have direct connectivity to other cluster pods and resources in the VNet. 
- **Separate VNet policies for pods**: Since pods have a separate subnet, you can configure separate VNet policies for them that are different from node policies. This enables many useful scenarios, such as allowing internet connectivity only for pods and not for nodes, fixing the source IP for pods in a node pool using an Azure NAT Gateway, and using NSGs to filter traffic between node pools.
- **Kubernetes network policies**: Cilium, Azure NPM, and Calico work with this solution.

### Limitations

Below are some of the limitations of using Azure CNI Static Block allocation:
- The minimum Kubernetes version required is 1.28.
- The maximum subnet size supported is x.x.x.x/12 (~1 million IPs).
- Only a single mode of operation can be used per subnet. If a subnet uses static block allocation mode, the same subnet can't be used with dynamic IP allocation mode in a different cluster or node pool, and vice versa.
- Only supported in new clusters or when adding node pools with a different subnet to existing clusters. Migrating or updating existing clusters or node pools isn't supported.
- Across all the CIDR blocks assigned to a node in the node pool, one IP is selected as the primary IP of the node. When selecting the `--max-pods` value, use the calculation below to best serve your needs and make optimal use of the IPs in the subnet:

`max_pods = (N * 16) - 1` where `N` is any positive integer and `N` > 0

### Plan IP addressing

With static block allocation, nodes and pods scale independently, so you can plan their address spaces separately. Since pod subnets can be configured to the granularity of a node pool, you can always add a new subnet when you add a node pool. The system pods in a cluster/node pool also receive IPs from the pod subnet, so this behavior needs to be accounted for.

CIDR blocks of /28 (16 IPs) are allocated to nodes based on your `--max-pods` configuration for your node pool, which defines the maximum number of pods per node. One IP is reserved on each node from all the available IPs on that node for internal purposes.

While planning your IPs, it's important to define your `--max-pods` configuration using the following calculation: `max_pods_per_node = (16 * N) - 1`, where `N` is any positive integer greater than `0`.

Ideal values with no IP wastage would require the max pods value to conform to the above expression.

See the following example cases:

| Example case | `max_pods` | CIDR Blocks allocated per node | Total IP available for pods | IP wastage for node |
| --- | --- | --- | --- | --- |
| Low wastage (acceptable) | 30 | 2 | (16 * 2) - 1 = 32 - 1 = 31 | 31 - 30 = 1 |
| Ideal case | 31 | 2 | (16 * 2) - 1 = 32 - 1 = 31 | 31 - 31 = 0 |
| High wastage (not recommended) | 32 | 3 | (16 * 3) - 1 = 48 - 1 = 47 | 47 - 32 = 15 |

IP address planning for Kubernetes services remains unchanged.

> [!NOTE]
> Ensure your VNet has a sufficiently large and contiguous address space to support your cluster's scale.
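For example, a node pool that uses static block allocation with the ideal `--max-pods` value of 31 from the table above might be added as follows. This is a sketch only; the `--pod-ip-allocation-mode` parameter comes from the `aks-preview` extension, and the subnet IDs are placeholders:

```azurecli-interactive
# Sketch: add a node pool that draws pod IPs from a dedicated pod subnet in /28 blocks.
az aks nodepool add \
  --resource-group $RESOURCE_GROUP_NAME \
  --cluster-name $CLUSTER_NAME \
  --name blockpool \
  --max-pods 31 \
  --vnet-subnet-id "<node-subnet-resource-id>" \
  --pod-subnet-id "<pod-subnet-resource-id>" \
  --pod-ip-allocation-mode StaticBlock
```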
+ + + + +[azure-cni-prereq]: ./configure-azure-cni.md#prerequisites +[azure-cni-deployment-parameters]: ./azure-cni-overview.md#deployment-parameters +[az-aks-enable-addons]: /cli/azure/aks#az_aks_enable_addons \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/concepts-preview-api-life-cycle.md b/scenarios/azure-aks-docs/articles/aks/concepts-preview-api-life-cycle.md new file mode 100644 index 000000000..e3dc80cf1 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/concepts-preview-api-life-cycle.md @@ -0,0 +1,80 @@ +--- +title: AKS Preview API life cycle +description: Learn about the AKS preview API life cycle. +ms.custom: azure-kubernetes-service,innovation-engine +ms.topic: concept-article +ms.date: 06/06/2024 +author: matthchr +ms.author: matthchr + +--- + +# AKS Preview API life cycle + +The Azure Kubernetes Service (AKS) preview APIs (APIs that end in `-preview`) have a lifespan of ~one year from their release date. +This means that you can expect the 2023-01-02-preview API to be deprecated somewhere around January 1st, 2024. + +We love when people try our preview features and give us feedback, so we encourage you to use the preview APIs and the +tools built on them (such as the [AKS Preview CLI Extension](https://github.com/Azure/azure-cli-extensions/tree/main/src/aks-preview)). + +After an API version is deprecated, it will no longer function! We recommend you routinely: +- Update your ARM/BICEP templates using preview API versions to use the latest version of the preview API. +- Update your AKS preview CLI extension to the latest version. +- Update any preview SDKs or other tools built on the preview API to the latest version. + +You should perform these updates at a minimum every 6-9 months. If you fail to do so, you will be notified that you are using a soon-to-be deprecated +API version as deprecation approaches. + +## How to check what API versions you're using + +If you're unsure what client or tool is using this API version, check the [activity logs](/azure/azure-monitor/essentials/activity-log) +using the following command: + +Set the API version you want to inspect for recent usage in the activity log. In this example, we are checking for the `2022-04-01-preview` API version. + +```bash +export API_VERSION="2022-04-01-preview" +az monitor activity-log list --offset 30d --max-events 10000 --namespace microsoft.containerservice --query "[?eventName.value == 'EndRequest' && contains(not_null(httpRequest.uri,''), '$API_VERSION')]" +``` + +## How to update to a newer version of the API + +- For Azure SDKs: use a newer API version by updating to a [newer version of the SDK](https://azure.github.io/azure-sdk/releases/latest/index.html?search=containerservice). +- For Azure CLI: Update the CLI itself and the aks-preview extension (if used) to the latest version by running `az upgrade` and `az extension update --name "aks-preview"`. +- For Terraform: Update to the latest version of the AzureRM Terraform module. To find out what version of the API a particular Terraform release is using, + check the [Terraform release notes](/azure/developer/terraform/provider-version-history-azurerm) or + git log [this file](https://github.com/hashicorp/terraform-provider-azurerm/blob/main/internal/services/containers/client/client.go). +- For other tools: Update the tool to the latest version. 
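For the Azure CLI path listed above, the refresh amounts to two commands:

```bash
# Update the Azure CLI itself, then pull the latest aks-preview extension
az upgrade
az extension update --name aks-preview
```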
+ + +## Upcoming deprecations + +| API version | Announce Date | Deprecation Date | +|--------------------|-------------------|-------------------| +| 2022-09-02-preview | March 27, 2024 | June 20, 2024 | +| 2022-10-02-preview | March 27, 2024 | June 20, 2024 | +| 2023-01-02-preview | March 27, 2024 | June 20, 2024 | +| 2023-02-02-preview | March 27, 2024 | June 20, 2024 | +| 2023-03-02-preview | Oct 21, 2024 | February 3, 2025 | +| 2023-04-02-preview | Oct 21, 2024 | February 10, 2025 | +| 2023-05-02-preview | Oct 21, 2024 | February 17, 2025 | +| 2023-06-02-preview | Oct 21, 2024 | February 24, 2025 | +| 2023-07-02-preview | Oct 21, 2024 | March 3, 2025 | +| 2023-08-02-preview | Oct 21, 2024 | March 10, 2025 | + +## Completed deprecations + +| API version | Announce Date | Deprecation Date | +|--------------------|-------------------|-------------------| +| 2018-08-01-preview | March 7, 2023 | June 1, 2023 | +| 2021-11-01-preview | March 23, 2023 | July 1, 2023 | +| 2022-02-02-preview | April 27, 2023 | August 1, 2023 | +| 2022-01-02-preview | May 3, 2023 | Sept 1, 2023 | +| 2022-03-02-preview | May 3, 2023 | Sept 1, 2023 | +| 2022-04-02-preview | May 3, 2023 | Sept 1, 2023 | +| 2022-05-02-preview | May 3, 2023 | Sept 1, 2023 | +| 2022-06-02-preview | May 3, 2023 | Sept 1, 2023 | +| 2022-07-02-preview | November 20, 2023 | February 14, 2024 | +| 2022-08-02-preview | March 27, 2024 | June 20, 2024 | +| 2022-08-03-preview | March 27, 2024 | June 20, 2024 | +| 2022-11-02-preview | March 27, 2024 | June 20, 2024 | diff --git a/scenarios/azure-aks-docs/articles/aks/delete-cluster.md b/scenarios/azure-aks-docs/articles/aks/delete-cluster.md new file mode 100644 index 000000000..559bf7a6d --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/delete-cluster.md @@ -0,0 +1,74 @@ +--- +title: Delete an Azure Kubernetes Service (AKS) cluster +description: Learn about deleting a cluster in Azure Kubernetes Service (AKS). +ms.topic: overview +ms.author: schaffererin +author: schaffererin +ms.date: 04/16/2024 +ms.custom: innovation-engine +--- + +# Delete an Azure Kubernetes Service (AKS) cluster + +This article outlines cluster deletion in Azure Kubernetes Service (AKS), including what happens when you delete a cluster, alternatives to deleting a cluster, and how to delete a cluster. + +## What happens when you delete a cluster? + +When you delete a cluster, the following resources are deleted: + +* The [node resource group][node-resource-group] and its resources, including: + * The virtual machine scale sets and virtual machines (VMs) for each node in the cluster + * The virtual network and its subnets for the cluster + * The storage for the cluster +* The control plane and its resources +* Any node instances in the cluster along with any pods running on those nodes + +## Alternatives to deleting a cluster + +Before you delete a cluster, consider **stopping the cluster**. Stopping an AKS cluster stops the control plane and agent nodes, allowing you to save on compute costs while maintaining all objects except standalone pods. When you stop a cluster, its state is saved and you can restart the cluster at any time. For more information, see [Stop an AKS cluster][stop-cluster]. + +If you want to delete a cluster to change its configuration, you can instead use the [AKS cluster upgrade][upgrade-cluster] feature to upgrade the cluster to a different Kubernetes version or change the node pool configuration. For more information, see [Upgrade an AKS cluster][upgrade-cluster]. 
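For example, stopping and later restarting a cluster looks like this (using the same example names as the deletion command below):

```azurecli-interactive
# Stop the cluster; its state is saved and the control plane and agent nodes stop accruing compute costs.
az aks stop --name myAKSCluster --resource-group myResourceGroup

# Restart the cluster later; all objects are restored except standalone pods.
az aks start --name myAKSCluster --resource-group myResourceGroup
```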
+ +## Delete a cluster + +> [!IMPORTANT] +> **You can't recover a cluster after it's deleted**. If you need to recover a cluster, you need to create a new cluster and redeploy your applications. + +### [Azure CLI](#tab/azure-cli) + +Delete a cluster using the [`az aks delete`][az-aks-delete] command. The following example deletes the `myAKSCluster` cluster in the `myResourceGroup` resource group. + +Declare environment variables with generic names and a random suffix to ensure uniqueness. This avoids conflicts and enables the commands to be re-used in different sessions or by different users. + +```azurecli-interactive +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export AKS_CLUSTER_NAME="myAKSCluster$RANDOM_SUFFIX" +export RESOURCE_GROUP_NAME="myResourceGroup$RANDOM_SUFFIX" +az aks delete --name $AKS_CLUSTER_NAME --resource-group $RESOURCE_GROUP_NAME --yes --no-wait +``` + +### [Azure PowerShell](#tab/azure-powershell) + +Delete a cluster using the [`Remove-AzAks`][remove-azaks] command. The following example deletes the `myAKSCluster` cluster in the `myResourceGroup` resource group: + +```azurepowershell-interactive +Remove-AzAksCluster -Name myAKSCluster -ResourceGroupName myResourceGroup +``` + +### [Azure portal](#tab/azure-portal) + +You can delete a cluster using the Azure portal. To delete a cluster, navigate to the **Overview** page for the cluster and select **Delete**. You can also delete a cluster from the **Resource group** page by selecting the cluster and then selecting **Delete**. + +--- + +## Next steps + +For more information about AKS, see [Core Kubernetes concepts for AKS][core-concepts]. + + +[node-resource-group]: ./concepts-clusters-workloads.md#node-resource-group +[stop-cluster]: ./start-stop-cluster.md +[upgrade-cluster]: ./upgrade-cluster.md +[az-aks-delete]: /cli/azure/aks#az_aks_delete +[remove-azaks]: /powershell/module/az.aks/remove-azakscluster +[core-concepts]: ./concepts-clusters-workloads.md \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/enable-host-encryption.md b/scenarios/azure-aks-docs/articles/aks/enable-host-encryption.md new file mode 100644 index 000000000..6d9d4b8e2 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/enable-host-encryption.md @@ -0,0 +1,103 @@ +--- +title: Enable host-based encryption on Azure Kubernetes Service (AKS) +description: Learn how to configure a host-based encryption in an Azure Kubernetes Service (AKS) cluster. +ms.topic: how-to +ms.subservice: aks-security +ms.date: 07/17/2023 +author: nickomang +ms.author: nickoman + +ms.custom: devx-track-azurecli +ms.devlang: azurecli +--- + +# Host-based encryption on Azure Kubernetes Service (AKS) + +With host-based encryption, the data stored on the VM host of your AKS agent nodes' VMs is encrypted at rest and flows encrypted to the Storage service. This means the temp disks are encrypted at rest with platform-managed keys. The cache of OS and data disks is encrypted at rest with either platform-managed keys or customer-managed keys depending on the encryption type set on those disks. + +By default, when using AKS, OS and data disks use server-side encryption with platform-managed keys. The caches for these disks are encrypted at rest with platform-managed keys. You can specify your own managed keys following [Bring your own keys (BYOK) with Azure disks in Azure Kubernetes Service](azure-disk-customer-managed-keys.md). The caches for these disks are also encrypted using the key you specify. 
+ +Host-based encryption is different than server-side encryption (SSE), which is used by Azure Storage. Azure-managed disks use Azure Storage to automatically encrypt data at rest when saving data. Host-based encryption uses the host of the VM to handle encryption before the data flows through Azure Storage. + +## Before you begin + +Before you begin, review the following prerequisites and limitations. + +### Prerequisites + +- Ensure you have the CLI extension v2.23 or higher installed. + +### Limitations + +- This feature can only be set at cluster or node pool creation time. +- This feature can only be enabled in [Azure regions][supported-regions] that support server-side encryption of Azure managed disks and only with specific [supported VM sizes][supported-sizes]. +- This feature requires an AKS cluster and node pool based on Virtual Machine Scale Sets as *VM set type*. + +## Enable Encryption at Host for your AKS cluster + +Before adding a node pool with host-based encryption, ensure the EncryptionAtHost feature is enabled for your subscription: + +```azurecli +# Register the EncryptionAtHost feature +az feature register --namespace Microsoft.Compute --name EncryptionAtHost + +# Wait for registration to complete (this may take several minutes) +az feature show --namespace Microsoft.Compute --name EncryptionAtHost --query "properties.state" + +# Refresh the provider registration +az provider register --namespace Microsoft.Compute +``` + +## Use host-based encryption on new clusters + +- Create a new cluster and configure the cluster agent nodes to use host-based encryption using the [`az aks create`][az-aks-create] command with the `--enable-encryption-at-host` flag. + + ```shell + az aks create \ + --name myAKSCluster \ + --resource-group myResourceGroup \ + --node-vm-size Standard_DS2_v2 \ + --location westus2 \ + --enable-encryption-at-host \ + --generate-ssh-keys + ``` + +## Use host-based encryption on existing clusters + +- Enable host-based encryption on an existing cluster by adding a new node pool using the [`az aks nodepool add`][az-aks-nodepool-add] command with the `--enable-encryption-at-host` flag. + + ```azurecli + az aks nodepool add --name hostencrypt --cluster-name $MY_AKS_CLUSTER --resource-group $MY_RESOURCE_GROUP -s Standard_DS2_v2 --enable-encryption-at-host + ``` + + Results: + + + + ```output + { + "agentPoolProfile": { + "enableEncryptionAtHost": true, + "name": "hostencrypt", + "nodeCount": 1, + "osDiskSizeGB": 30, + "vmSize": "Standard_DS2_v2" + }, + ... + } + ``` + +## Next steps + +- Review [best practices for AKS cluster security][best-practices-security]. +- Read more about [host-based encryption](/azure/virtual-machines/disk-encryption#encryption-at-host---end-to-end-encryption-for-your-vm-data). 
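If you want to confirm the setting afterwards, one way (reusing the variables from the node pool example above) is to query the node pool properties:

```azurecli
az aks nodepool show \
  --resource-group $MY_RESOURCE_GROUP \
  --cluster-name $MY_AKS_CLUSTER \
  --name hostencrypt \
  --query "enableEncryptionAtHost"
```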
+ + + +[best-practices-security]: ./operator-best-practices-cluster-security.md +[supported-regions]: /azure/virtual-machines/disk-encryption#supported-regions +[supported-sizes]: /azure/virtual-machines/disk-encryption#supported-vm-sizes +[control-keys]: ../key-vault/general/best-practices.md#control-access-to-your-vault +[akv-built-in-roles]: ../key-vault/general/rbac-guide.md#azure-built-in-roles-for-key-vault-data-plane-operations +[az-aks-create]: /cli/azure/aks#az-aks-create +[az-aks-nodepool-add]: /cli/azure/aks/nodepool#az-aks-nodepool-add \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/events.md b/scenarios/azure-aks-docs/articles/aks/events.md new file mode 100644 index 000000000..38c81e4d4 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/events.md @@ -0,0 +1,131 @@ +--- +title: Use Kubernetes events for troubleshooting +description: Learn about Kubernetes events, which provide details on pods, nodes, and other Kubernetes objects. +ms.topic: how-to +ms.author: nickoman +author: nickomang +ms.subservice: aks-monitoring +ms.date: 06/13/2024 +ms.custom: innovation-engine +--- + +# Use Kubernetes events for troubleshooting in Azure Kubernetes Service (AKS) + +This article shows you how to use Kubernetes events to monitor and troubleshoot issues in your Azure Kubernetes Service (AKS) clusters. + +## What are Kubernetes events? + +Events are one of the most prominent sources for monitoring and troubleshooting issues in Kubernetes. They capture and record information about the lifecycle of various Kubernetes objects, such as pods, nodes, services, and deployments. By monitoring events, you can gain visibility into your cluster's activities, identify issues, and troubleshoot problems effectively. + +Kubernetes events don't persist throughout your cluster lifecycle, as there's no retention mechanism. Events are **only available for *one hour* after the event is generated**. To store events for a longer time period, enable [Container insights][container-insights]. + +## Kubernetes event objects + +The following table lists some key Kubernetes event objects: + +|Field name|Description| +|----------|------------| +|type |The type is based on the severity of the event:
**Warning** events signal potentially problematic situations, such as a pod repeatedly failing or a node running out of resources. They require attention, but might not result in immediate failure.
**Normal** events represent routine operations, such as a pod being scheduled or a deployment scaling up. They usually indicate healthy cluster behavior.| +|reason|The reason why the event was generated. For example, *FailedScheduling* or *CrashLoopBackoff*.| +|message|A human-readable message that describes the event.| +|namespace|The namespace of the Kubernetes object that the event is associated with.| +|firstSeen|Timestamp when the event was first observed.| +|lastSeen|Timestamp of when the event was last observed.| +|reportingController|The name of the controller that reported the event. For example, `kubernetes.io/kubelet`.| +|object|The name of the Kubernetes object that the event is associated with.| + +For more information, see the official [Kubernetes documentation][k8s-events]. + +## View Kubernetes events + +### [Azure CLI](#tab/azure-cli) + +List all events in your cluster using the `kubectl get events` command. + +Assuming your cluster is already created and available (per doc prerequisites), get credentials (note the `--overwrite-existing` flag is set to avoid kubeconfig errors): + +```bash +az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --overwrite-existing +``` + +Now list all events in your cluster: + +```bash +kubectl get events +``` + +Results: + + + +```output +LAST SEEN TYPE REASON OBJECT MESSAGE +xxm Normal Scheduled pod/my-pod-xxxxx Successfully assigned default/my-pod-xxxxx to aks-nodepoolxx-xxxxxxx-vmss000000 +xxm Normal Pulled pod/my-pod-xxxxx Container image "nginx" already present on machine +xxm Normal Created pod/my-pod-xxxxx Created container nginx +xxm Normal Started pod/my-pod-xxxxx Started container nginx +... +``` + +Look at a specific pod's events by first finding the name of the pod and then using the `kubectl describe pod` command. + +List the pods in the current namespace: + +```bash +kubectl get pods +``` + +Results: + + + +```output +NAME READY STATUS RESTARTS AGE +my-pod-xxxxx 1/1 Running 0 xxm +nginx-deployment-xxxxx 1/1 Running 0 xxm +... +``` + +Replace `` below with your actual pod name. For automation, here's an example for the first pod in the list: + +```shell +POD_NAME=$(kubectl get pods -o jsonpath="{.items[0].metadata.name}") +kubectl describe pod $POD_NAME +``` + +### [Azure portal](#tab/azure-portal) + +1. Open the Azure portal and navigate to your AKS cluster resource. +1. From the service menu, under **Kubernetes resources**, select **Events**. +1. The **Events** page displays a list of events in your cluster. You can filter events by type, reason, source, object, or namespace. You can combine filters to narrow down the results. + +--- + +## Best practices for troubleshooting with events + +### Filtering events for relevance + +You might have various namespaces and services running in your AKS cluster. Filtering events based on object type, namespace, or reason can help narrow down the results to the most relevant information. + +For example, you can use the following command to filter events within the default namespace: + +```bash +kubectl get events --namespace default +``` + +### Automating event notifications + +To ensure timely response to critical events in your AKS cluster, set up automated notifications. Azure offers integration with monitoring and alerting services like [Azure Monitor][aks-azure-monitor]. You can configure alerts to trigger based on specific event patterns. This way, you're immediately informed about crucial issues that require attention. 
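Before wiring up alerts, it can help to check interactively what you would be alerting on. For example, list only *Warning* events across all namespaces:

```bash
kubectl get events --field-selector type=Warning --all-namespaces
```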
+ +### Regularly reviewing events + +Make a habit of regularly reviewing events in your AKS cluster. This proactive approach can help you identify trends, catch potential problems early, and prevent escalations. By staying on top of events, you can maintain the stability and performance of your applications. + +## Next steps + +Now that you understand Kubernetes events, you can continue your monitoring and observability journey by [enabling Container insights][container-insights]. + + +[aks-azure-monitor]: ./monitor-aks.md +[container-insights]: /azure/azure-monitor/containers/container-insights-enable-aks +[k8s-events]: https://kubernetes.io/docs/reference/kubernetes-api/cluster-resources/event-v1/ diff --git a/scenarios/azure-aks-docs/articles/aks/free-standard-pricing-tiers.md b/scenarios/azure-aks-docs/articles/aks/free-standard-pricing-tiers.md new file mode 100644 index 000000000..1977af5d8 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/free-standard-pricing-tiers.md @@ -0,0 +1,296 @@ +--- +title: Azure Kubernetes Service (AKS) Free, Standard, and Premium pricing tiers for cluster management +description: Learn about the Azure Kubernetes Service (AKS) Free, Standard, and Premium pricing plans and what features, deployment patterns, and recommendations to consider between each plan. +ms.topic: concept-article +ms.date: 06/07/2024 +author: schaffererin +ms.author: schaffererin +ms.custom: references_regions, devx-track-azurecli, innovation-engine +--- + +# Free, Standard, and Premium pricing tiers for Azure Kubernetes Service (AKS) cluster management + +Azure Kubernetes Service (AKS) offers three pricing tiers for cluster management: the **Free tier**, the **Standard tier**, and the **Premium tier**. All tiers are in the **Base** SKU. + +| |Free tier|Standard tier|Premium tier| +|------------------|---------|--------|--------| +|**When to use**|• You want to experiment with AKS at no extra cost
• You're new to AKS and Kubernetes|• You're running production or mission-critical workloads and need high availability and reliability
• You need a financially backed SLA
• Automatically selected for AKS automatic clusters (if you create an AKS Automatic Cluster)|• You're running production or mission-critical workloads and need high availability and reliability
• You need a financially backed SLA
• All mission critical, at scale, or production workloads requiring *two years* of one Kubernetes version support| +|**Supported cluster types**|• Development clusters or small scale testing environments
• Clusters with fewer than 10 nodes|• Enterprise-grade or production workloads
• Clusters with up to 5,000 nodes| • Enterprise-grade or production workloads
• Clusters with up to 5,000 nodes | +|**Pricing**|• Free cluster management
• Pay-as-you-go for resources you consume|• Pay-as-you-go for resources you consume
• [Standard tier Cluster Management Pricing](https://azure.microsoft.com/pricing/details/kubernetes-service/) | • Pay-as-you-go for resources you consume
• [Premium tier Cluster Management Pricing](https://azure.microsoft.com/pricing/details/kubernetes-service/) | +|**Feature comparison**|• Recommended for clusters with fewer than 10 nodes, but can support up to 1,000 nodes
• Includes all current AKS features|• Uptime SLA is enabled by default
• Greater cluster reliability and resources
• Can support up to 5,000 nodes in a cluster
• Includes all current AKS features | • Includes all current AKS features from standard tier
• [Microsoft maintenance past community support][long-term-support] | + +For more information on pricing, see the [AKS pricing details](https://azure.microsoft.com/pricing/details/kubernetes-service/). + +## Uptime SLA terms and conditions + +In the Standard tier and Premium tier, the Uptime SLA feature is enabled by default per cluster. The Uptime SLA feature guarantees 99.95% availability of the Kubernetes API server endpoint for clusters using [Availability Zones][availability-zones], and 99.9% of availability for clusters that aren't using Availability Zones. For more information, see [SLA](https://azure.microsoft.com/support/legal/sla/kubernetes-service/v1_1/). + +## Region availability + +* Free tier, Standard tier, and Premium tier are available in public regions and Azure Government regions where [AKS is supported](https://azure.microsoft.com/global-infrastructure/services/?products=kubernetes-service). +* Free tier, Standard tier, and Premium tier are available for [private AKS clusters][private-clusters] in all public regions where AKS is supported. + +## Before you begin + +You need [Azure CLI](/cli/azure/install-azure-cli) version 2.47.0 or later. Run `az --version` to find your current version. If you need to install or upgrade, see [Install Azure CLI][install-azure-cli]. + +## Create a new cluster and select the pricing tier + +Use the Azure CLI to create a new cluster on an AKS pricing tier. You can create your cluster in an existing resource group or create a new one. To learn more about resource groups and working with them, see [managing resource groups using the Azure CLI][manage-resource-group-cli]. + +Use the [`az aks create`][az-aks-create] command to create an AKS cluster. The following commands show you how to create a new cluster in the Free, Standard, and Premium tiers. + +Below, we set up the required environment variables for the resource group, cluster name, and region. We generate a unique suffix for the resource names to avoid conflicts if run multiple times. + +```shell +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export REGION="eastus2" +export RESOURCE_GROUP="aks-rg-$RANDOM_SUFFIX" +export CLUSTER_NAME="aks-cluster-$RANDOM_SUFFIX" +az group create --name $RESOURCE_GROUP --location $REGION +``` + +Results: + +```output +{ + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/aks-rg-xxx", + "location": "eastus2", + "managedBy": null, + "name": "aks-rg-xxx", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" +} +``` + +### Create a new AKS cluster in the Free tier + +```shell +# Create a new AKS cluster in the Free tier + +az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $CLUSTER_NAME \ + --tier free \ + --generate-ssh-keys +``` + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Free" + }, + ... +} +``` + +### Create a new AKS cluster in the Standard tier + +```shell +# Create a new AKS cluster in the Standard tier + +az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $CLUSTER_NAME \ + --tier standard \ + --generate-ssh-keys +``` + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Standard" + }, + ... +} +``` + +### Create a new AKS cluster in the Premium tier + +LongTermSupport and Premium tier should be enabled/disabled together. 
+ +```shell +# Create a new AKS cluster in the Premium tier +# LongTermSupport and Premium tier should be enabled/disabled together + +az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $CLUSTER_NAME \ + --tier premium \ + --k8s-support-plan AKSLongTermSupport \ + --generate-ssh-keys +``` + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Premium" + }, + "supportPlan": "AKSLongTermSupport", + ... +} +``` + +Once the deployment completes, it returns JSON-formatted information about your cluster: + +```output +# Sample output for --tier free + + }, + "sku": { + "name": "Base", + "tier": "Free" + }, + +# Sample output for --tier standard + + }, + "sku": { + "name": "Base", + "tier": "Standard" + }, + +# Sample output for --tier premium + + "sku": { + "name": "Base", + "tier": "Premium" + }, + "supportPlan": "AKSLongTermSupport", +``` + +## Update the tier of an existing AKS cluster + +The following example uses the [`az aks update`](/cli/azure/aks#az_aks_update) command to update the existing cluster. + +### Update an existing cluster from the Standard tier to the Free tier + +```azurecli-interactive +# Update an existing cluster from the Standard tier to the Free tier + +az aks update --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --tier free +``` + + + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Free" + }, + ... +} +``` + +### Update an existing cluster from the Free tier to the Standard tier + +```azurecli-interactive +# Update an existing cluster from the Free tier to the Standard tier + +az aks update --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --tier standard +``` + + + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Standard" + }, + ... +} +``` + +[Updating existing clusters from and to the Premium tier][long-term-support-update] requires changing the support plan. + +### Update an existing cluster to the Premium tier + +```azurecli-interactive +# Update an existing cluster to the Premium tier +az aks update --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --tier premium --k8s-support-plan AKSLongTermSupport +``` + + + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Premium" + }, + "supportPlan": "AKSLongTermSupport", + ... +} +``` + +### Update an existing cluster to from Premium tier to Free or Standard tier + +```shell +# Update an existing cluster to from Premium tier to Free or Standard tier +az aks update --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --tier free --k8s-support-plan KubernetesOfficial +# or +az aks update --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --tier standard --k8s-support-plan KubernetesOfficial +``` + +Results: + +```output +{ + ... + "sku": { + "name": "Base", + "tier": "Free" # or "Standard" + }, + "supportPlan": "KubernetesOfficial", + ... +} +``` + +This process takes several minutes to complete. You shouldn't experience any downtime while your cluster tier is being updated. When finished, the following example JSON snippet shows updating the existing cluster to the Standard tier in the Base SKU. + +```output + }, + "sku": { + "name": "Base", + "tier": "Standard" + }, +``` + +## Next steps + +* Use [Availability Zones][availability-zones] to increase high availability with your AKS cluster workloads. +* Configure your cluster to [limit egress traffic](limit-egress-traffic.md). 
+ +[manage-resource-group-cli]: /azure/azure-resource-manager/management/manage-resource-groups-cli +[availability-zones]: ./availability-zones.md +[az-aks-create]: /cli/azure/aks?#az_aks_create +[private-clusters]: private-clusters.md +[long-term-support]: long-term-support.md +[long-term-support-update]: long-term-support.md#enable-lts-on-an-existing-cluster +[install-azure-cli]: /cli/azure/install-azure-cli \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/istio-meshconfig.md b/scenarios/azure-aks-docs/articles/aks/istio-meshconfig.md new file mode 100644 index 000000000..0d40f84b1 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/istio-meshconfig.md @@ -0,0 +1,184 @@ +--- +title: Configure Istio-based service mesh add-on for Azure Kubernetes Service +description: Configure Istio-based service mesh add-on for Azure Kubernetes Service +ms.topic: how-to +ms.custom: innovation-engine +ms.service: azure-kubernetes-service +ms.date: 06/13/2024 +ms.author: shasb +author: shashankbarsin +--- + +# Configure Istio-based service mesh add-on for Azure Kubernetes Service + +Open-source Istio uses [MeshConfig][istio-meshconfig] to define mesh-wide settings for the Istio service mesh. Istio-based service mesh add-on for AKS builds on top of MeshConfig and classifies different properties as supported, allowed, and blocked. + +This article walks through how to configure Istio-based service mesh add-on for Azure Kubernetes Service and the support policy applicable for such configuration. + +## Prerequisites + +This guide assumes you followed the [documentation][istio-deploy-add-on] to enable the Istio add-on on an AKS cluster. + +## Set up configuration on cluster + +1. Find out which revision of Istio is deployed on the cluster: + + ```bash + export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) + export CLUSTER="my-aks-cluster" + export RESOURCE_GROUP="my-aks-rg$RANDOM_SUFFIX" + az aks show --name $CLUSTER --resource-group $RESOURCE_GROUP --query 'serviceMeshProfile' --output json + ``` + + Results: + + + + ```output + { + "istio": { + "certificateAuthority": null, + "components": { + "egressGateways": null, + "ingressGateways": null + }, + "revisions": [ + "asm-1-24" + ] + }, + "mode": "Istio" + } + ``` + + This command shows the Istio service mesh profile, including the revision(s) currently deployed on your AKS cluster. + +2. Create a ConfigMap with the name `istio-shared-configmap-` in the `aks-istio-system` namespace. For example, if your cluster is running asm-1-24 revision of mesh, then the ConfigMap needs to be named as `istio-shared-configmap-asm-1-24`. Mesh configuration has to be provided within the data section under mesh. + + Example: + + ```bash + cat < istio-shared-configmap-asm-1-24.yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: istio-shared-configmap-asm-1-24 + namespace: aks-istio-system + data: + mesh: |- + accessLogFile: /dev/stdout + defaultConfig: + holdApplicationUntilProxyStarts: true + EOF + kubectl apply -f istio-shared-configmap-asm-1-24.yaml + ``` + + Results: + + + + ```output + configmap/istio-shared-configmap-asm-1-24 created + ``` + + The values under `defaultConfig` are mesh-wide settings applied for Envoy sidecar proxy. + +> [!CAUTION] +> A default ConfigMap (for example, `istio-asm-1-24` for revision asm-1-24) is created in `aks-istio-system` namespace on the cluster when the Istio add-on is enabled. 
However, this default ConfigMap gets reconciled by the managed Istio add-on and thus users should NOT directly edit this ConfigMap. Instead users should create a revision specific Istio shared ConfigMap (for example `istio-shared-configmap-asm-1-24` for revision asm-1-24) in the aks-istio-system namespace, and then the Istio control plane will merge this with the default ConfigMap, with the default settings taking precedence. + +### Mesh configuration and upgrades + +When you're performing [canary upgrade for Istio](./istio-upgrade.md), you need to create a separate ConfigMap for the new revision in the `aks-istio-system` namespace **before initiating the canary upgrade**. This way the configuration is available when the new revision's control plane is deployed on cluster. For example, if you're upgrading the mesh from asm-1-24 to asm-1-25, you need to copy changes over from `istio-shared-configmap-asm-1-24` to create a new ConfigMap called `istio-shared-configmap-asm-1-25` in the `aks-istio-system` namespace. + +After the upgrade is completed or rolled back, you can delete the ConfigMap of the revision that was removed from the cluster. + +## Allowed, supported, and blocked MeshConfig values + +Fields in `MeshConfig` are classified as `allowed`, `supported`, or `blocked`. To learn more about these categories, see the [support policy][istio-support-policy] for Istio add-on features and configuration options. + +Mesh configuration and the list of allowed/supported fields are revision specific to account for fields being added/removed across revisions. The full list of allowed fields and the supported/unsupported ones within the allowed list is provided in the below table. When new mesh revision is made available, any changes to allowed and supported classification of the fields is noted in this table. + +### MeshConfig + +Fields present in [open source MeshConfig reference documentation][istio-meshconfig] that are not covered in the following table are blocked. For example, `configSources` is blocked. + +| **Field** | **Supported/Allowed** | **Notes** | +|-----------|---------------|-----------| +| proxyListenPort | Allowed | - | +| proxyInboundListenPort | Allowed | - | +| proxyHttpPort | Allowed | - | +| connectTimeout | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#ConnectionPoolSettings-TCPSettings) | +| tcpKeepalive | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#ConnectionPoolSettings-TCPSettings) | +| defaultConfig | Supported | Used to configure [ProxyConfig](https://istio.io/latest/docs/reference/config/istio.mesh.v1alpha1/#ProxyConfig) | +| outboundTrafficPolicy | Supported | Also configurable in [Sidecar CR](https://istio.io/latest/docs/reference/config/networking/sidecar/#OutboundTrafficPolicy) | +| extensionProviders | Allowed | - | +| defaultProviders | Allowed | - | +| accessLogFile | Supported | This field addresses the generation of access logs. For a managed experience on collection and querying of logs, refer to [Azure Monitor Container Insights on AKS][container-insights-docs]. It is encouraged to configure access logging via the [Telemetry API][istio-telemetry]. | +| accessLogFormat | Supported | This field addresses the generation of access logs. 
For a managed experience on collection and querying of logs, refer to [Azure Monitor Container Insights on AKS][container-insights-docs] | +| accessLogEncoding | Supported | This field addresses the generation of access logs. For a managed experience on collection and querying of logs, refer to [Azure Monitor Container Insights on AKS][container-insights-docs] | +| enableTracing | Allowed | It is encouraged to configure tracing via the [Telemetry API][istio-telemetry]. | +| enableEnvoyAccessLogService | Supported | This field addresses the generation of access logs. For a managed experience on collection and querying of logs, refer to [Azure Monitor Container Insights on AKS][container-insights-docs] | +| disableEnvoyListenerLog | Supported | This field addresses the generation of access logs. For a managed experience on collection and querying of logs, refer to [Azure Monitor Container Insights on AKS][container-insights-docs] | +| trustDomain | Allowed | - | +| trustDomainAliases | Allowed | - | +| caCertificates | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#ClientTLSSettings) | +| defaultServiceExportTo | Allowed | Configurable in [ServiceEntry](https://istio.io/latest/docs/reference/config/networking/service-entry/#ServiceEntry) | +| defaultVirtualServiceExportTo | Allowed | Configurable in [VirtualService](https://istio.io/latest/docs/reference/config/networking/virtual-service/#VirtualService) | +| defaultDestinationRuleExportTo | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#DestinationRule) | +| localityLbSetting | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#LoadBalancerSettings) | +| dnsRefreshRate | Allowed | - | +| h2UpgradePolicy | Allowed | Configurable in [DestinationRule](https://istio.io/latest/docs/reference/config/networking/destination-rule/#ConnectionPoolSettings-HTTPSettings) | +| enablePrometheusMerge | Allowed | - | +| discoverySelectors | Supported | - | +| pathNormalization | Allowed | - | +| defaultHttpRetryPolicy | Allowed | Configurable in [VirtualService](https://istio.io/latest/docs/reference/config/networking/virtual-service/#HTTPRetry) | +| serviceSettings | Allowed | - | +| meshMTLS | Allowed | - | +| tlsDefaults | Allowed | - | +| ingressService | Allowed | Name of the Kubernetes service used for the istio ingress controller. | +| ingressSelector | Allowed | Defines which gateway deployment to use as the Ingress controller. This field corresponds to the Gateway.selector field, and will be set as istio: INGRESS_SELECTOR. | + +### ProxyConfig (meshConfig.defaultConfig) + +Fields present in [open source MeshConfig reference documentation](https://istio.io/latest/docs/reference/config/istio.mesh.v1alpha1/#ProxyConfig) that are not covered in the following table are blocked. + +| **Field** | **Supported/Allowed** | **Notes** | +|-----------|-----------------------|-----------| +| tracingServiceName | Allowed | It is encouraged to configure tracing via the [Telemetry API][istio-telemetry]. | +| drainDuration | Supported | - | +| statsUdpAddress | Allowed | - | +| proxyAdminPort | Allowed | - | +| tracing | Allowed | It is encouraged to configure tracing via the [Telemetry API][istio-telemetry]. | +| concurrency | Supported | - | +| envoyAccessLogService | Allowed | It is encouraged to configure tracing via the [Telemetry API][istio-telemetry]. 
| +| envoyMetricsService | Allowed | It is encouraged to configure metrics collection via the [Telemetry API][istio-telemetry]. +| proxyMetadata | Allowed | - | +| statusPort | Allowed | - | +| extraStatTags | Allowed | - | +| gatewayTopology | Allowed | - | +| proxyStatsMatcher | Allowed | - | +| terminationDrainDuration | Supported | - | +| meshId | Allowed | - | +| holdApplicationUntilProxyStarts | Supported | - | +| caCertificatesPem | Allowed | - | +| privateKeyProvider | Allowed | - | + +> [!CAUTION] +> **Support scope of configurations:** Mesh configuration allows for extension providers such as self-managed instances of Zipkin or Apache Skywalking to be configured with the Istio add-on. However, these extension providers are outside the support scope of the Istio add-on. Any issues associated with extension tools are outside the support boundary of the Istio add-on. + +## Common errors and troubleshooting tips + +- Ensure that the MeshConfig is indented with spaces instead of tabs. +- Ensure that you're only editing the revision specific shared ConfigMap (for example `istio-shared-configmap-asm-1-24`) and not trying to edit the default ConfigMap (for example `istio-asm-1-24`). +- The ConfigMap must follow the name `istio-shared-configmap-` and be in the `aks-istio-system` namespace. +- Ensure that all MeshConfig fields are spelled correctly. If they're unrecognized or if they aren't part of the allowed list, admission control denies such configurations. +- When performing canary upgrades, [check your revision specific ConfigMaps](#mesh-configuration-and-upgrades) to ensure configurations exist for the revisions deployed on your cluster. +- Certain `MeshConfig` options such as accessLogging may increase Envoy's resource consumption, and disabling some of these settings may mitigate Istio data plane resource utilization. It's also advisable to use the `discoverySelectors` field in the MeshConfig to help alleviate memory consumption for Istiod and Envoy. +- If the `concurrency` field in the MeshConfig is misconfigured and set to zero, it causes Envoy to use up all CPU cores. Instead if this field is unset, number of worker threads to run is automatically determined based on CPU requests/limits. +- [Pod and sidecar race conditions][istio-sidecar-race-condition] in which the application starts before Envoy can be mitigated using the `holdApplicationUntilProxyStarts` field in the MeshConfig. 
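To check which ConfigMaps exist and which mesh settings are currently applied while troubleshooting, you can inspect the `aks-istio-system` namespace directly. The following commands are a minimal sketch; the revision name `asm-1-24` is only an example and should match the revision returned by `az aks show` for your cluster:

```bash
# List the default and shared ConfigMaps for every revision present on the cluster
kubectl get configmaps -n aks-istio-system

# Inspect the mesh settings applied through the revision-specific shared ConfigMap
kubectl get configmap istio-shared-configmap-asm-1-24 -n aks-istio-system -o yaml
```

If the shared ConfigMap for a newly deployed revision is missing from this list, create it before continuing with a canary upgrade.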
+ +[istio-meshconfig]: https://istio.io/latest/docs/reference/config/istio.mesh.v1alpha1/ +[istio-sidecar-race-condition]: https://istio.io/latest/docs/ops/common-problems/injection/#pod-or-containers-start-with-network-issues-if-istio-proxy-is-not-ready +[istio-deploy-add-on]: istio-deploy-addon.md +[container-insights-docs]: /azure/azure-monitor/containers/container-insights-overview +[istio-support-policy]: ./istio-support-policy.md#allowed-supported-and-blocked-customizations +[istio-telemetry]: ./istio-telemetry.md diff --git a/scenarios/azure-aks-docs/articles/aks/istio-scale.md b/scenarios/azure-aks-docs/articles/aks/istio-scale.md new file mode 100644 index 000000000..a3fa97c18 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/istio-scale.md @@ -0,0 +1,167 @@ +--- +title: Istio service mesh Azure Kubernetes Service add-on performance and scaling +description: Istio service mesh Azure Kubernetes Service add-on performance and scaling +ms.topic: concept-article +ms.custom: innovation-engine +ms.service: azure-kubernetes-service +ms.date: 06/13/2024 +ms.author: shalierxia +--- + +# Istio service mesh add-on performance and scaling +The Istio-based service mesh add-on is logically split into a control plane (`istiod`) and a data plane. The data plane is composed of Envoy sidecar proxies inside workload pods. Istiod manages and configures these Envoy proxies. This article presents the performance of both the control and data plane for revision asm-1-19, including resource consumption, sidecar capacity, and latency overhead. Additionally, it provides suggestions for addressing potential strain on resources during periods of heavy load. This article also covers how to customize scaling for the control plane and gateways. + +## Control plane performance +[Istiod’s CPU and memory requirements][control-plane-performance] correlate with the rate of deployment and configuration changes and the number of proxies connected. The scenarios tested were: + +- Pod churn: examines the impact of pod churning on `istiod`. To reduce variables, only one service is used for all sidecars. +- Multiple services: examines the impact of multiple services on the maximum sidecars Istiod can manage (sidecar capacity), where each service has `N` sidecars, totaling the overall maximum. + +#### Test specifications +- One `istiod` instance with default settings +- Horizontal pod autoscaling disabled +- Tested with two network plugins: Azure Container Networking Interface (CNI) Overlay and Azure CNI Overlay with Cilium [ (recommended network plugins for large scale clusters) ](/azure/aks/azure-cni-overlay?tabs=kubectl#choosing-a-network-model-to-use) +- Node SKU: Standard D16 v3 (16 vCPU, 64-GB memory) +- Kubernetes version: 1.28.5 +- Istio revision: asm-1-19 + +### Pod churn +The [ClusterLoader2 framework][clusterloader2] was used to determine the maximum number of sidecars Istiod can manage when there's sidecar churning. The churn percent is defined as the percent of sidecars churned down/up during the test. For example, 50% churn for 10,000 sidecars would mean that 5,000 sidecars were churned down, then 5,000 sidecars were churned up. The churn percents tested were determined from the typical churn percentage during deployment rollouts (`maxUnavailable`). The churn rate was calculated by determining the total number of sidecars churned (up and down) over the actual time taken to complete the churning process. 
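As a hypothetical illustration of that calculation: 50% churn of 10,000 sidecars produces 10,000 churn events in total (5,000 scaled down plus 5,000 scaled up), so if the churning process took 500 seconds, the churn rate would be 10,000 / 500 = 20 sidecars per second. The actual durations from the tests below aren't published here, so this figure is illustrative only.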
+ +#### Sidecar capacity and Istiod CPU and memory + +**Azure CNI overlay** + +| Churn (%) | Churn Rate (sidecars/sec) | Sidecar Capacity | Istiod Memory (GB) | Istiod CPU | +|-------------|-----------------------------|--------------------|----------------------|--------------| +| 0 | -- | 25000 | 32.1 | 15 | +| 25 | 31.2 | 15000 | 22.2 | 15 | +| 50 | 31.2 | 15000 | 25.4 | 15 | + + +**Azure CNI overlay with Cilium** + +| Churn (%) | Churn Rate (sidecars/sec) | Sidecar Capacity | Istiod Memory (GB) | Istiod CPU | +|-------------|-----------------------------|--------------------|----------------------|--------------| +| 0 |-- | 30000 | 41.2 | 15 | +| 25 | 41.7 | 25000 | 36.1 | 16 | +| 50 | 37.9 | 25000 | 42.7 | 16 | + + +### Multiple services +The [ClusterLoader2 framework][clusterloader2] was used to determine the maximum number of sidecars `istiod` can manage with 1,000 services. The results can be compared to the 0% churn test (one service) in the pod churn scenario. Each service had `N` sidecars contributing to the overall maximum sidecar count. The API Server resource usage was observed to determine if there was any significant stress from the add-on. + +**Sidecar capacity** + +| Azure CNI Overlay | Azure CNI Overlay with Cilium | +|---------------------|---------------------------------| +| 20000 | 20000 | + +**CPU and memory** + +| Resource | Azure CNI Overlay | Azure CNI Overlay with Cilium | +|------------------------|--------------------|---------------------------------| +| API Server Memory (GB) | 38.9 | 9.7 | +| API Server CPU | 6.1 | 4.7 | +| Istiod Memory (GB) | 40.4 | 42.6 | +| Istiod CPU | 15 | 16 | + + +## Data plane performance +Various factors impact [sidecar performance][data-plane-performance] such as request size, number of proxy worker threads, and number of client connections. Additionally, any request flowing through the mesh traverses the client-side proxy and then the server-side proxy. Therefore, latency and resource consumption are measured to determine the data plane performance. + +[`Fortio`][fortio] was used to create the load. The test was conducted with the [Istio benchmark repository][istio-benchmark] that was modified for use with the add-on. + +#### Test specifications +- Tested with two network plugins: Azure CNI Overlay and Azure CNI Overlay with Cilium [ (recommended network plugins for large scale clusters) ](/azure/aks/azure-cni-overlay?tabs=kubectl#choosing-a-network-model-to-use) +- Node SKU: Standard D16 v5 (16 vCPU, 64-GB memory) +- Kubernetes version: 1.28.5 +- Two proxy workers +- 1-KB payload +- 1,000 Queries per second (QPS) at varying client connections +- `http/1.1` protocol and mutual Transport Layer Security (TLS) enabled +- 26 data points collected + +#### CPU and memory +The memory and CPU usage for both the client and server proxy for 16 client connections and 1,000 QPS across all network plugin scenarios is roughly 0.4 vCPU and 72 MB. + +#### Latency +The sidecar Envoy proxy collects raw telemetry data after responding to a client, which doesn't directly affect the request's total processing time. However, this process delays the start of handling the next request, contributing to queue wait times and influencing average and tail latencies. Depending on the traffic pattern, the actual tail latency varies. + +The following results evaluate the impact of adding sidecar proxies to the data path, showcasing the P90 and P99 latency. 
+- Sidecar traffic path: client --> client-sidecar --> server-sidecar --> server +- Baseline traffic path: client --> server + +A comparison of data plane latency performance across Istio add-on and AKS versions can be found [here](./istio-latency.md). + +| Azure CNI Overlay |Azure CNI Overlay with Cilium | +|:-------------------------:|:-------------------------:| +[ ![Diagram that compares P99 latency for Azure CNI Overlay.](./media/aks-istio-addon/latency-box-plot/overlay-azure-p99.png) ](./media/aks-istio-addon/latency-box-plot/overlay-azure-p99.png#lightbox) | [ ![Diagram that compares P99 latency for Azure CNI Overlay with Cilium.](./media/aks-istio-addon/latency-box-plot/overlay-cilium-p99.png) ](./media/aks-istio-addon/latency-box-plot/overlay-cilium-p99.png#lightbox) +[ ![Diagram that compares P90 latency for Azure CNI Overlay.](./media/aks-istio-addon/latency-box-plot/overlay-azure-p90.png) ](./media/aks-istio-addon/latency-box-plot/overlay-azure-p90.png#lightbox) | [ ![Diagram that compares P90 latency for Azure CNI Overlay with Cilium.](./media/aks-istio-addon/latency-box-plot/overlay-cilium-p90.png) ](./media/aks-istio-addon/latency-box-plot/overlay-cilium-p90.png#lightbox) + +## Scaling + +### Horizontal pod autoscaling customization + +[Horizontal pod autoscaling (HPA)][hpa] is enabled for the `istiod` and ingress gateway pods. The default configurations for `istiod` and the gateways are: +- Min Replicas: 2 +- Max Replicas: 5 +- CPU Utilization: 80% + +> [!NOTE] +> To prevent conflicts with the `PodDisruptionBudget`, the add-on does not allow setting the `minReplicas` below the initial default of `2`. + +The following are the `istiod` and ingress gateway HPA resources: +```console +NAMESPACE NAME REFERENCE +aks-istio-ingress aks-istio-ingressgateway-external-asm-1-19 Deployment/aks-istio-ingressgateway-external-asm-1-19 + +aks-istio-ingress aks-istio-ingressgateway-internal-asm-1-19 Deployment/aks-istio-ingressgateway-internal-asm-1-19 + +aks-istio-system istiod-asm-1-19 Deployment/istiod-asm-1-19 +``` + +The HPA configuration can be modified through patches and direct edits. + +First, connect to your AKS cluster using the Azure CLI: + +```bash +az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER --overwrite-existing +``` + +Then, you can patch the HPA resources to customize the scaling settings. Modify the minimum and maximum number of replicas for the external ingress gateway HPA resource to scale as needed. Replace the variable values according to your AKS cluster setup if needed. + +```bash +# Get the external ingress gateway HPA name dynamically +EXTERNAL_HPA_NAME=$(kubectl get hpa -n aks-istio-ingress -o jsonpath='{.items[?(@.metadata.name contains "external")].metadata.name}') + +kubectl patch hpa $EXTERNAL_HPA_NAME -n aks-istio-ingress --type merge --patch '{"spec": {"minReplicas": 3, "maxReplicas": 6}}' +``` + +Results: + + + +```output +horizontalpodautoscaler.autoscaling/aks-istio-ingressgateway-external-asm-1-19 patched +``` + +> [!NOTE] +> See the [Istio add-on upgrade documentation][istio-upgrade-hpa] for details on how HPA settings are applied across both revisions during a canary upgrade. + +## Service entry +Istio's ServiceEntry custom resource definition enables adding other services into the Istio’s internal service registry. A [ServiceEntry][serviceentry] allows services already in the mesh to route or access the services specified. 
However, the configuration of multiple ServiceEntries with the `resolution` field set to DNS can cause a [heavy load on Domain Name System (DNS) servers][understanding-dns]. The following suggestions can help reduce the load: + +- Switch to `resolution: NONE` to avoid proxy DNS lookups entirely. Suitable for most use cases. +- Increase TTL (Time To Live) if you control the domains being resolved. +- Limit the ServiceEntry scope with `exportTo`. + +[control-plane-performance]: https://istio.io/latest/docs/ops/deployment/performance-and-scalability/#control-plane-performance +[data-plane-performance]: https://istio.io/latest/docs/ops/deployment/performance-and-scalability/#data-plane-performance +[clusterloader2]: https://github.com/kubernetes/perf-tests/tree/master/clusterloader2#clusterloader +[fortio]: https://fortio.org/ +[istio-benchmark]: https://github.com/istio/tools/tree/master/perf/benchmark#istio-performance-benchmarking +[serviceentry]: https://istio.io/latest/docs/reference/config/networking/service-entry/ +[understanding-dns]: https://preliminary.istio.io/latest/docs/ops/configuration/traffic-management/dns/#proxy-dns-resolution +[hpa]: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ +[istio-upgrade-hpa]: ./istio-upgrade.md#minor-revision-upgrades-with-horizontal-pod-autoscaling-customizations \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/kubelet-logs.md b/scenarios/azure-aks-docs/articles/aks/kubelet-logs.md new file mode 100644 index 000000000..249aacf34 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/kubelet-logs.md @@ -0,0 +1,111 @@ +--- +title: View kubelet logs in Azure Kubernetes Service (AKS) +description: Learn how to view troubleshooting information in the kubelet logs from Azure Kubernetes Service (AKS) nodes +ms.topic: how-to +ms.subservice: aks-monitoring +ms.date: 06/08/2024 +author: nickoman +ms.author: nickoman +ms.custom: innovation-engine,aks,logs,troubleshooting +--- + +# Get kubelet logs from Azure Kubernetes Service (AKS) cluster nodes + +When operating an Azure Kubernetes Service (AKS) cluster, you may need to review logs to troubleshoot a problem. Azure portal has a built-in capability that allows you to view logs for AKS [main components][aks-main-logs] and [cluster containers][azure-container-logs]. Occasionally, you may need to get *kubelet* logs from AKS nodes for troubleshooting purposes. + +This article shows you how you can use `journalctl` to view *kubelet* logs on an AKS node. +Alternatively, customers can collect kubelet logs using the [syslog collection feature in Azure Monitor - Container Insights](https://aka.ms/CISyslog). + +## Before you begin + +This article assumes you have an existing AKS cluster. If you need an AKS cluster, create one using [Azure CLI][aks-quickstart-cli], [Azure PowerShell][aks-quickstart-powershell], or [Azure portal][aks-quickstart-portal]. + +## Connect to your AKS cluster + +To interact with your AKS cluster, first get the cluster credentials using the Azure CLI: + +```bash +export RESOURCE_GROUP_NAME="" +export AKS_CLUSTER_NAME="" +az aks get-credentials --resource-group $RESOURCE_GROUP_NAME --name $AKS_CLUSTER_NAME +``` +This command configures `kubectl` to use the credentials for your AKS cluster. 
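Optionally, verify the connection and list the node names in your cluster; you'll need a node name when pulling kubelet logs for a specific node in the next section. This is a minimal check, and the node names in your cluster will differ from the placeholder names used in the examples below:

```bash
# Confirm connectivity to the cluster and list node names for later use
kubectl get nodes -o wide
```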
+ +## Using kubectl raw + +You can quickly view any node kubelet logs by using the following command: + +```bash +export NODE_NAME="aks-agentpool-xxxxxxx-0" +kubectl get --raw "/api/v1/nodes/$NODE_NAME/proxy/logs/messages" | grep kubelet +``` + +Results: + + + +```output +I0508 12:26:17.905042 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:27.943494 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:28.920125 8672 server.go:796] GET /stats/summary: (10.370874ms) 200 [[Ruby] 10.244.0.x:52492] +I0508 12:26:37.964650 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +... +``` + +## Create an SSH connection + +First, you need to create an SSH connection with the node you need to view *kubelet* logs for. To create this connection, follow the steps in [SSH into AKS cluster nodes][aks-ssh]. + +## Get kubelet logs + +Once you connect to the node using `kubectl debug`, run the following command to pull the *kubelet* logs: + +```console +chroot /host +journalctl -u kubelet -o cat +``` + +> [!NOTE] +> For Windows nodes, the log data is in `C:\k` and can be viewed using the *more* command: +> +> ```console +> more C:\k\kubelet.log +> ``` + +The following example output shows *kubelet* log data: + +```output +I0508 12:26:17.905042 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:27.943494 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:28.920125 8672 server.go:796] GET /stats/summary: (10.370874ms) 200 [[Ruby] 10.244.0.x:52292] +I0508 12:26:37.964650 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:47.996449 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:26:58.019746 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:05.107680 8672 server.go:796] GET /stats/summary/: (24.853838ms) 200 [[Go-http-client/1.1] 10.244.0.x:44660] +I0508 12:27:08.041736 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:18.068505 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:28.094889 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:38.121346 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:44.015205 8672 server.go:796] GET /stats/summary: (30.236824ms) 200 [[Ruby] 10.244.0.x:52588] +I0508 12:27:48.145640 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:27:58.178534 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:28:05.040375 8672 server.go:796] GET /stats/summary/: (27.78503ms) 200 [[Go-http-client/1.1] 10.244.0.x:44660] +I0508 12:28:08.214158 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:28:18.242160 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:28:28.274408 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" 
+I0508 12:28:38.296074 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:28:48.321952 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +I0508 12:28:58.344656 8672 kubelet_node_status.go:497] Using Node Hostname from cloudprovider: "aks-agentpool-xxxxxxx-0" +``` + +## Next steps + +If you need more troubleshooting information for the Kubernetes main, see [view Kubernetes main node logs in AKS][aks-main-logs]. + + +[aks-ssh]: ssh.md +[aks-main-logs]: monitor-aks-reference.md#resource-logs +[aks-quickstart-cli]: ./learn/quick-kubernetes-deploy-cli.md +[aks-quickstart-portal]: ./learn/quick-kubernetes-deploy-portal.md +[aks-quickstart-powershell]: ./learn/quick-kubernetes-deploy-powershell.md +[azure-container-logs]: /azure/azure-monitor/containers/container-insights-overview \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/nat-gateway.md b/scenarios/azure-aks-docs/articles/aks/nat-gateway.md new file mode 100644 index 000000000..16629380c --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/nat-gateway.md @@ -0,0 +1,431 @@ +--- +title: Create a managed or user-assigned NAT gateway for your Azure Kubernetes Service (AKS) cluster +description: Learn how to create an AKS cluster with managed NAT integration and user-assigned NAT gateway. +ms.topic: how-to +ms.date: 06/03/2024 +author: asudbring +ms.author: allensu +ms.custom: devx-track-azurecli, innovation-engine +--- + +# Create a managed or user-assigned NAT gateway for your Azure Kubernetes Service (AKS) cluster + +While you can route egress traffic through an Azure Load Balancer, there are limitations on the number of outbound flows of traffic you can have. Azure NAT Gateway allows up to 64,512 outbound UDP and TCP traffic flows per IP address with a maximum of 16 IP addresses. + +This article shows you how to create an Azure Kubernetes Service (AKS) cluster with a managed NAT gateway and a user-assigned NAT gateway for egress traffic. It also shows you how to disable OutboundNAT on Windows. + +## Before you begin + +* Make sure you're using the latest version of [Azure CLI][az-cli]. +* Make sure you're using Kubernetes version 1.20.x or above. +* Managed NAT gateway is incompatible with custom virtual networks. + +> [!IMPORTANT] +> In non-private clusters, API server cluster traffic is routed and processed through the clusters outbound type. To prevent API server traffic from being processed as public traffic, consider using a [private cluster][private-cluster], or check out the [API Server VNet Integration][api-server-vnet-integration] feature. + +## Create an AKS cluster with a managed NAT gateway + +* Create an AKS cluster with a new managed NAT gateway using the [`az aks create`][az-aks-create] command with the `--outbound-type managedNATGateway`, `--nat-gateway-managed-outbound-ip-count`, and `--nat-gateway-idle-timeout` parameters. If you want the NAT gateway to operate out of a specific availability zone, specify the zone using `--zones`. +* If no zone is specified when creating a managed NAT gateway, then NAT gateway is deployed to "no zone" by default. When NAT gateway is placed in **no zone**, Azure places the resource in a zone for you. For more information on non-zonal deployment model, see [non-zonal NAT gateway](/azure/nat-gateway/nat-availability-zones#non-zonal). +* A managed NAT gateway resource can't be used across multiple availability zones. 
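If you do want the NAT gateway placed in a specific zone, the only change is to add the `--zones` parameter to the create command shown in the walkthrough that follows. The snippet below is a sketch only, using placeholder names instead of the environment variables defined later:

```azurecli-interactive
az aks create \
    --resource-group <resource-group-name> \
    --name <cluster-name> \
    --node-count 3 \
    --outbound-type managedNATGateway \
    --nat-gateway-managed-outbound-ip-count 2 \
    --nat-gateway-idle-timeout 4 \
    --zones 1 \
    --generate-ssh-keys
```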
+ +The following commands first create the required resource group, then the AKS cluster with a managed NAT gateway. + +```azurecli-interactive +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export MY_RG="myResourceGroup$RANDOM_SUFFIX" +export MY_AKS="myNatCluster$RANDOM_SUFFIX" +az group create --name $MY_RG --location "eastus2" +``` + +Results: + +```output +{ + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx", + "location": "eastus2", + "managedBy": null, + "name": "myResourceGroupxxx", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" +} +``` + +```azurecli-interactive +az aks create \ + --resource-group $MY_RG \ + --name $MY_AKS \ + --node-count 3 \ + --outbound-type managedNATGateway \ + --nat-gateway-managed-outbound-ip-count 2 \ + --nat-gateway-idle-timeout 4 \ + --generate-ssh-keys +``` + +Results: + + + +```output +{ + "aadProfile": null, + "agentPoolProfiles": [ + { + ... + "name": "nodepool1", + ... + "provisioningState": "Succeeded", + ... + } + ], + "dnsPrefix": "myNatClusterxxx-dns-xxx", + "fqdn": "myNatClusterxxx-dns-xxx.xxxxx.xxxxxx.cloudapp.azure.com", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourcegroups/myResourceGroupxxx/providers/Microsoft.ContainerService/managedClusters/myNatClusterxxx", + "name": "myNatClusterxxx", + ... + "resourceGroup": "myResourceGroupxxx", + ... + "provisioningState": "Succeeded", + ... + "type": "Microsoft.ContainerService/ManagedClusters" +} +``` + +* Update the outbound IP address or idle timeout using the [`az aks update`][az-aks-update] command with the `--nat-gateway-managed-outbound-ip-count` or `--nat-gateway-idle-timeout` parameter. + +The following example updates the NAT gateway managed outbound IP count for the AKS cluster to 5. + +```azurecli-interactive +az aks update \ + --resource-group $MY_RG \ + --name $MY_AKS \ + --nat-gateway-managed-outbound-ip-count 5 +``` + +Results: + + + +```output +{ + "aadProfile": null, + "agentPoolProfiles": [ + { + ... + "name": "nodepool1", + ... + "provisioningState": "Succeeded", + ... + } + ], + "dnsPrefix": "myNatClusterxxx-dns-xxx", + "fqdn": "myNatClusterxxx-dns-xxx.xxxxx.xxxxxx.cloudapp.azure.com", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourcegroups/myResourceGroupxxx/providers/Microsoft.ContainerService/managedClusters/myNatClusterxxx", + "name": "myNatClusterxxx", + ... + "resourceGroup": "myResourceGroupxxx", + ... + "provisioningState": "Succeeded", + ... + "type": "Microsoft.ContainerService/ManagedClusters" +} +``` + +## Create an AKS cluster with a user-assigned NAT gateway + +This configuration requires bring-your-own networking (via [Kubenet][byo-vnet-kubenet] or [Azure CNI][byo-vnet-azure-cni]) and that the NAT gateway is preconfigured on the subnet. The following commands create the required resources for this scenario. + +1. Create a resource group using the [`az group create`][az-group-create] command. + + ```shell + export RANDOM_SUFFIX=$(openssl rand -hex 3) + export MY_RG="myResourceGroup$RANDOM_SUFFIX" + az group create --name $MY_RG --location southcentralus + ``` + + Results: + + ```output + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx", + "location": "southcentralus", + "managedBy": null, + "name": "myResourceGroupxxx", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" + } + ``` + +2. 
Create a managed identity for network permissions and store the ID to `$IDENTITY_ID` for later use. + + ```shell + export IDENTITY_NAME="myNatClusterId$RANDOM_SUFFIX" + export IDENTITY_ID=$(az identity create \ + --resource-group $MY_RG \ + --name $IDENTITY_NAME \ + --location southcentralus \ + --query id \ + --output tsv) + ``` + + Results: + + ```output + /xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.ManagedIdentity/userAssignedIdentities/myNatClusterIdxxx + ``` + +3. Create a public IP for the NAT gateway using the [`az network public-ip create`][az-network-public-ip-create] command. + + ```shell + export PIP_NAME="myNatGatewayPip$RANDOM_SUFFIX" + az network public-ip create \ + --resource-group $MY_RG \ + --name $PIP_NAME \ + --location southcentralus \ + --sku standard + ``` + + Results: + + ```output + { + "publicIp": { + "ddosSettings": null, + "dnsSettings": null, + "etag": "W/\"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\"", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.Network/publicIPAddresses/myNatGatewayPipxxx", + "ipAddress": null, + "ipTags": [], + "location": "southcentralus", + "name": "myNatGatewayPipxxx", + ... + "provisioningState": "Succeeded", + ... + "sku": { + "name": "Standard", + "tier": "Regional" + }, + "type": "Microsoft.Network/publicIPAddresses", + ... + } + } + ``` + +4. Create the NAT gateway using the [`az network nat gateway create`][az-network-nat-gateway-create] command. + + ```shell + export NATGATEWAY_NAME="myNatGateway$RANDOM_SUFFIX" + az network nat gateway create \ + --resource-group $MY_RG \ + --name $NATGATEWAY_NAME \ + --location southcentralus \ + --public-ip-addresses $PIP_NAME + ``` + + Results: + + ```output + { + "etag": "W/\"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\"", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.Network/natGateways/myNatGatewayxxx", + "location": "southcentralus", + "name": "myNatGatewayxxx", + "provisioningState": "Succeeded", + "publicIpAddresses": [ + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.Network/publicIPAddresses/myNatGatewayPipxxx" + } + ], + ... + "type": "Microsoft.Network/natGateways" + } + ``` + + > [!Important] + > A single NAT gateway resource can't be used across multiple availability zones. To ensure zone-resiliency, it is recommended to deploy a NAT gateway resource to each availability zone and assign to subnets containing AKS clusters in each zone. For more information on this deployment model, see [NAT gateway for each zone](/azure/nat-gateway/nat-availability-zones#zonal-nat-gateway-resource-for-each-zone-in-a-region-to-create-zone-resiliency). + > If no zone is configured for NAT gateway, the default zone placement is "no zone", in which Azure places NAT gateway into a zone for you. + +5. Create a virtual network using the [`az network vnet create`][az-network-vnet-create] command. + + ```shell + export VNET_NAME="myVnet$RANDOM_SUFFIX" + az network vnet create \ + --resource-group $MY_RG \ + --name $VNET_NAME \ + --location southcentralus \ + --address-prefixes 172.16.0.0/20 + ``` + + Results: + + ```output + { + "newVNet": { + "addressSpace": { + "addressPrefixes": [ + "172.16.0.0/20" + ] + }, + ... 
+ "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.Network/virtualNetworks/myVnetxxx", + "location": "southcentralus", + "name": "myVnetxxx", + "provisioningState": "Succeeded", + ... + "type": "Microsoft.Network/virtualNetworks", + ... + } + } + ``` + +6. Create a subnet in the virtual network using the NAT gateway and store the ID to `$SUBNET_ID` for later use. + + ```shell + export SUBNET_NAME="myNatCluster$RANDOM_SUFFIX" + export SUBNET_ID=$(az network vnet subnet create \ + --resource-group $MY_RG \ + --vnet-name $VNET_NAME \ + --name $SUBNET_NAME \ + --address-prefixes 172.16.0.0/22 \ + --nat-gateway $NATGATEWAY_NAME \ + --query id \ + --output tsv) + ``` + + Results: + + ```output + /xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.Network/virtualNetworks/myVnetxxx/subnets/myNatClusterxxx + ``` + +7. Create an AKS cluster using the subnet with the NAT gateway and the managed identity using the [`az aks create`][az-aks-create] command. + + ```shell + export AKS_NAME="myNatCluster$RANDOM_SUFFIX" + az aks create \ + --resource-group $MY_RG \ + --name $AKS_NAME \ + --location southcentralus \ + --network-plugin azure \ + --vnet-subnet-id $SUBNET_ID \ + --outbound-type userAssignedNATGateway \ + --assign-identity $IDENTITY_ID \ + --generate-ssh-keys + ``` + + Results: + + ```output + { + "aadProfile": null, + "agentPoolProfiles": [ + { + ... + "name": "nodepool1", + ... + "provisioningState": "Succeeded", + ... + } + ], + "dnsPrefix": "myNatClusterxxx-dns-xxx", + "fqdn": "myNatClusterxxx-dns-xxx.xxxxx.xxxxxx.cloudapp.azure.com", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourcegroups/myResourceGroupxxx/providers/Microsoft.ContainerService/managedClusters/myNatClusterxxx", + "name": "myNatClusterxxx", + ... + "resourceGroup": "myResourceGroupxxx", + ... + "provisioningState": "Succeeded", + ... + "type": "Microsoft.ContainerService/ManagedClusters" + } + ``` + +## Disable OutboundNAT for Windows + +Windows OutboundNAT can cause certain connection and communication issues with your AKS pods. An example issue is node port reuse. In this example, Windows OutboundNAT uses ports to translate your pod IP to your Windows node host IP, which can cause an unstable connection to the external service due to a port exhaustion issue. + +Windows enables OutboundNAT by default. You can now manually disable OutboundNAT when creating new Windows agent pools. + +### Prerequisites + +* Existing AKS cluster with v1.26 or above. If you're using Kubernetes version 1.25 or older, you need to [update your deployment configuration][upgrade-kubernetes]. + +### Limitations + +* You can't set cluster outbound type to LoadBalancer. You can set it to Nat Gateway or UDR: + * [NAT Gateway](./nat-gateway.md): NAT Gateway can automatically handle NAT connection and is more powerful than Standard Load Balancer. You might incur extra charges with this option. + * [UDR (UserDefinedRouting)](./limit-egress-traffic.md): You must keep port limitations in mind when configuring routing rules. + * If you need to switch from a load balancer to NAT Gateway, you can either add a NAT gateway into the VNet or run [`az aks upgrade`][aks-upgrade] to update the outbound type. + +> [!NOTE] +> UserDefinedRouting has the following limitations: +> +> * SNAT by Load Balancer (must use the default OutboundNAT) has "64 ports on the host IP". +> * SNAT by Azure Firewall (disable OutboundNAT) has 2496 ports per public IP. 
+> * SNAT by NAT Gateway (disable OutboundNAT) has 64512 ports per public IP. +> * If the Azure Firewall port range isn't enough for your application, you need to use NAT Gateway. +> * Azure Firewall doesn't SNAT with Network rules when the destination IP address is in a private IP address range per [IANA RFC 1918 or shared address space per IANA RFC 6598](/azure/firewall/snat-private-range). + +### Manually disable OutboundNAT for Windows + +* Manually disable OutboundNAT for Windows when creating new Windows agent pools using the [`az aks nodepool add`][az-aks-nodepool-add] command with the `--disable-windows-outbound-nat` flag. + + > [!NOTE] + > You can use an existing AKS cluster, but you might need to update the outbound type and add a node pool to enable `--disable-windows-outbound-nat`. + + The following command adds a Windows node pool to an existing AKS cluster, disabling OutboundNAT. + + ```shell + export WIN_NODEPOOL_NAME="win$(head -c 1 /dev/urandom | xxd -p)" + az aks nodepool add \ + --resource-group $MY_RG \ + --cluster-name $MY_AKS \ + --name $WIN_NODEPOOL_NAME \ + --node-count 3 \ + --os-type Windows \ + --disable-windows-outbound-nat + ``` + + Results: + + + + ```output + { + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.ContainerService/managedClusters/myNatClusterxxx/agentPools/mynpxxx", + "name": "mynpxxx", + "osType": "Windows", + "provisioningState": "Succeeded", + "resourceGroup": "myResourceGroupxxx", + "type": "Microsoft.ContainerService/managedClusters/agentPools" + } + ``` + +## Next steps + +For more information on Azure NAT Gateway, see [Azure NAT Gateway][nat-docs]. + + +[api-server-vnet-integration]: api-server-vnet-integration.md +[byo-vnet-azure-cni]: configure-azure-cni.md +[byo-vnet-kubenet]: configure-kubenet.md +[private-cluster]: private-clusters.md +[upgrade-kubernetes]:tutorial-kubernetes-upgrade-cluster.md + + +[nat-docs]: /azure/virtual-network/nat-gateway/nat-overview +[az-cli]: /cli/azure/install-azure-cli +[aks-upgrade]: /cli/azure/aks#az-aks-update +[az-aks-create]: /cli/azure/aks#az-aks-create +[az-aks-update]: /cli/azure/aks#az-aks-update +[az-group-create]: /cli/azure/group#az_group_create +[az-network-public-ip-create]: /cli/azure/network/public-ip#az_network_public_ip_create +[az-network-nat-gateway-create]: /cli/azure/network/nat/gateway#az_network_nat_gateway_create +[az-network-vnet-create]: /cli/azure/network/vnet#az_network_vnet_create +[az-aks-nodepool-add]: /cli/azure/aks/nodepool#az_aks_nodepool_add diff --git a/scenarios/azure-aks-docs/articles/aks/postgresql-ha-overview.md b/scenarios/azure-aks-docs/articles/aks/postgresql-ha-overview.md deleted file mode 100644 index 455d6024e..000000000 --- a/scenarios/azure-aks-docs/articles/aks/postgresql-ha-overview.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: 'Overview of deploying a highly available PostgreSQL database on AKS with Azure CLI' -description: Learn how to deploy a highly available PostgreSQL database on AKS using the CloudNativePG operator. -ms.topic: overview -ms.date: 06/07/2024 -author: kenkilty -ms.author: kkilty -ms.custom: innovation-engine, aks-related-content -#Customer intent: As a developer or cluster operator, I want to deploy a highly available PostgreSQL database on AKS so I can see how to run a stateful database workload using the managed Kubernetes service in Azure. 
---- -# Deploy a highly available PostgreSQL database on AKS with Azure CLI - -In this guide, you deploy a highly available PostgreSQL cluster that spans multiple Azure availability zones on AKS with Azure CLI. - -This article walks through the prerequisites for setting up a PostgreSQL cluster on [Azure Kubernetes Service (AKS)][what-is-aks] and provides an overview of the full deployment process and architecture. - -[!INCLUDE [open source disclaimer](./includes/open-source-disclaimer.md)] - -## Prerequisites - -* This guide assumes a basic understanding of [core Kubernetes concepts][core-kubernetes-concepts] and [PostgreSQL][postgresql]. -* You need the **Owner** or **User Access Administrator** and the **Contributor** [Azure built-in roles][azure-roles] on a subscription in your Azure account. - -[!INCLUDE [azure-cli-prepare-your-environment-no-header.md](~/reusable-content/azure-cli/azure-cli-prepare-your-environment-no-header.md)] - -* You also need the following resources installed: - - * [Azure CLI](/cli/azure/install-azure-cli) version 2.56 or later. - * [Azure Kubernetes Service (AKS) preview extension][aks-preview]. - * [jq][jq], version 1.5 or later. - * [kubectl][install-kubectl] version 1.21.0 or later. - * [Helm][install-helm] version 3.0.0 or later. - * [openssl][install-openssl] version 3.3.0 or later. - * [Visual Studio Code][install-vscode] or equivalent. - * [Krew][install-krew] version 0.4.4 or later. - * [kubectl CloudNativePG (CNPG) Plugin][cnpg-plugin]. - -## Deployment process - -In this guide, you learn how to: - -* Use Azure CLI to create a multi-zone AKS cluster. -* Deploy a highly available PostgreSQL cluster and database using the [CNPG operator][cnpg-plugin]. -* Set up monitoring for PostgreSQL using Prometheus and Grafana. -* Deploy a sample dataset to a PostgreSQL database. -* Perform PostgreSQL and AKS cluster upgrades. -* Simulate a cluster interruption and PostgreSQL replica failover. -* Perform backup and restore of a PostgreSQL database. - -## Deployment architecture - -This diagram illustrates a PostgreSQL cluster setup with one primary replica and two read replicas managed by the [CloudNativePG (CNPG)](https://cloudnative-pg.io/) operator. The architecture provides a highly available PostgreSQL running on an AKS cluster that can withstand a zone outage by failing over across replicas. - -Backups are stored on [Azure Blob Storage](/azure/storage/blobs/), providing another way to restore the database in the event of an issue with streaming replication from the primary replica. - -:::image source="./media/postgresql-ha-overview/postgres-architecture-diagram.png" alt-text="Diagram of CNPG architecture." lightbox="./media/postgresql-ha-overview/postgres-architecture-diagram.png"::: - -> [!NOTE] -> For applications that require data separation at the database level, you can add more databases with postInitSQL commands and similar. It is not currently possible with the CNPG operator to add more databases in a declarative way. -[Learn more](https://github.com/cloudnative-pg/cloudnative-pg) about the CNPG operator. - -## Next steps - -> [!div class="nextstepaction"] -> [Create the infrastructure to deploy a highly available PostgreSQL database on AKS using the CNPG operator][create-infrastructure] - -## Contributors - -*This article is maintained by Microsoft. 
It was originally written by the following contributors*: - -* Ken Kilty | Principal TPM -* Russell de Pina | Principal TPM -* Adrian Joian | Senior Customer Engineer -* Jenny Hayes | Senior Content Developer -* Carol Smith | Senior Content Developer -* Erin Schaffer | Content Developer 2 -* Adam Sharif | Customer Engineer 2 - - -[what-is-aks]: ./what-is-aks.md -[postgresql]: https://www.postgresql.org/ -[core-kubernetes-concepts]: ./concepts-clusters-workloads.md -[azure-roles]: /azure/role-based-access-control/built-in-roles -[aks-preview]: ./draft.md#install-the-aks-preview-azure-cli-extension -[jq]: https://jqlang.github.io/jq/ -[install-kubectl]: https://kubernetes.io/docs/tasks/tools/install-kubectl/ -[install-helm]: https://helm.sh/docs/intro/install/ -[install-openssl]: https://www.openssl.org/ -[install-vscode]: https://code.visualstudio.com/Download -[install-krew]: https://krew.sigs.k8s.io/ -[cnpg-plugin]: https://cloudnative-pg.io/documentation/current/kubectl-plugin/#using-krew -[create-infrastructure]: ./create-postgresql-ha.md \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/resize-cluster.md b/scenarios/azure-aks-docs/articles/aks/resize-cluster.md new file mode 100644 index 000000000..828ffa0ec --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/resize-cluster.md @@ -0,0 +1,132 @@ +--- +title: Resize Azure Kubernetes Service (AKS) clusters +description: In this article, you learn about the importance of right-sizing your AKS clusters and how you can right-size them to optimize costs and performance. +ms.topic: how-to +ms.date: 06/13/2024 +author: schaffererin +ms.author: schaffererin +ms.service: azure-kubernetes-service +# Customer intent: As a cluster operator, I want to resize my cluster so I can scale my workloads based on demand. +ms.custom: innovation-engine, devx-track-azurecli, aks, scaling, cluster-management +--- + +# Resize Azure Kubernetes Service (AKS) clusters + +In this article, you learn how to resize an Azure Kubernetes Service (AKS) cluster. It's important to right-size your clusters to optimize costs and performance. You can manually resize a cluster by adding or removing the nodes to meet the needs of your applications. You can also autoscale your cluster to automatically adjust the number of nodes in response to changing demands. + +## Cluster right-sizing + +When you create an AKS cluster, you specify the number of nodes and the size of the nodes, which determines the compute capacity of the cluster. Oversized clusters can lead to unnecessary costs, while undersized clusters can lead to performance issues. You can adjust the number and size of the nodes in the cluster to right-size the cluster to meet the needs of your applications. + +Consider the following factors when right-sizing your cluster: + +* **Resource requirements**: Understand the resource requirements of your applications to determine the number of nodes and the size of the nodes needed to run your workloads. +* **Performance requirements**: Determine the performance requirements of your applications to ensure that the cluster can meet the demands of your workloads. +* **Cost considerations**: Optimize costs by right-sizing your cluster to avoid unnecessary costs associated with oversized clusters. +* **Application demands**: Monitor the demands of your applications to adjust the size of the cluster in response to changing demands. 
+* **Infrastructure constraints**: Consider the infrastructure constraints of your environment, such as capacity or reserved instance limiting to specific SKUs, to ensure that the cluster can be right-sized within the limits of your environment. + +## Monitor cluster performance and cost + +Closely monitor the performance and cost of your clusters to ensure they're right-sized to meet the needs of your application and make adjustments as needed. You can use the following resources for monitoring: + +* [Identify high CPU usage in Azure Kubernetes Service (AKS) clusters][identify-high-cpu-usage] +* [Troubleshoot memory saturation in Azure Kubernetes Service (AKS) clusters][troubleshoot-memory-saturation] +* [Cost analysis add-on for Azure Kubernetes Service (AKS)](./cost-analysis.md) +* [Configure the Metrics Server Vertical Pod Autoscaler (VPA) in Azure Kubernetes Service (AKS)](./use-metrics-server-vertical-pod-autoscaler.md) + +## When to resize a cluster + +You might want to resize a cluster in scenarios such as the following: + +* If you see that CPU and memory usage is consistently low, consider *downsizing* the cluster. If usage is consistently high, make sure you have [autoscaling enabled](#automatically-resize-an-aks-cluster) and increase the maximum node count if necessary. +* The [cost analysis add-on for AKS](./cost-analysis.md) shows you details about node usage and cost that indicate you might benefit from cluster resizing. For example, if you see that your nodes have a *high idle cost* with a *low usage cost*, you might consider resizing your cluster to reduce costs. +* The [Metrics Server VPA](./use-metrics-server-vertical-pod-autoscaler.md) shows you that your requests and/or limits are too high or low based on historical usage. You can use this information to adjust your cluster size to better match your workload. +* You experience performance issues such as resource starvation. This might be a result of the cluster being undersized for the demands of your applications. + +## What happens when I resize a cluster? + +### Increasing cluster size + +You can increase the size of an AKS cluster by adding nodes to the cluster. You can [add nodes to the cluster manually][manually-scale] or [configure autoscaling to automatically adjust the number of nodes](#automatically-resize-an-aks-cluster) in response to changing demands. + +When you increase the size of a cluster, the following changes occur: + +* New node instances are created using the same configuration as the existing nodes in the cluster. +* New pods might be scheduled on the new nodes to distribute the workload across the cluster. +* Existing pods don't move to the new nodes unless they are rescheduled due to node failures or other reasons. + +### Decreasing cluster size + +You can decrease the size of an AKS cluster by removing nodes from the cluster. When you remove nodes from the cluster, the nodes are automatically drained and removed from the cluster. You can remove nodes from the cluster manually or configure autoscaling to automatically adjust the number of nodes in response to changing demands. + +When you decrease the size of a cluster, the following changes occur: + +* AKS gracefully terminates the nodes and drains the pods running on the nodes before removing the nodes from the cluster. +* Any pods managed by a replication controller are rescheduled on other node instances in the cluster. +* Any pods that aren't managed by a replication controller aren't restarted. 
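Before you decrease the node count, it can help to confirm that the remaining nodes have enough headroom to absorb the rescheduled pods. The following commands are an optional quick check (they assume you already pulled cluster credentials with `az aks get-credentials`; `kubectl top` relies on the metrics server, which AKS enables by default):

```bash
# Show current CPU and memory usage per node
kubectl top nodes

# Show how pods are currently spread across nodes
kubectl get pods --all-namespaces -o wide
```

If the busiest nodes are already close to their capacity, removing nodes is likely to cause scheduling pressure or evictions, so consider resizing more conservatively.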
+ +## Manually resize an AKS cluster + +### [Azure CLI](#tab/azure-cli) + +* Resize an AKS cluster using the [`az aks scale`][az-aks-scale] command with the `--node-count` and `--nodepool-name` parameters. + +Before running the resize command, set the required environment variables with your own values. The example values should be substituted with your actual resource group, cluster, desired node count, and node pool name. + +```azurecli-interactive +az aks scale --resource-group $RESOURCE_GROUP --name $CLUSTER_NAME --node-count $NUM_NODES --nodepool-name $NODE_POOL_NAME +``` + +Results: + + + +```output +{ + "agentPoolProfiles": [ + { + "count": 4, + "maxCount": null, + "minCount": null, + "name": "nodepool1", + ... + } + ], + "dnsPrefix": "xxxxx", + "fqdn": "xxxxx.xxxxx.xxxxxx.cloudapp.azure.com", + ... +} +``` + +Repeat this command for each node pool in the cluster that you want to resize. If your cluster has only one node pool, you can omit the `--nodepool-name` parameter. + +### [Azure portal](#tab/azure-portal) + +1. In the Azure portal, go to the AKS cluster that you want to resize. +2. Under **Settings**, select **Node pools**. +3. Select the node pool that you want to resize > **Scale node pool**. +4. On the **Scale node pool** page, enter the new **Node count** value. +5. Select **Apply** and repeat the steps for each node pool in the cluster that you want to resize. + +--- + +## Automatically resize an AKS cluster + +Use the [cluster autoscaler](./cluster-autoscaler-overview.md) to automatically resize your node pools in response to changing demands. + +For more information, see the [Cluster autoscaling in Azure Kubernetes Service (AKS) overview](./cluster-autoscaler-overview.md). To configure cluster autoscaling in AKS, see [Use the cluster autoscaler in Azure Kubernetes Service (AKS)](./cluster-autoscaler.md). + +## Next steps + +In this article, you learned how to right-size an AKS cluster. To learn more about managing AKS clusters, see the following articles: + +* [Stop and start an AKS cluster](./start-stop-cluster.md) +* [Configure a private AKS cluster](./private-clusters.md) +* [Use AKS cluster extensions](./cluster-extensions.md) + + +[az-aks-scale]: /cli/azure/aks#az-aks-scale +[manually-scale]: ./scale-cluster.md +[identify-high-cpu-usage]: /troubleshoot/azure/azure-kubernetes/availability-performance/identify-high-cpu-consuming-containers-aks +[troubleshoot-memory-saturation]: /troubleshoot/azure/azure-kubernetes/availability-performance/identify-memory-saturation-aks \ No newline at end of file diff --git a/scenarios/azure-aks-docs/articles/aks/use-etags.md b/scenarios/azure-aks-docs/articles/aks/use-etags.md new file mode 100644 index 000000000..03d7e1af8 --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/use-etags.md @@ -0,0 +1,137 @@ +--- +title: Enhancing Concurrency Control with Entity Tags (eTags) in Azure Kubernetes Service +description: Learn how to use eTags (Entity Tags) to enable concurrency control and avoid racing conditions or overwriting scenarios. +ms.topic: how-to +ms.date: 06/10/2024 +author: reginalin +ms.author: reginalin +ms.custom: innovation-engine, aks, etag, concurrency-control +ms.subservice: aks-nodes +--- + +# Enhance concurrency control with entity tags (eTags) in Azure Kubernetes Service + +To prevent conflicting requests in Azure Kubernetes Service (AKS), eTags (Entity Tags) serve as unique identifiers that enable concurrency control. 
When a request to the cluster is made, the system checks whether the provided eTag matches the latest version stored in the database. If there is a mismatch, the request fails early, ensuring that no unintended overwrites occur. + +## Utilizing eTag Headers + +There are two options for applying eTags through headers: + +**`–-if-match`** Header: Ensures that the operation is performed only if the existing eTag matches the value provided in this header. + +**`–-if-none-match`** Header: Ensures that the operation is performed only if none of the eTags matches the value provided in this header. This header type can only be empty or a `*`. + +### Find existing ETags + +You can do either a `LIST` or a `GET` call to your cluster or node pool to see the existing ETag. An ETag looks something like the following example: +``` +"agentPoolProfiles": [ + {"eTag": "5e5ffdce-356b-431b-b050-81b45eef2a12"} +] +``` + +### What would modify existing ETags + +ETags can exist at both the cluster and agent pool levels. Depending on the scope of the operations you are performing, you can pass in the corresponding eTag. When you perform a cluster-level operation, both the cluster-level eTag and agent pool eTag are updated. When you perform an agent pool operation, only the agent pool eTag is updated. + +### Include ETags in operation headers + +Headers are optional to use. The following examples show how to use `–-if-match` and `-–if-none-match` headers. + +**Example 1**: The CLI command below deletes an existing cluster `MyManagedCluster` if the eTag matches with `yvjvt` + +Suppose you want to delete an AKS cluster using its eTag. (For illustration, replace `"yvjvt"` with the actual eTag value you retrieved from the resource.) + +```shell +az aks delete -g $RG_NAME -n $CLUSTER_NAME --if-match "yvjvt" +``` + +**Example 2**: The CLI command below creates a new cluster. If `*` is provided in the `–if-none-match` header, that means to validate the resource does not exist. + +First, create a resource group: + +```azurecli +export RANDOM_SUFFIX=$(head -c 3 /dev/urandom | xxd -p) +export RG_NAME="my-resource-group$RANDOM_SUFFIX" +export REGION="eastus2" + +az group create --name $RG_NAME --location $REGION +``` + +Then, create a new AKS cluster with a random suffix to ensure uniqueness: + +```azurecli +export CLUSTER_NAME="my-managed-cluster$RANDOM_SUFFIX" + +az aks create -g $RG_NAME -n $CLUSTER_NAME --location $REGION --if-none-match "*" +``` + +Results: + + + +```output +{ + "aadProfile": null, + "addonProfiles": null, + "agentPoolProfiles": [ + { + "eTag": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + ... + } + ], + "apiServerAccessProfile": null, + "autoScalerProfile": null, + ... + "name": "my-managed-clusterxxx", + ... + "provisioningState": "Succeeded", + ... + "resourceGroup": "my-resource-groupxxx", + ... +} +``` + +### Configurations and Expected Behavior + +The table below outlines the expected behavior of HTTP operations (PUT, PATCH, and DELETE) based on different eTag configurations and resource existence. They show how the presence of `--if-match` or `--if-none-match` headers affects the response status codes, ensuring concurrency control and preventing unintended modifications. 
+ + +**PUT** | **Resource does not exist** | **Resource exists** +--- | --- | --- +**`--if-match = ""`** | 201 – Created | 200 - Ok +**`--if-match = "*"`** | 412 - Precondition Failed | 200 - OK +**`--if-match = "xyz"`** | 412 - Precondition Failed | 200 - OK OR 412 - Precondition Failed +**`--if-none-match = "*"`** | 201 - Created | 412 - Precondition Failed + + +**PATCH** | **Resource does not exist** | **Resource exists** +--- | --- | --- +**`--if-match = ""`** | 404 - Not Found | 200 - OK +**`--if-match = "*"`** | 404 - Not Found | 200 - OK +**`--if-match = "xyz"`** | 404 - Not Found | 200 - OK OR 412 - Precondition Failed + + +**DELETE** | **Resource does not exist** | **Resource exists** +--- | --- | --- +**`--if-match = ""`** | 204 - No Content | 200 - OK +**`--if-match = "*"`** | 204 - No Content | 200 - OK +**`--if-match = "xyz"`** | 204 - No Content | 200 - OK OR 412 - Precondition Failed + +## Common Issues and Recommended Mitigations + +### **Scenario 1**: `BadRequest` – `--if-none-match` header is not empty or not set to `*` + +This fails the prevalidation checks. The `--if-none-match` header can only be empty or take a value of `*`. + +### **Scenario 2**: `BadRequest` - `--if-match` header is not empty AND `--if-none-match` header is `*` + +This fails the prevalidation checks. Both headers cannot be used at the same time. + +### **Scenario 3**: `PreConditionFailed` - `--if-none-match` is `*` and the given resource already exists + +The request is rejected if a `*` (wildcard of any) value is passed into `--if-none-match` header and the resource already exists. + +### **Scenario 4**: `PreConditionFailed` - The value of `--if-match` header does not match the latest eTag value of the resource + +The request is rejected if the header provided does not match with the eTag value. A new GET operation is needed to get the latest eTag on the resource and update the header value in the request. diff --git a/scenarios/azure-aks-docs/articles/aks/use-labels.md b/scenarios/azure-aks-docs/articles/aks/use-labels.md new file mode 100644 index 000000000..1eb6ef75b --- /dev/null +++ b/scenarios/azure-aks-docs/articles/aks/use-labels.md @@ -0,0 +1,254 @@ +--- +title: Use labels in an Azure Kubernetes Service (AKS) cluster +description: Learn how to use labels in an Azure Kubernetes Service (AKS) cluster. +author: rayoef +ms.author: rayoflores +ms.topic: how-to +ms.date: 06/10/2024 +ms.custom: innovation-engine, devx-track-azurecli, linux-related-content, kubernetes, aks +--- + +# Use labels in an Azure Kubernetes Service (AKS) cluster + +If you have multiple node pools, you may want to add a label during node pool creation. [Kubernetes labels][kubernetes-labels] handle the scheduling rules for nodes. You can add labels to a node pool anytime and apply them to all nodes in the node pool. + +In this how-to guide, you learn how to use labels in an Azure Kubernetes Service (AKS) cluster. + +## Prerequisites + +You need the Azure CLI version 2.2.0 or later installed and configured. Run `az --version` to find the version. If you need to install or upgrade, see [Install Azure CLI][install-azure-cli]. + +## Create an AKS cluster with a label + +You can create an AKS cluster with node labels to set key/value metadata for workload scheduling. 
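The commands that follow reference a `$REGION` environment variable that isn't otherwise defined in this article, so set it first. The value below is only an example; substitute the Azure region you want to deploy to:

```bash
export REGION="eastus2"
```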
+ +```bash +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export RESOURCE_GROUP="myResourceGroup$RANDOM_SUFFIX" +export AKS_CLUSTER_NAME="myAKSCluster$RANDOM_SUFFIX" +az group create --name $RESOURCE_GROUP --location $REGION +``` + +Results: + + + +```output +{ + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx", + "location": "eastus2", + "managedBy": null, + "name": "myResourceGroupxxx", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" +} +``` + +Create the AKS cluster specifying node labels (e.g., dept=IT, costcenter=9000): + +```azurecli-interactive +az aks create \ + --resource-group $RESOURCE_GROUP \ + --name $AKS_CLUSTER_NAME \ + --node-count 2 \ + --nodepool-labels dept=IT costcenter=9000 \ + --generate-ssh-keys --location $REGION +``` + +Results: + + + +```output +{ + "aadProfile": null, + "addonProfiles": {}, + "agentPoolProfiles": [ + { + "count": 2, + "enableAutoScaling": null, + "mode": "System", + "name": "nodepool1", + "nodeLabels": { + "costcenter": "9000", + "dept": "IT" + } + } + ], + "dnsPrefix": "myaksclusterxxx-dns", + "fqdn": "myaksclusterxxx-xxxxxxxx.hcp.eastus2.azmk8s.io", + "id": "/subscriptions/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx/resourceGroups/myResourceGroupxxx/providers/Microsoft.ContainerService/managedClusters/myAKSClusterxxx", + "location": "eastus2", + "name": "myAKSClusterxxx", + "resourceGroup": "myResourceGroupxxx" +} +``` + +Verify the labels were set: + +```bash +az aks get-credentials --resource-group $RESOURCE_GROUP --name $AKS_CLUSTER_NAME --overwrite-existing +kubectl get nodes --show-labels | grep -e "costcenter=9000" -e "dept=IT" +``` + +## Create a node pool with a label + +You can create an additional node pool with labels for specific scheduling needs. + +```bash +export NODEPOOL_NAME="labelnp" +az aks nodepool add \ + --resource-group $RESOURCE_GROUP \ + --cluster-name $AKS_CLUSTER_NAME \ + --name $NODEPOOL_NAME \ + --node-count 1 \ + --labels dept=HR costcenter=5000 \ +``` + +The following is example output from the [`az aks nodepool list`][az-aks-nodepool-list] command showing the *labelnp* node pool is *Creating* nodes with the specified *nodeLabels*: + +```bash +az aks nodepool list --resource-group $RESOURCE_GROUP --cluster-name $AKS_CLUSTER_NAME +``` + +Results: + + + +```output +[ + { + "count": 2, + "name": "nodepool1", + "nodeLabels": { + "costcenter": "9000", + "dept": "IT" + } + }, + { + "count": 1, + "name": "labelnp", + "nodeLabels": { + "costcenter": "5000", + "dept": "HR" + }, + "provisioningState": "Creating" + } +] +``` + +Verify the labels were set: + +```bash +kubectl get nodes --show-labels | grep -e "costcenter=5000" -e "dept=HR" +``` + +## Updating labels on existing node pools + +You can update the labels on an existing node pool. Note: updating labels will overwrite the old labels. + +```bash +az aks nodepool update \ + --resource-group $RESOURCE_GROUP \ + --cluster-name $AKS_CLUSTER_NAME \ + --name $NODEPOOL_NAME \ + --labels dept=ACCT costcenter=6000 \ +``` + +Verify the new labels are set: + +```bash +kubectl get nodes --show-labels | grep -e "costcenter=6000" -e "dept=ACCT" +``` + +## Unavailable labels + +### Reserved system labels + +Since the [2021-08-19 AKS release][aks-release-2021-gh], AKS stopped the ability to make changes to AKS reserved labels. Attempting to change these labels results in an error message. + +The following labels are AKS reserved labels. 
*Virtual node usage* specifies if these labels could be a supported system feature on virtual nodes. Some properties that these system features change aren't available on the virtual nodes because they require modifying the host. + +| Label | Value | Example/Options | Virtual node usage | +| ---- | --- | --- | --- | +| kubernetes.azure.com/agentpool | \ | nodepool1 | Same | +| kubernetes.io/arch | amd64 | runtime.GOARCH | N/A | +| kubernetes.io/os | \ | Linux/Windows | Same | +| node.kubernetes.io/instance-type | \ | Standard_NC6s_v3 | Virtual | +| topology.kubernetes.io/region | \ | westus2 | Same | +| topology.kubernetes.io/zone | \ | 0 | Same | +| kubernetes.azure.com/cluster | \ | MC_aks_myAKSCluster_westus2 | Same | +| kubernetes.azure.com/managedby | aks | aks | N/A | +| kubernetes.azure.com/mode | \ | User or system | User | +| kubernetes.azure.com/role | agent | Agent | Same | +| kubernetes.azure.com/scalesetpriority | \ | Spot or regular | N/A | +| kubernetes.io/hostname | \ | aks-nodepool-00000000-vmss000000 | Same | +| kubernetes.azure.com/storageprofile | \ | Managed | N/A | +| kubernetes.azure.com/storagetier | \ | Premium_LRS | N/A | +| kubernetes.azure.com/instance-sku | \ | Standard_N | Virtual | +| kubernetes.azure.com/node-image-version | \ | AKSUbuntu-1804-2020.03.05 | Virtual node version | +| kubernetes.azure.com/subnet | \ | subnetName | Virtual node subnet name | +| kubernetes.azure.com/vnet | \ | vnetName | Virtual node virtual network | +| kubernetes.azure.com/ppg | \ | ppgName | N/A | +| kubernetes.azure.com/encrypted-set | \ | encrypted-set-name | N/A | +| kubernetes.azure.com/accelerator | \ | nvidia | N/A | +| kubernetes.azure.com/fips_enabled | \ | true | N/A | +| kubernetes.azure.com/os-sku | \ | [Create or update OS SKU][create-or-update-os-sku] | Linux | + +* *Same* is included in places where the expected values for the labels don't differ between a standard node pool and a virtual node pool. As virtual node pods don't expose any underlying virtual machine (VM), the VM SKU values are replaced with the SKU *Virtual*. +* *Virtual node version* refers to the current version of the [virtual Kubelet-ACI connector release][virtual-kubelet-release]. +* *Virtual node subnet name* is the name of the subnet where virtual node pods are deployed into Azure Container Instance (ACI). +* *Virtual node virtual network* is the name of the virtual network, which contains the subnet where virtual node pods are deployed on ACI. + +### Reserved prefixes + +The following prefixes are AKS reserved prefixes and can't be used for any node: + +* kubernetes.azure.com/ +* kubernetes.io/ + +For more information on reserved prefixes, see [Kubernetes well-known labels, annotations, and taints][kubernetes-well-known-labels]. + +### Deprecated labels + +The following labels are planned for deprecation with the release of [Kubernetes v1.24][aks-release-calendar]. You should change any label references to the recommended substitute. 
+ +| Label | Recommended substitute | Maintainer | +| --- | --- | --- | +| failure-domain.beta.kubernetes.io/region | topology.kubernetes.io/region | [Kubernetes][kubernetes-labels] +| failure-domain.beta.kubernetes.io/zone | topology.kubernetes.io/zone | [Kubernetes][kubernetes-labels] +| beta.kubernetes.io/arch | kubernetes.io/arch | [Kubernetes][kubernetes-labels] +| beta.kubernetes.io/instance-type | node.kubernetes.io/instance-type | [Kubernetes][kubernetes-labels] +| beta.kubernetes.io/os | kubernetes.io/os | [Kubernetes][kubernetes-labels] +| node-role.kubernetes.io/agent* | kubernetes.azure.com/role=agent | Azure Kubernetes Service +| kubernetes.io/role* | kubernetes.azure.com/role=agent | Azure Kubernetes Service +| Agentpool* | kubernetes.azure.com/agentpool | Azure Kubernetes Service +| Storageprofile* | kubernetes.azure.com/storageprofile | Azure Kubernetes Service +| Storagetier* | kubernetes.azure.com/storagetier | Azure Kubernetes Service +| Accelerator* | kubernetes.azure.com/accelerator | Azure Kubernetes Service + +*Newly deprecated. For more information, see the [Release Notes][aks-release-notes-gh]. + +## Next steps + +Learn more about Kubernetes labels in the [Kubernetes labels documentation][kubernetes-labels]. + + +[aks-release-2021-gh]: https://github.com/Azure/AKS/releases/tag/2021-08-19 +[aks-release-notes-gh]: https://github.com/Azure/AKS/releases +[kubernetes-labels]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +[kubernetes-label-syntax]: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set +[kubernetes-well-known-labels]: https://kubernetes.io/docs/reference/labels-annotations-taints/ +[virtual-kubelet-release]: https://github.com/virtual-kubelet/azure-aci/releases + + +[aks-release-calendar]: ./supported-kubernetes-versions.md#aks-kubernetes-release-calendar +[az-aks-create]: /cli/azure/aks#az-aks-create +[az-aks-nodepool-add]: /cli/azure/aks#az-aks-nodepool-add +[az-aks-nodepool-list]: /cli/azure/aks/nodepool#az-aks-nodepool-list +[az-aks-nodepool-update]: /cli/azure/aks/nodepool#az-aks-nodepool-update +[create-or-update-os-sku]: /rest/api/aks/agent-pools/create-or-update#ossku +[install-azure-cli]: /cli/azure/install-azure-cli \ No newline at end of file diff --git a/scenarios/azure-dev-docs/articles/ansible/vm-configure.md b/scenarios/azure-dev-docs/articles/ansible/vm-configure.md index e785a1230..8c6eca78a 100644 --- a/scenarios/azure-dev-docs/articles/ansible/vm-configure.md +++ b/scenarios/azure-dev-docs/articles/ansible/vm-configure.md @@ -1,10 +1,12 @@ --- -title: Create a Linux virtual machines in Azure using Ansible +title: Create a Linux virtual machines in Azure using Ansible description: Learn how to create a Linux virtual machine in Azure using Ansible keywords: ansible, azure, devops, virtual machine ms.topic: tutorial ms.date: 08/14/2024 -ms.custom: devx-track-ansible, linux-related-content +ms.custom: devx-track-ansible, linux-related-content, innovation-engine +author: +ms.author: --- # Create a Linux virtual machines in Azure using Ansible @@ -21,118 +23,135 @@ In this article, you learn how to: > * Create a virtual network interface card > * Create a virtual machine -## 1. 
Configure your environment +## Configure your environment -[!INCLUDE [open-source-devops-prereqs-azure-sub.md](../includes/open-source-devops-prereqs-azure-subscription.md)] -[!INCLUDE [ansible-prereqs-cloudshell-use-or-vm-creation1.md](includes/ansible-prereqs-cloudshell-use-or-vm-creation1.md)] +- **Azure subscription**: If you don't have an Azure subscription, create a [free account](https://azure.microsoft.com/free/?ref=microsoft.com&utm_source=microsoft.com&utm_medium=docs&utm_campaign=visualstudio) before you begin. +- **Install Ansible**: Do one of the following options: -## 2. Create an SSH key pair + - [Install](/azure/ansible/ansible-install-configure#install-ansible-on-an-azure-linux-virtual-machine) and [configure](/azure/ansible/ansible-install-configure#create-azure-credentials) Ansible on a Linux virtual machine + - [Configure Azure Cloud Shell](/azure/cloud-shell/quickstart) -1. Run the following command. When prompted, specify the files to be created in the following directory: `/home/azureuser/.ssh/authorized_keys`. +## Implement the Ansible playbook - ```bash - ssh-keygen -m PEM -t rsa -b 4096 - ``` +1. Create a directory in which to test and run the sample Ansible code and make it the current directory. -1. Copy the contents of the public key file. By default, the public key file is named `id_rsa.pub`. The value is a long string starting with "ssh-rsa ". You'll need this value in the next step. - -## 3. Implement the Ansible playbook +2. Create a file named main.yml and insert the following code. In the playbook below the resource group name and other relevant properties use environment variables so that they are unique for each run. -1. Create a directory in which to test and run the sample Ansible code and make it the current directory. +```bash +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export REGION="eastus2" +export MY_RESOURCE_GROUP="myResourceGroup$RANDOM_SUFFIX" +export MY_VM_NAME="myVM$RANDOM_SUFFIX" +export MY_VNET_NAME="myVnet$RANDOM_SUFFIX" +export MY_SUBNET_NAME="mySubnet$RANDOM_SUFFIX" +export MY_NIC_NAME="myNIC$RANDOM_SUFFIX" +export MY_PUBLIC_IP_NAME="myPublicIP$RANDOM_SUFFIX" +export MY_NSG_NAME="myNetworkSecurityGroup$RANDOM_SUFFIX" + +cat > main.yml <<'EOF' +- name: Create Azure VM + hosts: localhost + connection: local + tasks: + - name: Create resource group + azure_rm_resourcegroup: + name: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + location: "{{ lookup('env', 'REGION') }}" + - name: Create virtual network + azure_rm_virtualnetwork: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + name: "{{ lookup('env', 'MY_VNET_NAME') }}" + address_prefixes: "10.0.0.0/16" + - name: Add subnet + azure_rm_subnet: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + name: "{{ lookup('env', 'MY_SUBNET_NAME') }}" + address_prefix: "10.0.1.0/24" + virtual_network: "{{ lookup('env', 'MY_VNET_NAME') }}" + - name: Create public IP address + azure_rm_publicipaddress: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + allocation_method: Static + name: "{{ lookup('env', 'MY_PUBLIC_IP_NAME') }}" + register: output_ip_address + - name: Public IP of VM + debug: + msg: "The public IP is {{ output_ip_address.state.ip_address }}." 
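    # The remaining tasks create the NSG that allows SSH, a NIC attached to that NSG and subnet,
    # and finally the VM itself; all resource names come from the environment variables exported above.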
+ - name: Create Network Security Group that allows SSH + azure_rm_securitygroup: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + name: "{{ lookup('env', 'MY_NSG_NAME') }}" + rules: + - name: SSH + protocol: Tcp + destination_port_range: 22 + access: Allow + priority: 1001 + direction: Inbound + - name: Create virtual network interface card + azure_rm_networkinterface: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + name: "{{ lookup('env', 'MY_NIC_NAME') }}" + virtual_network: "{{ lookup('env', 'MY_VNET_NAME') }}" + subnet_name: "{{ lookup('env', 'MY_SUBNET_NAME') }}" + security_group: "{{ lookup('env', 'MY_NSG_NAME') }}" + ip_configurations: + - name: ipconfig1 + public_ip_address_name: "{{ lookup('env', 'MY_PUBLIC_IP_NAME') }}" + primary: yes + - name: Create VM + azure_rm_virtualmachine: + resource_group: "{{ lookup('env', 'MY_RESOURCE_GROUP') }}" + name: "{{ lookup('env', 'MY_VM_NAME') }}" + vm_size: Standard_DS1_v2 + admin_username: azureuser + ssh_password_enabled: false + generate_ssh_keys: yes # This will automatically generate keys if they don't exist + network_interfaces: "{{ lookup('env', 'MY_NIC_NAME') }}" + image: + offer: 0001-com-ubuntu-server-jammy + publisher: Canonical + sku: 22_04-lts + version: latest +EOF +``` -1. Create a file named `main.yml` and insert the following code. Replace the `` placeholder with the public key value from the previous step. - - ```yaml - - name: Create Azure VM - hosts: localhost - connection: local - tasks: - - name: Create resource group - azure_rm_resourcegroup: - name: myResourceGroup - location: eastus - - name: Create virtual network - azure_rm_virtualnetwork: - resource_group: myResourceGroup - name: myVnet - address_prefixes: "10.0.0.0/16" - - name: Add subnet - azure_rm_subnet: - resource_group: myResourceGroup - name: mySubnet - address_prefix: "10.0.1.0/24" - virtual_network: myVnet - - name: Create public IP address - azure_rm_publicipaddress: - resource_group: myResourceGroup - allocation_method: Static - name: myPublicIP - register: output_ip_address - - name: Public IP of VM - debug: - msg: "The public IP is {{ output_ip_address.state.ip_address }}." - - name: Create Network Security Group that allows SSH - azure_rm_securitygroup: - resource_group: myResourceGroup - name: myNetworkSecurityGroup - rules: - - name: SSH - protocol: Tcp - destination_port_range: 22 - access: Allow - priority: 1001 - direction: Inbound - - name: Create virtual network interface card - azure_rm_networkinterface: - resource_group: myResourceGroup - name: myNIC - virtual_network: myVnet - subnet: mySubnet - public_ip_name: myPublicIP - security_group: myNetworkSecurityGroup - - name: Create VM - azure_rm_virtualmachine: - resource_group: myResourceGroup - name: myVM - vm_size: Standard_DS1_v2 - admin_username: azureuser - ssh_password_enabled: false - ssh_public_keys: - - path: /home/azureuser/.ssh/authorized_keys - key_data: "" - network_interfaces: myNIC - image: - offer: 0001-com-ubuntu-server-jammy - publisher: Canonical - sku: 22_04-lts - version: latest - ``` - -## 4. Run the playbook - -[!INCLUDE [ansible-playbook.md](includes/ansible-playbook.md)] - -## 5. Verify the results - -Run [az vm list](/cli/azure/vm#az-vm-list) to verify the VM was created. - - ```azurecli - az vm list -d -o table --query "[?name=='myVM']" - ``` - -## 6. Connect to the VM - -Run the SSH command to connect to your new Linux VM. Replace the <ip-address> placeholder with the IP address from the previous step. 
+## Run the playbook + +Run the Ansible playbook using the ansible-playbook command. ```bash -ssh azureuser@ -i /home/azureuser/.ssh/authorized_keys/id_rsa +ansible-playbook main.yml +``` + +## Verify the results + +Run the following command to verify the VM was created. This command filters the VMs by name. + +```azurecli +az vm list -d -o table --query "[?name=='${MY_VM_NAME}']" +``` + + +```JSON +[ + { + "name": "myVM", + "powerState": "running", + "publicIps": "xxx.xxx.xxx.xxx" + } +] ``` -## Clean up resources +## Connect to the VM -[!INCLUDE [ansible-delete-resource-group.md](includes/ansible-delete-resource-group.md)] +Run the SSH command to connect to your new Linux VM. Replace the placeholder with the IP address obtained from the previous step. + +```bash +ssh -o StrictHostKeyChecking=no azureuser@$MY_PUBLIC_IP_NAME +``` ## Next steps -> [!div class="nextstepaction"] +> [!div class="nextstepaction"] > [Manage a Linux virtual machine in Azure using Ansible](./vm-manage.md) \ No newline at end of file diff --git a/scenarios/azure-docs/articles/iot-edge/quickstart-linux.md b/scenarios/azure-docs/articles/iot-edge/quickstart-linux.md new file mode 100644 index 000000000..ab473661a --- /dev/null +++ b/scenarios/azure-docs/articles/iot-edge/quickstart-linux.md @@ -0,0 +1,429 @@ +--- +title: "Quickstart: Create an Azure IoT Edge Device on Linux" +description: Learn to configure an Azure IoT Edge device on Linux. This guide walks you through creating an IoT Hub, registering a device, and deploying a simulated sensor module. +#customer intent: As a developer, I want to create an IoT Edge device on Linux so that I can deploy and test containerized modules. +author: PatAltimore +ms.author: patricka +ms.date: 03/27/2025 +ms.topic: quickstart +ms.service: azure-iot-edge +services: iot-edge +ms.custom: mvc, devx-track-azurecli, mode-other, linux-related-content +--- + +## Environment Variables + +In this section we declare environment variables that will be used throughout the Exec Doc. A random suffix is appended to resource names that must be unique for each deployment. + +```bash +export RANDOM_SUFFIX=$(openssl rand -hex 3) +export REGION="eastus2" +export RESOURCE_GROUP="IoTEdgeResources$RANDOM_SUFFIX" +export IOTHUB_NAME="UniqueIoTHub$RANDOM_SUFFIX" +export VM_NAME="myvm$RANDOM_SUFFIX" +``` + +# Quickstart: Deploy your first IoT Edge module to a virtual Linux device + +[!INCLUDE [iot-edge-version-all-supported](includes/iot-edge-version-all-supported.md)] + +Try Azure IoT Edge in this quickstart by deploying containerized code to a virtual Linux IoT Edge device. IoT Edge lets you remotely manage code on your devices so you can send more of your workloads to the edge. For this quickstart, use an Azure virtual machine for your IoT Edge device. It lets you quickly create a test machine and delete it when you're done. + +In this quickstart, you learn how to: + +* Create an IoT Hub. +* Register an IoT Edge device to your IoT hub. +* Install and start the IoT Edge runtime on a virtual device. +* Deploy a module remotely to an IoT Edge device. + +:::image type="content" source="./media/quickstart-linux/install-edge-full.png" alt-text="Diagram of Quickstart architecture for device and cloud."::: + +This quickstart walks you through creating a Linux virtual machine that's configured to be an IoT Edge device. Then, you deploy a module from the Azure portal to your device. This quickstart uses a simulated sensor module that generates temperature, humidity, and pressure data. 
The other Azure IoT Edge tutorials build upon the work you do here by deploying additional modules that analyze the simulated data for business insights. + +If you don't have an active Azure subscription, create a [free account](https://azure.microsoft.com/free) before you begin. + +## Prerequisites + +Set up your environment for the Azure CLI. + +[!INCLUDE [azure-cli-prepare-your-environment-no-header.md](~/reusable-content/azure-cli/azure-cli-prepare-your-environment-no-header.md)] + +## Create a resource group + +A resource group to manage all the resources you use in this quickstart. This quickstart and the following tutorials use the example resource group name **IoTEdgeResources** with a randomized suffix. + + ```azurecli-interactive + az group create --name $RESOURCE_GROUP --location $REGION + ``` +Results: + + +```JSON +{ + "id": "/subscriptions/xxxxx-xxxxx-xxxxx-xxxxx/resourceGroups/IoTEdgeResourcesabcd12", + "location": "westus2", + "managedBy": null, + "name": "IoTEdgeResourcesabcd12", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null, + "type": "Microsoft.Resources/resourceGroups" +} +``` + +## Create an IoT Hub + +Start the quickstart by creating an IoT Hub with the Azure CLI. + +:::image type="content" source="./media/quickstart-linux/create-iot-hub.png" alt-text="Diagram that shows how to create an IoT Hub in the cloud."::: + +The free tier of IoT Hub works for this quickstart. If you've used IoT Hub in the past and already have a hub created, you can use that IoT hub. + +The following code creates a **S1** hub in the resource group. Replace the placeholder with your preferred IoT Hub name if desired – here we use the environment variable $IOTHUB_NAME. Creating an IoT Hub might take a few minutes. + + ```azurecli-interactive + az iot hub create --resource-group $RESOURCE_GROUP --name $IOTHUB_NAME --sku S1 --partition-count 2 + ``` +Results: + + +```JSON +{ + "name": "UniqueIoTHubabcd12", + "sku": "S1", + "resourceGroup": "IoTEdgeResourcesabcd12", + "location": "westus2", + "state": "Active", + "skuCapacity": 1 +} +``` + +If you use F1 (the free tier), you can only create one IoT Hub per subscription. If you try to create a second hub, you'll receive an error message. In such a case, change the SKU to **S1**. Each subscription can only have one free IoT hub. If you get an error that the IoT Hub name isn't available, it means that someone else already has a hub with that name. Try a new name. + +## Register an IoT Edge device + +Register an IoT Edge device with the IoT hub you just created. + +:::image type="content" source="./media/quickstart-linux/register-device.png" alt-text="Diagram of how to register a device with an IoT Hub identity."::: + +Create a device identity for your IoT Edge device so that it can communicate with your IoT hub. The device identity lives in the cloud, and you use a unique device connection string to associate a physical device to a device identity. + +Because IoT Edge devices behave and are managed differently from typical IoT devices, declare this identity as an IoT Edge device using the --edge-enabled flag. + +1. Enter the following command in Azure Cloud Shell to create a device named **myEdgeDevice** in your hub. 
+ + ```azurecli-interactive + az config set extension.use_dynamic_install=yes_without_prompt + az iot hub device-identity create --device-id myEdgeDevice --edge-enabled --hub-name $IOTHUB_NAME + ``` +Results: + + +```JSON +{ + "deviceId": "myEdgeDevice", + "generationId": "xxxxxxxx", + "status": "enabled", + "connectionState": "Disconnected", + "statusReason": null, + "connectionStateUpdatedTime": null, + "statusUpdatedTime": "2025-03-27T00:00:00.000Z", + "lastActivityTime": null, + "cloudToDeviceMessageCount": 0, + "authentication": { + "symmetricKey": { + "primaryKey": "xxxxxxxxxxxxxxxx==", + "secondaryKey": "xxxxxxxxxxxxxxxx==" + }, + "type": "sas" + }, + "capabilities": { + "iotEdge": true + }, + "etag": "xxxxxxxxxxxxxx" +} +``` + +2. Check the connection string for your device, which links the physical device to its identity in IoT Hub. It includes the name of your IoT Hub, the name of your device, and a shared key that authenticates connections between them. You use this connection string again in the next section to set up your IoT Edge device. + + ```azurecli-interactive + az iot hub device-identity connection-string show --device-id myEdgeDevice --hub-name $IOTHUB_NAME + ``` +Results: + + +```JSON +{ + "connectionString": "HostName=UniqueIoTHubabcd12.azure-devices.net;DeviceId=myEdgeDevice;SharedAccessKey=xxxxxxxxxxxxxxxxxxxxxxx" +} +``` + +For example, the connection string should look similar to +HostName=contoso-hub.azure-devices.net;DeviceId=myEdgeDevice;SharedAccessKey=. + +## Configure your IoT Edge device + +Create a virtual machine with the Azure IoT Edge runtime. + +:::image type="content" source="./media/quickstart-linux/start-runtime.png" alt-text="Diagram of how to start the runtime on a device."::: + +The IoT Edge runtime is deployed on all IoT Edge devices and has three components. The IoT Edge security daemon starts each time an IoT Edge device boots and bootstraps the device by starting the IoT Edge agent. The IoT Edge agent facilitates deployment and monitoring of modules on the IoT Edge device, including the IoT Edge hub. The IoT Edge hub manages communications between modules on the IoT Edge device, and between the device and IoT Hub. + +During runtime configuration, provide a device connection string. This string is retrieved from the Azure CLI. This string associates your physical device with the IoT Edge device identity in Azure. + +### Deploy the IoT Edge device + +This section uses an Azure Resource Manager template to create a new virtual machine and install the IoT Edge runtime on it. If you want to use your own Linux device instead, you can follow the installation steps in [Manually provision a single Linux IoT Edge device](how-to-provision-single-device-linux-symmetric.md), then return to this quickstart. + +Use the Deploy to Azure button or CLI commands to create an IoT Edge device based on the prebuilt [iotedge-vm-deploy](https://github.com/Azure/iotedge-vm-deploy) template. + +* Deploy using the IoT Edge Azure Resource Manager template. + + [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fazure%2Fiotedge-vm-deploy%2Fmain%2FedgeDeploy.json) + +* For bash or Cloud Shell users, first create a file named **cloud-init.txt** in your current working directory. 
This file contains the configuration for the IoT Edge runtime: + + ```text + #cloud-config + package_update: true + package_upgrade: true + runcmd: + - curl https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb > packages-microsoft-prod.deb + - dpkg -i packages-microsoft-prod.deb + - apt-get update + - apt-get install aziot-edge -y + - | + CONNECTION_STRING="$(az iot hub device-identity connection-string show \ + --device-id myEdgeDevice \ + --hub-name $IOTHUB_NAME \ + -o tsv)" + iotedge config mp --connection-string "$CONNECTION_STRING" + iotedge config apply + ``` + +* Then, copy the following command into a text editor, replace the placeholder text with your information, then copy into your bash or Cloud Shell window: + + ```azurecli-interactive + az vm create \ + --resource-group $RESOURCE_GROUP \ + --name $VM_NAME \ + --image Ubuntu2204 \ + --admin-username azureuser \ + --generate-ssh-keys \ + --custom-data cloud-init.txt + ``` + +* For PowerShell users, copy the following command into your PowerShell window, then replace the placeholder text with your own information: + + ```powershell + az deployment group create ` + --resource-group $RESOURCE_GROUP ` + --template-uri "https://raw.githubusercontent.com/Azure/iotedge-vm-deploy/main/edgeDeploy.json" ` + --parameters dnsLabelPrefix="$VM_NAME" ` + --parameters adminUsername='azureUser' ` + --parameters deviceConnectionString=$(az iot hub device-identity connection-string show --device-id myEdgeDevice --hub-name $IOTHUB_NAME -o tsv) ` + --parameters authenticationType='password' ` + --parameters adminPasswordOrKey="" + ``` + +This template takes the following parameters: + +| Parameter | Description | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **resource-group** | The resource group in which the resources are created. Use the default **IoTEdgeResources** that we've been using throughout this article or provide the name of an existing resource group in your subscription. | +| **template-uri** | A pointer to the Resource Manager template that we're using. | +| **dnsLabelPrefix** | A string that is used to create the virtual machine's hostname. Replace the placeholder text with a name for your virtual machine. | +| **adminUsername** | A username for the admin account of the virtual machine. Use the example **azureUser** or provide a new username. | +| **deviceConnectionString** | The connection string from the device identity in IoT Hub, which is used to configure the IoT Edge runtime on the virtual machine. The CLI command within this parameter grabs the connection string for you. Replace the placeholder text with your IoT hub name. | +| **authenticationType** | The authentication method for the admin account. This quickstart uses **password** authentication, but you can also set this parameter to **sshPublicKey**. | +| **adminPasswordOrKey** | The password or value of the SSH key for the admin account. Replace the placeholder text with a secure password. Your password must be at least 12 characters long and have three of four of the following: lowercase characters, uppercase characters, digits, and special characters. | + +After deployment completes, JSON-formatted output in the CLI contains the SSH information to connect to the virtual machine. The output includes the public IP address of the virtual machine, which you can use to connect to it. 
+ + ```bash + export IP_ADDRESS=$(az vm show -d -g $RESOURCE_GROUP -n $VM_NAME --query publicIps -o tsv) + + ssh azureuser@$IP_ADDRESS -o StrictHostKeyChecking=no + ``` + +### View the IoT Edge runtime status + +The rest of the commands in this quickstart take place on your IoT Edge device itself, so that you can see what's happening on the device. If you're using a virtual machine, connect to that machine now using the admin username that you set up and the DNS name that was output by the deployment command. You can also find the DNS name on your virtual machine's overview page in the Azure portal. Use the following command to connect to your virtual machine. Replace and with your own values. + + ```text + ssh @ + ``` + +Once connected to your virtual machine, verify that the runtime was successfully installed and configured on your IoT Edge device. + +1. Check if IoT Edge is running. The following command returns a status of **Ok** if IoT Edge is running or provides any service errors. + + ```bash + sudo iotedge system status + ``` +Results: + + +```text +Status: Ok +``` + + >[!TIP] + >You need elevated privileges to run iotedge commands. Once you sign out of your machine and sign back in the first time after installing the IoT Edge runtime, your permissions are automatically updated. Until then, use sudo in front of the commands. + +2. If you need to troubleshoot the service, retrieve the service logs. + + ```bash + sudo iotedge system logs + ``` +Results: + + +```text +... (service log output redacted for brevity) ... +``` + +3. View all the modules running on your IoT Edge device. Since the service just started for the first time, you should only see the **edgeAgent** module running. The edgeAgent module runs by default and helps to install and start any additional modules that you deploy to your device. + + ```bash + sudo iotedge list + ``` +Results: + + +```JSON +[ + { + "Name": "$edgeAgent", + "Status": "running" + } +] +``` + +Your IoT Edge device is now configured. It's ready to run cloud-deployed modules. + +## Deploy a module + +Manage your Azure IoT Edge device from the cloud to deploy a module that sends device telemetry data to IoT Hub. + +:::image type="content" source="./media/quickstart-linux/deploy-module.png" alt-text="Diagram of how to deploy a module from cloud to device."::: + +A key capability of Azure IoT Edge is deploying code to your IoT Edge devices from the cloud. IoT Edge modules are executable packages implemented as containers. In this section, you deploy a pre-built module from the [IoT Edge Modules section of Microsoft Artifact Registry](https://mcr.microsoft.com/catalog?cat=IoT%20Edge%20Modules&alphaSort=asc&alphaSortKey=Name). + +The module that you deploy in this section simulates a sensor and sends generated data. This module is a useful piece of code when you're getting started with IoT Edge because you can use the simulated data for development and testing. If you want to see exactly what this module does, you can view the [simulated temperature sensor source code](https://github.com/Azure/iotedge/blob/main/edge-modules/SimulatedTemperatureSensor/src/Program.cs). + +Use these steps to deploy your first module. + +1. Sign in to the [Azure portal](https://portal.azure.com) and go to your IoT Hub. + +2. From the menu on the left, under **Device Management**, select **Devices**. + +3. Select the device ID of the target IoT Edge device from the list. 
+ + When you create a new IoT Edge device, it displays the status code 417 -- The device's deployment configuration is not set in the Azure portal. This status is normal, and means that the device is ready to receive a module deployment. + +4. On the upper bar, select **Set Modules**. + + Select the modules you want to run on your device. You can choose from modules that you've built yourself or images in a container registry. In this quickstart, you deploy a module from the Microsoft container registry. + +5. In the **IoT Edge modules** section, select **Add** then choose **IoT Edge Module**. + +6. Update the following module settings: + + | Setting | Value | + |--------------------|----------------------------------------------------------------------| + | IoT Module name | SimulatedTemperatureSensor | + | Image URI | mcr.microsoft.com/azureiotedge-simulated-temperature-sensor:latest | + | Restart policy | always | + | Desired status | running | + +7. Select **Next: Routes** to continue to configure routes. + +8. Add a route that sends all messages from the simulated temperature module to IoT Hub. + + | Setting | Value | + |------------|--------------------------------------------| + | Name | SimulatedTemperatureSensorToIoTHub | + | Value | FROM /messages/modules/SimulatedTemperatureSensor/* INTO $upstream | + +9. Select **Next: Review + create**. + +10. Review the JSON file, and then select **Create**. The JSON file defines all the modules that you deploy to your IoT Edge device. + + > [!NOTE] + > When you submit a new deployment to an IoT Edge device, nothing is pushed to your device. Instead, the device queries IoT Hub regularly for any new instructions. If the device finds an updated deployment manifest, it uses the information about the new deployment to pull the module images from the cloud then starts running the modules locally. This process can take a few minutes. + +After you create the module deployment details, the wizard returns you to the device details page. View the deployment status on the **Modules** tab. + +You should see three modules: **$edgeAgent**, **$edgeHub**, and **SimulatedTemperatureSensor**. If one or more of the modules has **Yes** under **Specified in Deployment** but not under **Reported by Device**, your IoT Edge device is still starting them. Wait a few minutes and refresh the page. + +:::image type="content" source="./media/quickstart-linux/view-deployed-modules.png" alt-text="Screenshot that shows the SimulatedTemperatureSensor in the list of deployed modules." lightbox="./media/quickstart-linux/view-deployed-modules.png"::: + +If you have issues deploying modules, learn more in [Troubleshoot IoT Edge devices from the Azure portal](troubleshoot-in-portal.md). + +## View generated data + +In this quickstart, you create a new IoT Edge device and install the IoT Edge runtime on it. Then, you use the Azure portal to deploy an IoT Edge module to run on the device without making changes to the device itself. + +In this case, the module that you pushed generates sample environment data that you can use for testing later. The simulated sensor is monitoring both a machine and the environment around the machine. For example, this sensor can be in a server room, on a factory floor, or on a wind turbine. The message includes ambient temperature and humidity, machine temperature and pressure, and a timestamp. The IoT Edge tutorials use the data created by this module as test data for analytics. 
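You can also watch the telemetry as it arrives at your IoT hub, without connecting to the device at all. This optional check uses the Azure CLI IoT extension (installed automatically because dynamic extension install was enabled earlier in this quickstart); press Ctrl+C to stop monitoring:

```bash
# Stream device-to-cloud messages arriving at the IoT hub from the edge device
az iot hub monitor-events --hub-name $IOTHUB_NAME --device-id myEdgeDevice
```

The rest of this section verifies the same data directly on the device.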
+ +Open the command prompt on your IoT Edge device, or use the SSH connection from Azure CLI. Confirm that the module you deployed from the cloud is running on your IoT Edge device: + +```bash +sudo iotedge list +``` +Results: + + +```JSON +[ + { + "Name": "$edgeAgent", + "Status": "running" + }, + { + "Name": "$edgeHub", + "Status": "running" + }, + { + "Name": "SimulatedTemperatureSensor", + "Status": "running" + } +] +``` + +View the messages sent from the temperature sensor module: + +```bash +sudo iotedge logs SimulatedTemperatureSensor -f +``` +Results: + + +```text +... (sample sensor data output redacted for brevity) ... +``` + +>[!TIP] +>IoT Edge commands are case sensitive when referring to module names. + +## Clean up resources + +To continue with the IoT Edge tutorials, use the device you registered and set up in this quickstart. Otherwise, delete the Azure resources you created to avoid charges. + +If you created your virtual machine and IoT hub in a new resource group, you can delete that group and all the associated resources. Double-check the contents of the resource group to ensure there's nothing you want to keep. If you don't want to delete the whole group, you can delete individual resources instead. + +> [!IMPORTANT] +> Deleting a resource group is irreversible. + +(The deletion commands have been removed from this Exec Doc to avoid accidental deletion during automated execution.) + +## Next steps + +In this quickstart, you created an IoT Edge device and used the Azure IoT Edge cloud interface to deploy code onto the device. Now, you use a test device that generates raw data about its environment. + +In the next tutorial, you'll learn how to monitor the activity and health of your device from the Azure portal. + +> [!div class="nextstepaction"] +> [Monitor IoT Edge devices](tutorial-monitor-with-workbooks.md) \ No newline at end of file diff --git a/scenarios/metadata.json b/scenarios/metadata.json index cd2ab3685..5b18a3291 100644 --- a/scenarios/metadata.json +++ b/scenarios/metadata.json @@ -154,8 +154,7 @@ "url": "https://learn.microsoft.com/en-us/azure/static-web-apps/add-api" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -179,8 +178,7 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machine-scale-sets/tutorial-autoscale-cli" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -249,10 +247,8 @@ "title": "Secure your Linux VM", "url": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/tutorial-secure-vm" } - ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -276,8 +272,7 @@ "url": "https://go.microsoft.com/fwlink/p/?linkid=2259865" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -305,8 +300,7 @@ "url": "https://learn.microsoft.com/azure/aks/tutorial-kubernetes-app-update?tabs=azure-cli" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -334,8 +328,7 @@ "url": "https://learn.microsoft.com/en-us/azure/load-balancer/quickstart-load-balancer-standard-public-cli" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -402,19 +395,19 @@ "configurations": { "permissions": [], "configurableParams": [ - { - "inputType": "textInput", - "commandKey": "MY_RESOURCE_GROUP_NAME", - "title": "Resource Group Name", - "defaultValue": "" - }, - { - "inputType": "textInput", - "commandKey": "MY_VM_NAME", - "title": "VM Name", - "defaultValue": "" - } - ] + { + "inputType": 
"textInput", + "commandKey": "MY_RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "MY_VM_NAME", + "title": "VM Name", + "defaultValue": "" + } + ] } }, { @@ -430,10 +423,8 @@ "title": "Deploy a highly available PostgreSQL database on AKS with Azure CLI", "url": "https://learn.microsoft.com/en-us/azure/aks/deploy-postgresql-ha?tabs=helm" } - ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", @@ -448,10 +439,8 @@ "title": "Deploy a highly available PostgreSQL database on AKS with Azure CLI", "url": "https://learn.microsoft.com/en-us/azure/aks/deploy-postgresql-ha?tabs=helm" } - ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", @@ -479,12 +468,11 @@ "url": "https://learn.microsoft.com/azure/ai-services/computer-vision/" } ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", - "key": "DeployHAPGonARO/deploy-ha-pg-on-aro.md", + "key": "DeployHAPGOnARO/deploy-ha-pg-on-aro.md", "title": "Create a Highly Available PostgreSQL Cluster on Azure Red Hat OpenShift", "description": "This tutorial shows how to create a Highly Available PostgreSQL cluster on Azure Red Hat OpenShift (ARO) using the CloudNativePG operator", "stackDetails": "", @@ -495,10 +483,8 @@ "title": "Deploy a highly available PostgreSQL database on AKS with Azure CLI", "url": "https://learn.microsoft.com/en-us/azure/aks/deploy-postgresql-ha?tabs=helm" } - ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", @@ -578,7 +564,6 @@ "title": "Deploy a Valkey cluster on Azure Kubernetes Service (AKS)", "url": "https://learn.microsoft.com/en-us/azure/aks/valkey-overview" } - ], "configurations": { "permissions": [], @@ -692,8 +677,7 @@ } ], "configurations": { - "permissions": [ - ], + "permissions": [], "configurableParams": [ { "inputType": "textInput", @@ -715,8 +699,7 @@ "key": "CreateLinuxVMSecureWebServer/create-linux-vm-secure-web-server.md", "title": "Create a NGINX Webserver Secured via HTTPS", "description": "This tutorial shows how to create a NGINX Webserver Secured via HTTPS.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/CreateLinuxVMSecureWebServer/create-linux-vm-secure-web-server.md", "documentationUrl": "", "nextSteps": [ @@ -733,8 +716,7 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/tutorial-secure-vm" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -771,7 +753,6 @@ "title": "Azure Linux Container Host tutorial", "url": "https://github.com/MicrosoftDocs/azure-management-docs/blob/main/articles/azure-linux/tutorial-azure-linux-create-cluster.md" } - ], "configurations": { "permissions": [] @@ -790,7 +771,6 @@ "title": "Deploy applications to your scale sets", "url": "https://learn.microsoft.com/en-us/azure/virtual-machine-scale-sets/tutorial-install-apps-cli" } - ], "configurations": { "permissions": [] @@ -809,7 +789,6 @@ "title": "How Accelerated Networking works in Linux and FreeBSD VMs", "url": "https://learn.microsoft.com/en-us/azure/virtual-network/accelerated-networking-how-it-works" } - ], "configurations": { "permissions": [] @@ -832,7 +811,6 @@ "title": "Deploy a highly available PostgreSQL database on AKS with Azure CLI", "url": "https://learn.microsoft.com/en-us/azure/aks/deploy-postgresql-ha?tabs=helm" } - ], "configurations": { "permissions": [] @@ -851,7 +829,6 @@ 
"title": "Use Microsoft Entra Workload ID with Azure Kubernetes Service (AKS)", "url": "https://learn.microsoft.com/en-us/azure/aks/workload-identity-overview" } - ], "configurations": { "permissions": [], @@ -953,20 +930,20 @@ "configurations": { "permissions": [], "configurableParams": [ - { - "inputType": "textInput", - "commandKey": "MY_RESOURCE_GROUP_NAME", - "title": "Resource Group Name", - "defaultValue": "" - }, - { - "inputType": "textInput", - "commandKey": "MY_VM_NAME", - "title": "VM Name", - "defaultValue": "" - } - ] - } + { + "inputType": "textInput", + "commandKey": "MY_RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "MY_VM_NAME", + "title": "VM Name", + "defaultValue": "" + } + ] + } }, { "status": "active", @@ -985,20 +962,20 @@ "configurations": { "permissions": [], "configurableParams": [ - { - "inputType": "textInput", - "commandKey": "MY_RESOURCE_GROUP_NAME", - "title": "Resource Group Name", - "defaultValue": "" - }, - { - "inputType": "textInput", - "commandKey": "MY_VM_NAME", - "title": "VM Name", - "defaultValue": "" - } - ] - } + { + "inputType": "textInput", + "commandKey": "MY_RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "MY_VM_NAME", + "title": "VM Name", + "defaultValue": "" + } + ] + } }, { "status": "active", @@ -1017,28 +994,27 @@ "configurations": { "permissions": [], "configurableParams": [ - { - "inputType": "textInput", - "commandKey": "MY_RESOURCE_GROUP_NAME", - "title": "Resource Group Name", - "defaultValue": "" - }, - { - "inputType": "textInput", - "commandKey": "MY_VM_NAME", - "title": "VM Name", - "defaultValue": "" - } - ] - } + { + "inputType": "textInput", + "commandKey": "MY_RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "MY_VM_NAME", + "title": "VM Name", + "defaultValue": "" + } + ] + } }, { "status": "active", "key": "azure-docs/articles/batch/quick-create-cli.md", "title": "Quickstart: Use the Azure CLI to create a Batch account and run a job", "description": "Follow this quickstart to use the Azure CLI to create a Batch account, a pool of compute nodes, and a job that runs basic tasks on the pool.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-docs/articles/batch/quick-create-cli.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/batch/quick-create-cli", "nextSteps": [ @@ -1047,16 +1023,14 @@ "url": "https://learn.microsoft.com/en-us/azure/batch/tutorial-parallel-python" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/virtual-machines/linux/tutorial-manage-vm.md", "title": "Tutorial - Create and manage Linux VMs with the Azure CLI", "description": "In this tutorial, you learn how to use the Azure CLI to create and manage Linux VMs in Azure", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machines/linux/tutorial-manage-vm.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/tutorial-manage-vm", "nextSteps": [ @@ -1065,30 +1039,25 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/tutorial-manage-disks" } ], - "configurations": 
{ - } + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/virtual-machine-scale-sets/tutorial-autoscale-cli.md", "title": "Tutorial - Autoscale a scale set with the Azure CLI", "description": "Learn how to use the Azure CLI to automatically scale a Virtual Machine Scale Set as CPU demands increases and decreases", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machine-scale-sets/tutorial-autoscale-cli.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machine-scale-sets/tutorial-autoscale-cli?tabs=Ubuntu", - "nextSteps": [ - ], - "configurations": { - } + "nextSteps": [], + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/virtual-machine-scale-sets/tutorial-modify-scale-sets-cli.md", "title": "Modify an Azure Virtual Machine Scale Set using Azure CLI", "description": "Learn how to modify and update an Azure Virtual Machine Scale Set using Azure CLI", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machine-scale-sets/tutorial-modify-scale-sets-cli.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machine-scale-sets/tutorial-modify-scale-sets-cli", "nextSteps": [ @@ -1097,16 +1066,14 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machine-scale-sets/tutorial-use-disks-powershell" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/virtual-machines/disks-enable-performance.md", "title": "Preview - Increase performance of Premium SSDs and Standard SSD/HDDs", "description": "Increase the performance of Azure Premium SSDs and Standard SSD/HDDs using performance plus.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machines/disks-enable-performance.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machines/disks-enable-performance?tabs=azure-cli", "nextSteps": [ @@ -1119,16 +1086,14 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/expand-disks" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/container-instances/container-instances-vnet.md", "title": "Deploy container group to Azure virtual network", "description": "Learn how to deploy a container group to a new or existing Azure virtual network via the Azure CLI.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/container-instances/container-instances-vnet.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/container-instances/container-instances-vnet", "nextSteps": [ @@ -1141,16 +1106,14 @@ "url": "https://learn.microsoft.com/en-us/azure/container-instances/using-azure-container-registry-mi" } ], - "configurations": { - } + "configurations": {} }, { "status": "active", "key": "azure-compute-docs/articles/virtual-machines/linux/multiple-nics.md", "title": "Create a Linux VM in Azure with multiple NICs", "description": "Learn how to create a Linux VM with multiple NICs attached to it using the Azure CLI or Resource Manager templates.", - 
"stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machines/linux/multiple-nics.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/multiple-nics", "nextSteps": [ @@ -1163,16 +1126,14 @@ "url": "https://learn.microsoft.com/en-us/azure/security-center/security-center-just-in-time" } ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", "key": "azure-compute-docs/articles/virtual-machines/linux/quick-create-terraform/quick-create-terraform.md", "title": "Quickstart: Use Terraform to create a Linux VM", "description": "In this quickstart, you learn how to use Terraform to create a Linux virtual machine.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-compute-docs/articles/virtual-machines/linux/quick-create-terraform/quick-create-terraform.md", "documentationUrl": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-terraform?tabs=azure-cli", "nextSteps": [ @@ -1185,8 +1146,7 @@ "url": "https://learn.microsoft.com/en-us/azure/virtual-machines/linux/tutorial-manage-vm" } ], - "configurations": { - } + "configurations": {} }, { "status": "inactive", @@ -1206,12 +1166,10 @@ "key": "upstream/FlatcarOnAzure/flatcar-on-azure.md", "title": "Running Flatcar Container Linux on Microsoft Azure", "description": "Deploy Flatcar Container Linux in Microsoft Azure by creating resource groups and using official marketplace images.", - "stackDetails": [ - ], + "stackDetails": [], "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/upstream/FlatcarOnAzure/flatcar-on-azure.md", "documentationUrl": "https://www.flatcar.org/docs/latest/installing/cloud/azure/", - "configurations": { - } + "configurations": {} }, { "status": "active", @@ -1411,7 +1369,7 @@ }, { "title": "Use an internal load balancer with Azure Container Service (AKS)", - "url": "https://learn.microsoft.com/en-us/azure/aks/internal-lb" + "url": "https://learn.microsoft.com/en-us/azure/aks/internal-lb" }, { "title": "Create a basic ingress controller with external network connectivity", @@ -1850,5 +1808,931 @@ } ] } + }, + { + "status": "inactive", + "key": "UseIGOnAKS/use-ig-on-aks.md", + "title": "Comprehensive Guide to Using Inspektor Gadget in Kubernetes", + "description": "This Exec Doc provides a detailed walkthrough of a shell script that demonstrates various operations with the Inspektor Gadget in a Kubernetes environment. 
It explains each functional block, how the gadget plugin is installed, deployed, and used to run examples, export metrics, and verify configurations.",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/UseIGOnAKS/use-ig-on-aks.md",
+      "documentationUrl": "",
+      "nextSteps": [
+        {
+          "title": "Real-world scenarios where Inspektor Gadget can help you",
+          "url": "https://go.microsoft.com/fwlink/p/?linkid=2260402#use-cases"
+        },
+        {
+          "title": "Explore the available gadgets",
+          "url": "https://go.microsoft.com/fwlink/p/?linkid=2260070"
+        },
+        {
+          "title": "Run your own eBPF program",
+          "url": "https://go.microsoft.com/fwlink/p/?linkid=2259865"
+        }
+      ],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": [
+          {
+            "inputType": "textInput",
+            "commandKey": "RESOURCE_GROUP",
+            "title": "Resource Group Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "CLUSTER_NAME",
+            "title": "AKS Cluster Name",
+            "defaultValue": ""
+          }
+        ]
+      }
+    },
+    {
+      "status": "inactive",
+      "key": "azure-docs/articles/iot-edge/quickstart-linux.md",
+      "title": "Quickstart: Create an Azure IoT Edge Device on Linux",
+      "description": "Learn to configure an Azure IoT Edge device on Linux. This guide walks you through creating an IoT Hub, registering a device, and deploying a simulated sensor module.",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-docs/articles/iot-edge/quickstart-linux.md",
+      "documentationUrl": "",
+      "nextSteps": [
+        {
+          "title": "Monitor IoT Edge Devices",
+          "url": "https://learn.microsoft.com/en-us/azure/iot-edge/tutorial-monitor-with-workbooks"
+        }
+      ],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": []
+      }
+    },
+    {
+      "status": "inactive",
+      "key": "azure-dev-docs/articles/ansible/vm-configure.md",
+      "title": "Create a Linux virtual machine in Azure using Ansible",
+      "description": "Learn how to create a Linux virtual machine in Azure using Ansible",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-dev-docs/articles/ansible/vm-configure.md",
+      "documentationUrl": "",
+      "nextSteps": [
+        {
+          "title": "Manage a Linux virtual machine in Azure using Ansible",
+          "url": "https://learn.microsoft.com/en-us/azure/developer/ansible/vm-manage"
+        }
+      ],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": []
+      }
+    },
+    {
+      "status": "active",
+      "key": "azure-aks-docs/articles/aks/istio-scale.md",
+      "title": "Istio service mesh Azure Kubernetes Service add-on performance and scaling",
+      "description": "Istio service mesh Azure Kubernetes Service add-on performance and scaling",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/IstioPerformanceAKS/istio-performance-aks.md",
+      "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/istio-scale",
+      "nextSteps": [],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": [
+          {
+            "inputType": "textInput",
+            "commandKey": "RESOURCE_GROUP",
+            "title": "Resource Group Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "AKS_CLUSTER",
+            "title": "AKS Cluster Name",
+            "defaultValue": ""
+          }
+        ]
+      }
+    },
+    {
+      "status": "active",
+      "key": "SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors.md",
+      "title": "Node Not Ready because of custom script extension (CSE) errors",
+      "description": "Troubleshoot scenarios in which custom script extension (CSE) errors cause Node Not Ready states in an Azure Kubernetes Service (AKS) cluster node pool.",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/CSEErrorsAKS/cse-errors-aks.md",
+      "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/availability-performance/node-not-ready-custom-script-extension-errors",
+      "nextSteps": [],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": [
+          {
+            "inputType": "textInput",
+            "commandKey": "RG_NAME",
+            "title": "Resource Group Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "CLUSTER_NAME",
+            "title": "Cluster Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "AVAILABILITY_SET_VM",
+            "title": "Availability Set VM Name",
+            "defaultValue": ""
+          }
+        ]
+      }
+    },
+    {
+      "status": "active",
+      "key": "SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy.md",
+      "title": "Node Not Ready status after node is in a healthy state",
+      "description": "Troubleshoot scenarios in which an Azure Kubernetes Service (AKS) cluster node goes to a Not Ready status after it is in a healthy state.",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/NodeNotReadyAKS/node-not-ready-aks.md",
+      "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/availability-performance/node-not-ready-after-being-healthy",
+      "nextSteps": [],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": [
+          {
+            "inputType": "textInput",
+            "commandKey": "RESOURCE_GROUP",
+            "title": "Resource Group Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "AKS_CLUSTER",
+            "title": "AKS Cluster Name",
+            "defaultValue": ""
+          }
+        ]
+      }
+    },
+    {
+      "status": "active",
+      "key": "SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout.md",
+      "title": "TCP 10250 I/O timeout errors when connecting to a node's Kubelet for log retrieval",
+      "description": "Learn how to troubleshoot TCP 10250 I/O timeout errors that occur when retrieving kubectl logs from a pod in an Azure Kubernetes Service (AKS) cluster.",
+      "stackDetails": "",
+      "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/KubeletIOTroubleshooting/kubelet-io-troubleshooting.md",
+      "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/tcp-timeouts-dial-tcp-nodeip-10250-io-timeout",
+      "nextSteps": [],
+      "configurations": {
+        "permissions": [],
+        "configurableParams": [
+          {
+            "inputType": "textInput",
+            "commandKey": "POD_NAME",
+            "title": "Pod Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "RESOURCE_GROUP",
+            "title": "Resource Group Name",
+            "defaultValue": ""
+          },
+          {
+            "inputType": "textInput",
+            "commandKey": "CLUSTER_NAME",
+            "title": "AKS Cluster Name",
+            "defaultValue": ""
+          }
+        ]
+      }
+    },
+    {
+      "status": "active",
+      "key": "azure-aks-docs/articles/aks/kubelet-logs.md",
+      "title": "View kubelet logs in Azure Kubernetes Service (AKS)",
+      "description": "Learn how to view troubleshooting information in the kubelet logs from Azure Kubernetes Service (AKS) nodes",
+      
"stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/KubeletLogsAKS/kubelet-logs-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/kubelet-logs", + "nextSteps": [ + { + "title": "SSH into AKS cluster nodes", + "url": "https://learn.microsoft.com/azure/aks/ssh" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "Aks Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "NODE_NAME", + "title": "Node Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/delete-cluster.md", + "title": "Delete an Azure Kubernetes Service (AKS) cluster", + "description": "Learn about deleting a cluster in Azure Kubernetes Service (AKS).", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/DeleteAKSCluster/delete-aks-cluster.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/delete-cluster", + "nextSteps": [ + { + "title": "Stop an AKS cluster", + "url": "https://learn.microsoft.com/azure/aks/stop-start-cluster" + }, + { + "title": "Upgrade an AKS cluster", + "url": "https://learn.microsoft.com/azure/aks/upgrade-cluster" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/access-control-managed-azure-ad.md", + "title": "Control cluster access using Conditional Access with AKS-managed Microsoft Entra integration", + "description": "Learn how to access clusters using Conditional Access when integrating Microsoft Entra ID in your Azure Kubernetes Service (AKS) clusters.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/ConditionalAccessAKS/conditional-access-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/access-control-managed-azure-ad", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/concepts-network-azure-cni-pod-subnet.md", + "title": "Concepts - Azure CNI Pod Subnet networking in AKS", + "description": "Learn about Azure CNI Pod Subnet, dynamic IP allocation mode, and static block allocation mode in Azure Kubernetes Service (AKS).", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AzureCNIPodSubnet/azure-cni-pod-subnet.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/concepts-network-azure-cni-pod-subnet", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + 
"commandKey": "CLUSTER_NAME", + "title": "Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP_NAME", + "title": "Resource Group Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/aks-migration.md", + "title": "Migrate to Azure Kubernetes Service (AKS)", + "description": "This article shows you how to migrate to Azure Kubernetes Service (AKS).", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/MigrateToAKS/migrate-to-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/aks-migration", + "nextSteps": [ + { + "title": "Containerizing ASP.NET applications and migrating to AKS", + "url": "/azure/migrate/tutorial-app-containerization-aspnet-kubernetes" + }, + { + "title": "Containerizing Java web applications and migrating to AKS", + "url": "/azure/migrate/tutorial-app-containerization-java-kubernetes" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/use-etags.md", + "title": "Enhancing Concurrency Control with Entity Tags (eTags) in Azure Kubernetes Service", + "description": "Learn how to use eTags (Entity Tags) to enable concurrency control and avoid racing conditions or overwriting scenarios.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/ETagConcurrencyAKS/e-tag-concurrency-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/use-etags", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/istio-meshconfig.md", + "title": "Configure Istio-based service mesh add-on for Azure Kubernetes Service", + "description": "Configure Istio-based service mesh add-on for Azure Kubernetes Service", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/IstioAddonAKS/istio-addon-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/istio-meshconfig", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "CLUSTER", + "title": "Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/access-private-cluster.md", + "title": "Access a private Azure Kubernetes Service (AKS) cluster using the command invoke or Run command feature", + "description": "Learn how to access a private Azure Kubernetes Service (AKS) cluster using the Azure CLI command invoke feature or the Azure portal Run command feature.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/PrivateAKSAccess/private-aks-access.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/access-private-cluster", + "nextSteps": [ + { + "title": "Create a private AKS cluster", + "url": "./private-clusters.md" + }, + { + "title": "Install Azure CLI", + "url": "/cli/azure/install-azure-cli" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "AKS_RESOURCE_GROUP", + 
"title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "AKSDNSLookupFailError/aksdns-lookup-fail-error.md", + "title": "Troubleshoot the K8SAPIServerDNSLookupFailVMExtensionError error code (52)", + "description": "Learn how to troubleshoot the K8SAPIServerDNSLookupFailVMExtensionError error (52) when you try to start or create and deploy an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/error-code-k8sapiserverdnslookupfailvmextensionerror", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSDNSLookupFailError/aksdns-lookup-fail-error.md", + "documentationUrl": "", + "nextSteps": [ + { + "title": "Create a private AKS cluster", + "url": "/azure/aks/private-clusters" + }, + { + "title": "Private Azure Kubernetes service with custom DNS server", + "url": "https://github.com/Azure/terraform/tree/00d15e09c54f25fb6387330c36aa4366122c5aaa/quickstart/301-aks-private-cluster" + }, + { + "title": "What is IP address 168.63.129.16?", + "url": "/azure/virtual-network/what" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/resize-cluster.md", + "title": "Resize Azure Kubernetes Service (AKS) clusters", + "description": "In this article, you learn about the importance of right-sizing your AKS clusters and how you can right-size them to optimize costs and performance.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/ResizeAKSCluster/resize-aks-cluster.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/resize-cluster", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "NUM_NODES", + "title": "Number of Nodes", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "NODE_POOL_NAME", + "title": "Node Pool Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/concepts-preview-api-life-cycle.md", + "title": "AKS Preview API life cycle", + "description": "Learn about the AKS preview API life cycle.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSPreviewAPILifecycle/aks-preview-api-lifecycle.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/concepts-preview-api-life-cycle", + "nextSteps": [ + { + "title": "AKS Preview CLI Extension", + "url": "https://github.com/Azure/azure-cli-extensions/tree/main/src/aks-preview" + }, + { + "title": "Newer version of the SDK", + "url": "https://azure.github.io/azure-sdk/releases/latest/index.html?search=containerservice" + }, + { + "title": "Terraform release notes", + "url": "/azure/developer/terraform/provider-version-history-azurerm" + }, + { 
+ "title": "client.go in Terraform provider", + "url": "https://github.com/hashicorp/terraform-provider-azurerm/blob/main/internal/services/containers/client/client.go" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/use-labels.md", + "title": "Use labels in an Azure Kubernetes Service (AKS) cluster", + "description": "Learn how to use labels in an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSNodeLabels/aks-node-labels.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/use-labels", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues.md", + "title": "Azure Kubernetes Service Cost Analysis add-on issues", + "description": "Learn how to resolve issues that occur when you try to enable the Azure Kubernetes Service (AKS) Cost Analysis add-on.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSCostAnalysisIssues/aks-cost-analysis-issues.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/extensions/aks-cost-analysis-add-on-issues", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues.md", + "title": "Troubleshoot the health probe mode for AKS cluster service load balancer", + "description": "Diagnoses and fixes common issues with the health probe mode feature.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSHealthProbeMode/aks-health-probe-mode.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/availability-performance/cluster-service-health-probe-mode-issues", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror.md", + "title": "Troubleshoot Container Network Interface download failures", + "description": "Learn how to resolve Container Network Interface download failures when you try to create and deploy an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/CniDownloadFailureAKS/cni-download-failure-aks.md", + "documentationUrl": 
"https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/error-code-cnidownloadtimeoutvmextensionerror", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server.md", + "title": "TCP time-outs when kubectl or other 3rd-party tools connect to API", + "description": "Troubleshoot TCP time-outs that occur when kubectl or other third-party tools connect to the API server in Azure Kubernetes Service (AKS).", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/TCPTroubleshootAKS/tcp-troubleshoot-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/tcp-timeouts-kubetctl-third-party-tools-connect-api-server", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "ResourceGroupName", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKSClusterName", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/enable-host-encryption.md", + "title": "Enable host-based encryption on Azure Kubernetes Service (AKS)", + "description": "Learn how to configure a host-based encryption in an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/HostEncryptionAKS/host-encryption-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/enable-host-encryption", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "MY_AKS_CLUSTER", + "title": "AKS Cluster Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "MY_RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/nat-gateway.md", + "title": "Create a managed or user-assigned NAT gateway for your Azure Kubernetes Service (AKS) cluster", + "description": "Learn how to create an AKS cluster with managed NAT integration and user-assigned NAT gateway.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/NATGatewayAKSCluster/nat-gateway-aks-cluster.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/nat-gateway", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/free-standard-pricing-tiers.md", + "title": "Azure Kubernetes Service (AKS) Free, Standard, and Premium pricing tiers for cluster management", + "description": "Learn about the Azure Kubernetes Service (AKS) Free, Standard, and Premium pricing plans and what features, deployment patterns, and recommendations to consider between each plan.", + "stackDetails": "", + 
"sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSPricingTiers/aks-pricing-tiers.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/free-standard-pricing-tiers", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "azure-aks-docs/articles/aks/events.md", + "title": "Use Kubernetes events for troubleshooting", + "description": "Learn about Kubernetes events, which provide details on pods, nodes, and other Kubernetes objects.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/KubernetesEventsTroubleshooting/kubernetes-events-troubleshooting.md", + "documentationUrl": "https://learn.microsoft.com/en-us/azure/aks/events", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed.md", + "title": "Troubleshoot cluster upgrading and scaling errors", + "description": "Troubleshoot errors that occur when you try to upgrade or scale an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSUpgradeScalingErrors/aks-upgrade-scaling-errors.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/upgrading-or-scaling-does-not-succeed", + "nextSteps": [ + { + "title": "Request an increase in your resource quota", + "url": "/azure/azure-resource-manager/troubleshooting/error-resource-quota#solution" + }, + { + "title": "Troubleshoot the SubnetIsFull error code", + "url": "error-code-subnetisfull.md" + }, + { + "title": "Troubleshoot UpgradeFailed errors due to eviction failures caused by PDBs", + "url": "error-code-poddrainfailure.md" + }, + { + "title": "How to mitigate stopped upgrade operations due to deprecated APIs", + "url": "/azure/aks/stop-cluster-upgrade-api-breaking-changes#mitigate-stopped-upgrade-operations" + } + ], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "CLUSTER_NAME", + "title": "Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources.md", + "title": "Troubleshoot 'Forbidden' error when trying to access AKS cluster resources", + "description": "Troubleshoot 'Error from server (Forbidden)' RBAC-related errors that occur when you try to view Kubernetes resources in an AKS cluster.", + "stackDetails": "", + "sourceUrl": 
"https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/ForbiddenErrorAKS/forbidden-error-aks.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/user-cannot-get-cluster-resources", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "CLUSTER_NAME", + "title": "Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server.md", + "title": "Troubleshoot cluster connection issues with the API server", + "description": "Troubleshoot issues that occur when you attempt to connect to the API server of an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSApiServerTroubleshoot/aks-api-server-troubleshoot.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/troubleshoot-cluster-connection-issues-api-server", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server.md", + "title": "Client IP address can't access the API server", + "description": "Troubleshoot issues caused when the client IP address can't access the API server on an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AksApiAccessIssue/aks-api-access-issue.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/connectivity/client-ip-address-cannot-access-api-server", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RG_NAME", + "title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "CLUSTER_NAME", + "title": "Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret.md", + "title": "AADSTS7000222 - BadRequest or InvalidClientSecret error", + "description": "Learn how to troubleshoot the BadRequest or InvalidClientSecret error when you try to create or upgrade an Azure Kubernetes Service (AKS) cluster.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/AKSClientSecretError/aks-client-secret-error.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/error-code-badrequest-or-invalidclientsecret", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "RESOURCE_GROUP_NAME", + 
"title": "Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } + }, + { + "status": "active", + "key": "SupportArticles-docs/support/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool.md", + "title": "Cluster autoscaler fails to scale with cannot scale cluster autoscaler enabled node pool error", + "description": "Learn how to troubleshoot the cannot scale cluster autoscaler enabled node pool error when your autoscaler isn't scaling up or down.", + "stackDetails": "", + "sourceUrl": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/ClusterAutoscalerError/cluster-autoscaler-error.md", + "documentationUrl": "https://learn.microsoft.com/en-us/troubleshoot/azure/azure-kubernetes/create-upgrade-delete/cannot-scale-cluster-autoscaler-enabled-node-pool", + "nextSteps": [], + "configurations": { + "permissions": [], + "configurableParams": [ + { + "inputType": "textInput", + "commandKey": "AKS_RG_NAME", + "title": "AKS Resource Group Name", + "defaultValue": "" + }, + { + "inputType": "textInput", + "commandKey": "AKS_CLUSTER_NAME", + "title": "AKS Cluster Name", + "defaultValue": "" + } + ] + } } ] \ No newline at end of file From 09aba4e7637e31d3c41214db955717b9e91da739 Mon Sep 17 00:00:00 2001 From: pjsingh28 <145501263+pjsingh28@users.noreply.github.com> Date: Fri, 27 Jun 2025 15:04:41 -0400 Subject: [PATCH 2/2] Update metadata.json --- scenarios/metadata.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scenarios/metadata.json b/scenarios/metadata.json index 5b18a3291..7313f07cb 100644 --- a/scenarios/metadata.json +++ b/scenarios/metadata.json @@ -2250,7 +2250,7 @@ }, { "title": "Private Azure Kubernetes service with custom DNS server", - "url": "https://github.com/Azure/terraform/tree/00d15e09c54f25fb6387330c36aa4366122c5aaa/quickstart/301-aks-private-cluster" + "url": "https://github.com/Azure/terraform/tree/master/quickstart/301-aks-private-cluster" }, { "title": "What is IP address 168.63.129.16?", @@ -2735,4 +2735,4 @@ ] } } -] \ No newline at end of file +]