In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
# Base directory: three levels above the current working directory.
base_dir = Path.cwd().parents[2]
# The pod metrics export is read from <base_dir>/data.
file_path = base_dir.joinpath('data', 'aks01_pod_metrics.json')
# Parse the JSON file into the DataFrame used by the rest of the notebook.
df = pd.read_json(file_path)

In [None]:
# Shape of the dataset as (rows, columns)
print(f"Dataset Shape: {df.shape}")

# Column dtypes and non-null counts. DataFrame.info() writes its report
# directly to stdout and returns None, so wrapping it in print() would emit
# a stray "None" line after the report.
df.info()

# Missing values per column
print(df.isnull().sum())

# Basic statistics for the numerical columns
print(df.describe())

# Number of distinct values in each categorical (object-dtype) column
for col in df.select_dtypes(include=['object']).columns:
    print(f"Unique values in {col}: {df[col].nunique()}")

In [39]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram with a KDE overlay for every int64/float64 column
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns


def _show_distribution(series, label):
    """Draw one 8x4 histogram with a KDE curve for ``series``, titled with ``label``."""
    plt.figure(figsize=(8, 4))
    sns.histplot(series, kde=True)
    plt.title(f'Distribution of {label}')
    plt.show()


for column_name in numerical_cols:
    _show_distribution(df[column_name], column_name)

In [None]:
# Horizontal count plot for every object-dtype column, one figure per column
categorical_cols = df.select_dtypes(include=['object']).columns
for column_name, column_values in df[categorical_cols].items():
    plt.figure(figsize=(8, 4))
    # Categories go on the y axis so their labels are drawn horizontally.
    sns.countplot(y=column_values)
    plt.title(f'Count of {column_name}')
    plt.show()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): if some columns are frequently empty this can discard a large
# share of the data -- compare df.shape before and after to confirm.
df = df.dropna()

# The template fillna(value={'column_name': 'default_value'}) call that used
# to follow has been removed: it ran after dropna(), when no missing values
# remain, so it could never change the frame. To fill values instead of
# dropping rows, call fillna() with real column names *in place of* dropna().

In [None]:
# Parse the collection timestamp once, store it back, then derive the
# calendar components from the parsed values.
collected_at = pd.to_datetime(df['collectionTimestamp'])
df['collectionTimestamp'] = collected_at
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(collected_at.dt, date_part)

In [31]:
# Label encoding
# Each column below is replaced in place by integer codes. Every column gets
# its OWN fitted LabelEncoder, stored in `label_encoders`, so the original
# values can be recovered later with
#     label_encoders[column].inverse_transform(df[column])
# Refitting a single shared encoder would keep only the mapping of the last
# column and make the other columns impossible to decode.
# NOTE: re-running this cell on already-encoded data refits the encoders on
# the integer codes and loses the original labels; reload the data first.
from sklearn.preprocessing import LabelEncoder

LABEL_ENCODED_COLUMNS = [
    'namespace',
    'pod',
    'node',
    'container',
    'host_ip',
    'pod_ip',
    'uid',
    'controllerKind',
    'controllerName',
    'deployment',
]

label_encoders = {}
for column in LABEL_ENCODED_COLUMNS:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward compatibility: `le` previously ended up fitted on the last encoded
# column ('deployment'); keep that binding for any code that still uses it.
le = label_encoders[LABEL_ENCODED_COLUMNS[-1]]

In [None]:
# Check data types and missing values
df.info()

# Summary statistics
df.describe()

# Check for null values
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution (histogram + KDE) of the two usage metrics, shown one after the other
usage_plots = (
    ('cpuUsage', 'CPU Usage Distribution'),
    ('memUsage', 'Memory Usage Distribution'),
)
for metric, heading in usage_plots:
    sns.histplot(df[metric], kde=True)
    plt.title(heading)
    plt.show()

In [1]:
# Pairwise correlation of the usage, request and limit metrics as an annotated heatmap.
# Requires `df` from the loading cell: run on a fresh kernel, this cell fails with a NameError.
resource_metrics = ['cpuUsage', 'memUsage', 'cpuRequest', 'memRequest', 'cpuLimit', 'memLimit']
correlation = df.loc[:, resource_metrics].corr()
sns.heatmap(data=correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

NameError: name 'df' is not defined

In [None]:
# Analyze controller types
controller_counts = df['controllerKind'].value_counts()
controller_counts.plot(kind='bar')
plt.title('Controller Kind Distribution')
plt.show()

In [None]:
# Calculate CPU and Memory Utilization
df['cpuUtilization'] = df['cpuUsage'] / (df['cpuLimit'] + 1e-5)
df['memUtilization'] = df['memUsage'] / (df['memLimit'] + 1e-5)

In [None]:
# Categorize pods based on CPU requests
df['cpuRequestCategory'] = pd.cut(df['cpuRequest'], bins=[-1, 0, 0.5, 1, float('inf')], labels=['None', 'Low', 'Medium', 'High'])

In [None]:
# Convert timestamps to datetime
df['collectionTimestamp'] = pd.to_datetime(df['collectionTimestamp'])
df['cpuUsageTimestamp'] = pd.to_datetime(df['cpuUsageTimestamp'])

# Extract time-based features
df['hour'] = df['collectionTimestamp'].dt.hour
df['day_of_week'] = df['collectionTimestamp'].dt.dayofweek

In [None]:
# Save the cleaned and processed DataFrame to a new CSV file
output_path = r'c:\brijesh\Confluentis\MLearn\processed_pod_metrics.csv'
df.to_csv(output_path, index=False)

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data['scaled_column'] = scaler.fit_transform(data[['numerical_column']])

In [None]:
# data['new_feature'] = data['feature1'] * data['feature2']

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# text_features = vectorizer.fit_transform(data['text_column'])

In [None]:
# Write the processed frame, without the index column, to a CSV in the working directory.
df.to_csv(path_or_buf='processed_data.csv', index=False)