In [1]:
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Minute-resolution timestamp index for the simulation. Both endpoints are
# inclusive, so it runs from 2024-10-01 00:00 through 2024-10-14 00:00:
# 13 full days plus the final midnight sample (18,721 rows).
dates = pd.date_range(
    start="2024-10-01",
    end="2024-10-14",
    freq="1min",
)

# Custom function: simulate a usage series with an optional peak-hours bump
def generate_usage(base, variation, size, peak_hours=None, timestamps=None):
    """Simulate a usage series clipped to [0, 100] with an optional daily peak.

    Draws `size` samples from Normal(base, variation). When `peak_hours` is
    given, every sample whose timestamp hour falls in the half-open window
    [peak_hours[0], peak_hours[1]) gets an extra Normal(10, 5) added on top.

    Parameters
    ----------
    base : float
        Mean of the baseline normal distribution.
    variation : float
        Standard deviation of the baseline normal distribution.
    size : int
        Number of samples to generate.
    peak_hours : tuple of (int, int), optional
        (start_hour, end_hour) of the peak window; start inclusive, end
        exclusive. No peak is added when omitted or empty.
    timestamps : datetime-like sequence, optional
        Timestamps aligned one-to-one with the generated samples. Defaults to
        the notebook-level `dates` index, which is what the function always
        used before this parameter existed.

    Returns
    -------
    numpy.ndarray
        Array of `size` floats, each within [0, 100].

    Raises
    ------
    ValueError
        If a peak window is requested and the number of timestamps does not
        match the number of generated samples.
    """
    usage = np.random.normal(base, variation, size)
    if peak_hours:
        if timestamps is None:
            # Backward-compatible fallback to the notebook-global index.
            timestamps = dates
        hours = pd.DatetimeIndex(timestamps).hour
        # Fail loudly on misalignment instead of raising an IndexError deep in
        # NumPy or silently applying peaks to only part of the series.
        if len(hours) != usage.shape[0]:
            raise ValueError(
                f"timestamps has {len(hours)} entries but size is {usage.shape[0]}"
            )
        start_hour, end_hour = peak_hours
        in_peak = np.asarray((hours >= start_hour) & (hours < end_hour))
        usage[in_peak] += np.random.normal(10, 5, int(in_peak.sum()))
    return np.clip(usage, 0, 100)  # keep values in a valid percentage range

# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# dataset (every random draw for the dataset happens below this line).
SEED = 42
np.random.seed(SEED)

n_samples = len(dates)

# Simulate resource usage; CPU gets an extra bump during the 09:00-17:00 window
cpu_usage = generate_usage(base=30, variation=10, size=n_samples, peak_hours=(9, 17))
memory_usage = generate_usage(base=60, variation=8, size=n_samples)
# Values land in the "network_bandwidth_mbps" column, bounded to [100, 1000]
network_bandwidth = np.clip(np.random.normal(500, 100, size=n_samples), 100, 1000)

# API requests: Poisson baseline plus randomly placed bursts. The bounds are
# applied after the bursts are added so they hold for the final series.
n_spikes = min(50, n_samples)  # choice(replace=False) cannot draw more than n_samples
api_requests = np.random.poisson(200, size=n_samples)
spike_indices = np.random.choice(n_samples, size=n_spikes, replace=False)
api_requests[spike_indices] += np.random.randint(500, 1000, size=n_spikes)
api_requests = np.clip(api_requests, 50, 1500)

# Simulate concurrent users with lower activity on weekends
day_of_week = dates.dayofweek  # Monday=0, Sunday=6
concurrent_users = np.where(day_of_week < 5,  # Weekdays
                            np.random.poisson(100, n_samples),
                            np.random.poisson(30, n_samples))  # Weekends

# Create a DataFrame
data = pd.DataFrame({
    "timestamp": dates,
    "cpu_usage": cpu_usage,
    "memory_usage": memory_usage,
    "network_bandwidth_mbps": network_bandwidth,
    "api_requests_per_second": api_requests,
    "concurrent_users": concurrent_users
})

# Save to CSV; create the output folder first so a fresh checkout can run this cell
output_dir = Path("./Output_Data")
output_dir.mkdir(parents=True, exist_ok=True)
data.to_csv(output_dir / "test_synthetic_cloud_usage.csv", index=False)
print("Customized synthetic data generated!")


Customized synthetic data generated!


In [2]:
from sklearn.preprocessing import StandardScaler
import joblib

# Select the features to normalize
features = ["cpu_usage", "memory_usage", "network_bandwidth_mbps",
            "api_requests_per_second", "concurrent_users"]

# Always start from the raw CSV written by the generation cell instead of
# standardizing `data` in place. The in-place version was not idempotent: a
# second run would fit on already-standardized values and overwrite
# scaler.pkl with mean ~0 / std ~1, so the saved scaler would no longer
# describe the raw data.
raw_data = pd.read_csv(
    "./Output_Data/test_synthetic_cloud_usage.csv",
    parse_dates=["timestamp"],
    float_precision="round_trip",  # read back exactly the values that were written
)

# Fit on the raw feature columns; `data` remains the normalized frame
scaler = StandardScaler()
data = raw_data.copy()
data[features] = scaler.fit_transform(raw_data[features])

# Save the scaler for future use (important for predictions)
joblib.dump(scaler, "scaler.pkl")

# Display the first few rows of the normalized data
print(data.head())

# Optional: Save the normalized data to a new CSV file
data.to_csv("./Output_Data/test_normalized_cloud_usage.csv", index=False)
print("Data normalized and saved!")


            timestamp  cpu_usage  memory_usage  network_bandwidth_mbps  \
0 2024-10-01 00:00:00  -0.154731     -0.348906               -0.848648   
1 2024-10-01 00:01:00  -1.554808      0.759911                0.808344   
2 2024-10-01 00:02:00   0.448196     -1.802395               -1.530230   
3 2024-10-01 00:03:00   0.122359     -0.356820                0.083606   
4 2024-10-01 00:04:00  -0.205188     -1.675004                1.301082   

   api_requests_per_second  concurrent_users  
0                -0.229902          0.522179  
1                -0.046236          0.581881  
2                 0.045596          0.760986  
3                -0.138069          0.552030  
4                 0.137429          1.417703  
Data normalized and saved!


In [3]:
# Reload the persisted scaler from disk
scaler = joblib.load("scaler.pkl")

# Per-feature statistics learned during fitting, ordered like the `features`
# list the scaler was fitted on; scale_ is the per-feature divisor
mean, std = scaler.mean_, scaler.scale_

print("Mean:", mean)
print("Standard Deviation:", std)

Mean: [ 33.37259751  59.99191496 500.59977334 202.01394156  78.5070242 ]
Standard Deviation: [ 11.39790946   7.98941626 100.1715581   43.55748615  33.49994346]
