# Analyzing Trace Statistics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from MemoryAutoScaling import analysis

In [None]:
import os

# Show every column when a DataFrame is displayed instead of eliding some.
pd.set_option('display.max_columns', None)

# Directory holding the generated statistics. The MEMORY_AUTOSCALING_DATA_DIR
# environment variable, when set, overrides the original hard-coded location
# so the notebook can run on another machine without editing this cell.
data_dir = os.environ.get(
    "MEMORY_AUTOSCALING_DATA_DIR",
    "/Users/mattb/Desktop/Courses/MemoryAutoScaling/output_data",
)
stats_df = pd.read_csv(os.path.join(data_dir, "trace_stats.csv"))
# Last expression of the cell: previews the first rows of the loaded table.
stats_df.head()

In [None]:
# One row of `stats_df` is counted as one trace.
print("Total Trace Count: {}".format(stats_df.shape[0]))

In [None]:
# Compare each `adf_p_val*` column against 0.05 once and reuse the masks.
# Both sides of every comparison are computed explicitly (instead of negating
# a mask) so that a row with a missing p-value satisfies neither side.
level0_below = stats_df['adf_p_val'].lt(0.05)
level0_at_or_above = stats_df['adf_p_val'].ge(0.05)
level1_below = stats_df['adf_p_val_diff'].lt(0.05)
level1_at_or_above = stats_df['adf_p_val_diff'].ge(0.05)
level2_below = stats_df['adf_p_val_diff2'].lt(0.05)
level2_at_or_above = stats_df['adf_p_val_diff2'].ge(0.05)

print("Stationarity Results")
# Summing a boolean mask counts the rows it selects.
print("Number of Stationary Traces: {}".format(level0_below.sum()))
print("Number of Stationary Traces after 1 Level Differencing: {}".format(
    (level1_below & level0_at_or_above).sum()))
print("Number of Stationary Traces after 2 Level Differencing: {}".format(
    (level2_below & level1_at_or_above & level0_at_or_above).sum()))
print("Other: {}".format(
    (level2_at_or_above & level1_at_or_above & level0_at_or_above).sum()))

In [None]:
# Absolute correlation at or above this value is reported. The bound is
# inclusive so the filter agrees with the ">= ... or <= -..." wording of the
# printed labels (the previous strict `>` excluded values of exactly 0.7).
corr_threshold = 0.7

print("Correlation Results")
# Each feature's correlation is read from the column "corr_<feature>"; the
# labels describe it as the correlation with maximum_usage.memory.
for feature in ("average_usage.memory", "average_usage.cpus", "maximum_usage.cpus"):
    strong_count = (stats_df["corr_" + feature].abs() >= corr_threshold).sum()
    print("Number of traces with correlation of maximum_usage.memory and {0} >= {1} or <= -{1}: {2}".format(
        feature, corr_threshold, strong_count))

### CDFs

In [None]:
def plot_lagged_cdfs(data_df, col_name, col_title):
    """Plot the CDFs of `col_name` at lags 0, 2, 3 and 4 on a 2x2 grid.

    Lag 0 is read from the column `col_name` itself; every other lag is read
    from the column ``"<col_name>_lag_<lag>"``. The absolute value of each
    column is what gets passed on for plotting, and the figure is shown
    once all four panels are drawn.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame containing the distributions for the CDFs.
    col_name: str
        A string representing the column for which the CDFs are generated.
    col_title: str
        A string representing the column title for the CDFs. Each panel is
        titled ``"<col_title> Lag <lag>"``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If `data_df` lacks `col_name` or one of its lagged columns.

    """
    # Single source of truth for the lags: both the column names and the
    # panel titles are derived from this tuple.
    lags = (0, 2, 3, 4)
    colors = ("blue", "black", "green", "red")
    _, axes = plt.subplots(2, 2, figsize=(20, 8))
    # `axes.flat` walks the 2x2 grid row by row, so the panels are filled
    # left-to-right, top-to-bottom in lag order.
    for ax, lag, color in zip(axes.flat, lags, colors):
        lag_col = col_name if lag == 0 else "{0}_lag_{1}".format(col_name, lag)
        analysis.plot_cumulative_distribution_function(
            data_df[lag_col].abs().values, ax,
            "{0} Lag {1}".format(col_title, lag), color)
    plt.show()

In [None]:
# Lagged CDF panels for the `corr_average_usage.memory` columns.
plot_lagged_cdfs(
    data_df=stats_df,
    col_name='corr_average_usage.memory',
    col_title="Average Memory",
)

In [None]:
# Lagged CDF panels for the `corr_average_usage.cpus` columns.
plot_lagged_cdfs(
    data_df=stats_df,
    col_name='corr_average_usage.cpus',
    col_title="Average CPU",
)

In [None]:
# Lagged CDF panels for the `corr_maximum_usage.memory` columns.
# NOTE(review): the first panel reads the un-lagged column
# 'corr_maximum_usage.memory', which the correlation report cell never uses --
# confirm that column exists in trace_stats.csv.
plot_lagged_cdfs(
    data_df=stats_df,
    col_name='corr_maximum_usage.memory',
    col_title="Maximum Memory",
)

In [None]:
# Lagged CDF panels for the `corr_maximum_usage.cpus` columns.
plot_lagged_cdfs(
    data_df=stats_df,
    col_name='corr_maximum_usage.cpus',
    col_title="Maximum CPU",
)