# Analyzing Trace Statistics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from MemoryAutoScaling import analysis

In [None]:
import os
# Directory that holds the `trace_stats.csv` file read below.
data_dir = "/Users/mattb/Desktop/Courses/MemoryAutoScaling/output_data/max_mem_3"

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

stats_df = pd.read_csv(
    os.path.join(data_dir, "trace_stats.csv")
)
stats_df.head()

In [None]:
# Every row of `stats_df` is counted as one trace.
print("Total Trace Count: {}".format(stats_df.shape[0]))

In [None]:
# Bucket traces by the first differencing level (0, 1 or 2) at which the ADF
# p-value is below 0.05; traces that never get there are reported as "Other".
# A NaN p-value fails both the `<` and `>=` comparisons, so such rows may fall
# outside every bucket.
adf_pass = stats_df['adf_p_val'] < 0.05
adf_fail = stats_df['adf_p_val'] >= 0.05
diff1_pass = stats_df['adf_p_val_diff'] < 0.05
diff1_fail = stats_df['adf_p_val_diff'] >= 0.05
diff2_pass = stats_df['adf_p_val_diff2'] < 0.05
diff2_fail = stats_df['adf_p_val_diff2'] >= 0.05

print("Stationarity Results")
print("Number of Stationary Traces: {}".format(int(adf_pass.sum())))
print("Number of Stationary Traces after 1 Level Differencing: {}".format(
    int((diff1_pass & adf_fail).sum())))
print("Number of Stationary Traces after 2 Level Differencing: {}".format(
    int((diff2_pass & diff1_fail & adf_fail).sum())))
print("Other: {}".format(
    int((diff2_fail & diff1_fail & adf_fail).sum())))

In [None]:
# Count the traces whose correlation column has an absolute value of at least
# 0.7. The comparison is inclusive so that it matches the printed
# ">= 0.7 or <= -0.7" wording (the previous strict `>` silently excluded
# values of exactly +/-0.7).
print("Correlation Results")
for usage_name in ("average_usage.memory", "average_usage.cpus", "maximum_usage.cpus"):
    strong_corr = stats_df["corr_{}_ts".format(usage_name)].abs() >= 0.7
    print("Number of traces with correlation of maximum_usage.memory and {} >= 0.7 or <= -0.7: {}".format(
        usage_name, int(strong_corr.sum())))

### CDFs

In [None]:
def plot_lagged_cdfs(data_df, col_name, col_title):
    """Draws a 2x2 grid of correlation CDFs for `col_name` at lags 0 to 3.

    For every lag the absolute values of the matching correlation column are
    taken, NaN entries are dropped, and the result is handed to
    `analysis.plot_cumulative_distribution_function`. The figure is shown
    once all four panels have been drawn.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame holding the columns `corr_<col_name>` (lag 0)
        and `corr_<col_name>_lag_<k>` for k = 1, 2, 3.
    col_name: str
        A string giving the base column name used to build the correlation
        column names.
    col_title: str
        A string combined with the lag number to label each panel.

    Returns
    -------
    None

    """
    _, grid = plt.subplots(2, 2, figsize=(20, 8))
    palette = ("blue", "black", "green", "red")
    for lag, color in enumerate(palette):
        # Lag 0 has no `_lag_` suffix in its column name.
        if lag == 0:
            corr_col = "corr_{}".format(col_name)
        else:
            corr_col = "corr_{0}_lag_{1}".format(col_name, lag)
        magnitudes = data_df[corr_col].abs().values
        magnitudes = magnitudes[~np.isnan(magnitudes)]
        # Panels are filled row by row: lags 0, 1 on top and 2, 3 below.
        grid_row, grid_col = divmod(lag, 2)
        analysis.plot_cumulative_distribution_function(
            magnitudes,
            grid[grid_row, grid_col],
            "{0} Lag {1}".format(col_title, lag),
            color,
            "CDF of Correlation",
        )
    plt.show()

In [None]:
from MemoryAutoScaling import specs

def print_all_cdfs_for_stats(data_df, col_name, col_title, is_max=True):
    """Prints all of the CDFs for the correlations for `col_name` across all stats.

    For every statistic suffix in `specs.MAX_STATS_COLS` (or
    `specs.AVG_STATS_COLS` when `is_max` is False) the name
    `<col_name>_<stat>` is printed, its lagged CDFs are plotted with
    `plot_lagged_cdfs`, and a separator line is printed.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame containing the distributions for the CDFs
    col_name: str
        A string representing the column for which the CDFs are generated.
    col_title: str
        A string representing the column title for the CDFs.
    is_max: bool
        A boolean indicating whether `col_name` corresponds to a maximum column.

    Returns
    -------
    None

    """
    stat_cols = specs.MAX_STATS_COLS if is_max else specs.AVG_STATS_COLS
    for stat in stat_cols:
        name = "{0}_{1}".format(col_name, stat)
        print(name)
        # Plot from the DataFrame passed in, not the notebook-global
        # `stats_df`, so the function honours its `data_df` argument.
        plot_lagged_cdfs(data_df, name, col_title)
        print("------------------------------------------------------------------------")

In [None]:
# Average-usage column, so the AVG statistic suffixes are used.
print_all_cdfs_for_stats(
    stats_df, "average_usage.memory", "Average Memory", is_max=False)

In [None]:
# Average-usage column, so the AVG statistic suffixes are used.
print_all_cdfs_for_stats(
    stats_df, "average_usage.cpus", "Average CPU", is_max=False)

In [None]:
# Maximum-usage column, so the MAX statistic suffixes are used.
print_all_cdfs_for_stats(
    stats_df, "maximum_usage.memory", "Maximum Memory", is_max=True)

In [None]:
# Maximum-usage column, so the MAX statistic suffixes are used.
print_all_cdfs_for_stats(
    stats_df, "maximum_usage.cpus", "Maximum CPU", is_max=True)

### Causation

In [None]:
def plot_causal_cdfs(data_df, col_name, test_name, col_title):
    """Plots the causal CDFs for `col_name`.

    One CDF is drawn per lag (1, 2 and 3) from the column
    `causal_<col_name>_<test_name>_<lag>`. The number of NaN entries in each
    column is printed, and those entries are excluded from the plotted data.

    Parameters
    ----------
    data_df: pd.DataFrame
        The pandas DataFrame containing the distributions for the CDFs
    col_name: str
        A string representing the column for which the CDFs are generated.
    test_name: str
        A string naming the causality test, used to build the column names.
    col_title: str
        A string representing the column title for the CDFs.

    Returns
    -------
    None

    """
    lags = (1, 2, 3)
    colors = ("blue", "black", "green")
    _, axes = plt.subplots(len(lags), figsize=(20, 20))
    for ax, lag, color in zip(axes, lags, colors):
        causal_col = "causal_{0}_{1}_{2}".format(col_name, test_name, lag)
        data_vals = data_df[causal_col].abs().values
        nan_mask = np.isnan(data_vals)
        print("Lag {0}: {1} NaN values".format(lag, np.sum(nan_mask)))
        # Plot the NaN-free values; the unfiltered column was previously
        # passed here, which made the filtering step above dead code.
        analysis.plot_cumulative_distribution_function(
            data_vals[~nan_mask],
            ax,
            "{0} Lag {1}".format(col_title, lag),
            color,
            "CDF of Causation P-Values",
        )
    plt.show()

In [None]:
# Causal CDFs for the average memory series using the ssr_chi2test columns.
plot_causal_cdfs(
    stats_df,
    "average_usage.memory_ts",
    test_name="ssr_chi2test",
    col_title="Average Memory",
)

In [None]:
# Causal CDFs for the average CPU series using the ssr_chi2test columns.
plot_causal_cdfs(
    stats_df,
    "average_usage.cpus_ts",
    test_name="ssr_chi2test",
    col_title="Average CPU",
)

In [None]:
# Causal CDFs for the maximum CPU series using the ssr_chi2test columns.
plot_causal_cdfs(
    stats_df,
    "maximum_usage.cpus_ts",
    test_name="ssr_chi2test",
    col_title="Maximum CPU",
)