In [None]:
import sys

sys.path.append('..')

from utilities import *
import statistics as stat

In [None]:
def get_backups(folder, keep_duplicates=True):
    """Return a dataframe representing backups in a folder sorted based on time created.

    Recursively search `folder` for ``*.backup`` files and create a pandas
    dataframe with one row per backup: the time created, the name of the
    backed-up file (its path with any ``../`` sequences removed, cut off after
    the extension that follows the first dot of the file name) and the value
    returned by the ``NODE_FUNCS`` parser registered for that extension
    (the number of each node type in the backup, represented as a dictionary).
    Backups that raise ``ET.ParseError`` while being parsed are skipped.

    If `keep_duplicates` is false, then any duplicate rows will be removed,
    keeping the earliest one. This is determined by the number of each node
    type and the backup file name. The purpose of this is to not count a save
    if the user is idle and not changing the file.

    :param folder: The folder to search for backup files.
    :param keep_duplicates: Whether to keep duplicate data rows.
    :returns: Pandas dataframe with the columns ``time``, ``name`` and ``nodes``.
    :rtype: pandas.DataFrame

    """
    backups = []

    for f in Path(folder).rglob("*.backup"):
        # Ignore relative path sequences
        relative = str(f).replace('../', '')

        # Split only the file name on dots, so that a dot inside a directory
        # name cannot be mistaken for the start of the extension.
        directory, separator, file_name = relative.rpartition('/')
        parts = file_name.split('.')
        ext = parts[1]
        name = "{}{}{}.{}".format(directory, separator, parts[0], ext)
        created = get_time_from_file_name(get_file_name(f))

        try:
            nodes = NODE_FUNCS[ext](str(f))
        except ET.ParseError:
            # Unparseable backup: leave it out of the result
            continue

        backups.append((created, name, nodes))

    backups.sort(key=lambda backup: backup[0])

    df = pd.DataFrame(backups, columns=['time', 'name', 'nodes'])

    if not keep_duplicates:
        # Dictionaries are not hashable, so compare their string form instead
        df['nodes_str'] = df['nodes'].astype(str)
        df = df.drop_duplicates(subset=['name', 'nodes_str'])
        df = df.drop(columns='nodes_str')

    return df


def plot_dist(to_plot, ax, bins, title_format, all_key, xlabel="Ratio", ylabel="Frequency"):
    """Plot a histogram graph of distributions.

    Draw one normalized histogram per entry of the dictionary `to_plot`,
    each on its own subplot, followed by one extra subplot that aggregates
    every entry using a fixed bin width of 0.05. `ax` therefore needs one
    more subplot than `to_plot` has entries. Each subplot title is
    `title_format` with the entry's key formatted into it; the aggregated
    subplot uses `all_key` instead.

    :param to_plot: The data to plot
    :param ax: The indexable collection of matplotlib axes to plot on
    :param bins: The number of bins per graph
    :param title_format: The format for the title of each subplot
    :param all_key: The name to give the data entry of all data points.
    :param xlabel: The x axis label of every subplot
    :param ylabel: The y axis label of every subplot

    """
    for position, (key, values) in enumerate(to_plot.items()):
        title = title_format.format(key)

        normalized = normalize_data(values, bins)
        normalized.plot(kind='bar', ax=ax[position], align='edge', width=1, ec="white")

        format_xticks(ax[position])
        ax[position].set(xlabel=xlabel, ylabel=ylabel, title=title)

    # The aggregated plot goes on the subplot right after the per-key ones
    aggregated = normalize_aggregated_dict_data(to_plot, 0.05)
    aggregated_df = pd.DataFrame.from_dict(aggregated).T

    summary_axis = aggregated_df.plot(kind='bar', ec='white', align='edge',
                                      width=0.8, ax=ax[len(to_plot)])
    summary_axis.set(xlabel=xlabel, ylabel=ylabel,
                     title=title_format.format(all_key))

    format_xticks(summary_axis, exclude_final=True)
    

def empty_pair_dict_list():
    """Build a fresh dictionary mapping every pair type to its own empty list.

    :returns: A dictionary with each pair type and an empty list
    :rtype: Dictionary

    """
    pair_types = ('BKY-BKY', 'BKY-SCM', 'SCM-BKY', 'SCM-SCM')
    # A comprehension gives every key a separate list object
    return {pair_type: [] for pair_type in pair_types}

def empty_pair_dict_count():
    """Build a fresh dictionary mapping every pair type to a count of 0.

    :returns: A dictionary with each pair type and a count of 0
    :rtype: Dictionary

    """
    pair_types = ('BKY-BKY', 'BKY-SCM', 'SCM-BKY', 'SCM-SCM')
    return dict.fromkeys(pair_types, 0)

def print_stats(dictionary):
    """Print the length, mean, and standard deviation of a dictionary's values

    Each value of `dictionary` is a list of numbers. A statistic that is
    undefined for a list (the mean of an empty list, the standard deviation
    of fewer than two numbers) is printed as ``nan`` rather than aborting the
    report for the remaining keys.

    :param dictionary: The dictionary to print statistics of.

    """
    undefined = float('nan')

    for key, counts in dictionary.items():
        size = len(counts)
        # statistics.mean needs at least one data point and
        # statistics.stdev at least two; both raise StatisticsError otherwise.
        mean = stat.mean(counts) if size > 0 else undefined
        deviation = stat.stdev(counts) if size > 1 else undefined

        print("{} count: {}\n{} mean: {}\n{} standard deviation: {}\n"
              .format(key, size,
                      key, mean,
                      key, deviation))
        
def get_ext(name):
    """Return the upper-cased extension of a file name

    The extension is the text between the first and the second dot of
    `name` (or everything after the first dot when there is only one).

    :param name: The name of the file
    :returns: The extension
    :rtype: String

    """
    # Only the piece after the first dot matters, so stop splitting after it
    pieces = name.split('.', 2)
    extension = pieces[1]
    return extension.upper()

def normalize_data(data, bins):
    """Bin `data` and return the share of data points that fall in each bin.

    :param data: The values to bin
    :param bins: The bins, passed straight to ``numpy.histogram``
    :returns: Dataframe indexed by bin edge (index named ``0``) with a single
              column ``1`` holding each bin's share; the final edge gets a
              share of 0
    :rtype: pandas.DataFrame

    """
    counts, edges = np.histogram(data, bins=bins)
    total = counts.sum()

    # Every data point has to have landed in one of the bins
    assert len(data) == total

    # There is one more edge than there are bins, so pad the shares with a
    # trailing zero to give the last edge a row as well.
    shares = np.append(counts / total, 0)

    # NOTE: the shares are not asserted to sum to 1; they are floats and the
    # values are binned anyway, so exact accuracy is not required here.

    return pd.DataFrame(list(zip(edges, shares))).set_index(0)

def normalize_aggregated_dict_data(dict_data, delta_bin):
    """Bin the values of every key together and normalize over all of them.

    Bin edges start at 0 and advance by `delta_bin`. Each value is counted in
    the bin of the largest edge that does not exceed it, under the key it
    came from. The counts are then divided by the total number of values
    across all keys, so the shares of all keys and bins together sum to 1.
    If there are no values at all, the all-zero counts are returned as is.

    :param dict_data: Dictionary of lists of values; keyed by pair type when
                      it contains ``'SCM-SCM'``, by save type otherwise
    :param delta_bin: The width of each bin
    :returns: Dictionary mapping each bin edge to a dictionary of the share
              per key
    :rtype: Dictionary
    :raises ValueError: If a value is smaller than every edge (below 0)

    """
    # Extend the edges half a bin past the first edge above 1, so that one
    # extra tick is included over 1 (values of 1 get a bin of their own) but
    # there is no more than one excess tick.
    edge_upper_bound = 1 + (3/2 * delta_bin)
    edges = np.arange(0, edge_upper_bound, delta_bin)

    empty_dict_func = empty_pair_dict_count if 'SCM-SCM' in dict_data else empty_save_dict_count

    edge_vals = {edge: empty_dict_func() for edge in edges}
    total = 0
    for key, values in dict_data.items():
        for value in values:
            current_edge = max(edge for edge in edges if value >= edge)

            edge_vals[current_edge][key] += 1
            total += 1

    # Nothing to normalize; dividing would raise ZeroDivisionError
    if total == 0:
        return edge_vals

    for counts in edge_vals.values():
        for key in counts:
            counts[key] /= total

    return edge_vals

def format_xticks(ax, exclude_final=False):
    """Shorten the x tick labels of an axis.

    Every label is parsed as a number, rounded to two decimals and cut to at
    most four characters. With `exclude_final` set, the last label is blanked.

    :param ax: The matplotlib axis whose x tick labels are rewritten
    :param exclude_final: Whether to replace the last label with an empty one

    """
    labels = []
    for tick in ax.get_xticklabels():
        rounded = round(float(tick.get_text()), 2)
        labels.append(str(rounded)[:4])

    if exclude_final:
        labels[-1] = ''

    ax.set_xticklabels(labels)
    
def get_time_since_beginning(times, index):
    """Return the seconds between the first time in a list and the time at a given index

    :param times: The list of times
    :param index: The index of the time to look at
    :returns: The difference in time, in seconds
    :rtype: float

    """
    elapsed = times[index] - times[0]
    return elapsed.total_seconds()

def empty_save_dict_count():
    """Build a fresh dictionary mapping every save type to a count of 0.

    :returns: A dictionary with each save type and a count of 0
    :rtype: Dictionary

    """
    save_types = ('SCM', 'BKY')
    return dict.fromkeys(save_types, 0)

    
def get_project_sessions(folder, keep_duplicates=True, time_interval=TWO_HOURS):
    """Return a dictionary of each project backup session

    Split the backups of every project directory in `folder` into sessions.
    A new session starts whenever more than `time_interval` seconds pass
    between two consecutive backups. Projects without any backups are left
    out. Each key of the result is the second-to-last component of the
    project path, a ``/`` and the project path itself; each value is a list
    of sessions, where a session is a dictionary holding the list of times
    created (``'time'``) and the list of backup names (``'name'``). So we
    return a dictionary of lists of dictionaries of lists.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data.
    :param time_interval: The threshold, in seconds, that defines a single session.
    :returns: Dictionary of each project backup session.
    :rtype: Dictionary.

    """
    all_sessions = {}

    for project in get_directories(folder):
        backups = get_backups(project, keep_duplicates=keep_duplicates)

        if backups.size == 0:
            continue

        names = list(backups['name'])
        times = list(backups['time'])

        sessions = []
        first = 0

        # `boundary` is the index of the backup that would open a new session
        for boundary in range(1, len(times)):
            gap = times[boundary] - times[boundary - 1]
            if gap.total_seconds() > time_interval:
                sessions.append({'time': times[first:boundary],
                                 'name': names[first:boundary]})
                first = boundary

        # Whatever is left after the last gap forms the final session
        sessions.append({'time': times[first:],
                         'name': names[first:]})

        user = project.split('/')[-2]
        all_sessions[user + '/' + project] = sessions

    return all_sessions

def get_time_between_saves(folder, keep_duplicates=True):
    """Return the time between every pair of consecutive backups of each project.

    The differences of all project directories in `folder` are collected
    into one flat list, project after project.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data.
    :returns: The differences between consecutive backups, in seconds
    :rtype: List

    """
    diffs = []

    for project in get_directories(folder):
        backups = get_backups(project, keep_duplicates=keep_duplicates)

        if backups.size == 0:
            continue

        times = list(backups['time'])

        # Pair every backup time with the one that follows it
        diffs.extend((later - earlier).total_seconds()
                     for earlier, later in zip(times, times[1:]))

    return diffs


def get_all_times_and_names(project):
    """Get a list of names and times backups files were created in a project.

    A project is a sequence of sessions, each one a dictionary with the
    times created (``'time'``) and the names (``'name'``) of the backups in
    that session. The sessions are flattened, in order, into one list of
    times and one list of names. Returns a tuple of the form
    `(times, names)`.

    :param project: The individual project.
    :returns: The times and names backup files were created
    :rtype: Tuple

    """
    times, names = [], []

    for session in project:
        names += session['name']
        times += session['time']

    return times, names

def get_backup_pairs(folder, keep_duplicates=True):
    """Return how often each different backup file pair occurs.

    Sums the per-project pair counts of every project directory in `folder`.

    :param folder: The folder where the backup files are contained.
    :param keep_duplicates: Whether to store duplicate data
    :returns: Dictionary with how often each type of backup pair occurs.
    :rtype: Dictionary

    """
    totals = empty_pair_dict_count()

    for project in get_directories(folder):
        per_project = get_backup_pairs_per_project(project, keep_duplicates)

        for pair_type in per_project:
            totals[pair_type] += per_project[pair_type]

    return totals

In [None]:
import numpy as np
# Draw three arrays of standard-normal samples with 10000, 5000 and 2000
# entries; the list is only the cell's displayed value and is not stored.
list(map(np.random.randn, (10000, 5000, 2000)))