In [None]:
%run scm_bky_utilities.ipynb
from utilities import *
import scipy
# NOTE(review): `c` is not defined in this notebook; presumably it is a shared
# config object provided by the utilities loaded above, and to_file=False makes
# save_or_display show figures instead of writing them to disk — confirm.
c.to_file = False

In [None]:
def plot_timeline(folder, keep_duplicates=True, max_gap=18000, compression=0.9):
    """Plot real-spaced and compressed timelines which show change in BKY vs SCM backup files.

    Three stem plots are drawn from the same backups: one on the real time
    axis, one where the part of every gap that exceeds ``max_gap`` is shrunk
    by ``compression``, and one where that excess is removed completely.
    Backups whose name contains ``SCM`` point down (level -3), all others
    point up (level 3).

    :param folder: The folder where the backup files are contained.
    :param keep_duplicates: Whether to keep duplicate data.
    :param max_gap: Longest gap between consecutive backups, in seconds, that
                    is left untouched (the default 18000 s is 5 hours). Only
                    the part of a gap beyond this value is compressed.
    :param compression: Fraction of the excess removed in the "compressed"
                        plot; the "very compressed" plot always removes all
                        of it.
    :return: None. Nothing is plotted when ``folder`` yields no backups.

    """
    # Work on a private copy so the column assignments below can never write
    # into a view of a frame that get_backups still holds on to.
    df = get_backups(folder, keep_duplicates).copy()

    if df.empty:
        return

    # Positional lookup: the first row is the reference point even when the
    # index does not start at label 0 (e.g. after rows were filtered out).
    start = df['time'].iloc[0]
    df['time'] = df['time'].apply(lambda stamp: (stamp - start).total_seconds())

    # A gap's excess over max_gap shifts every later backup, so the total
    # shift of a backup is the running sum of the excesses before it.
    excess = (df['time'].diff().fillna(0) - max_gap).clip(lower=0)
    removed = excess.cumsum()
    df['reduced_time'] = df['time'] - compression * removed
    df['really_reduced_time'] = df['time'] - removed

    df['level'] = df['name'].apply(lambda name: -3 if SCM in name else 3)

    fig, ax = plt.subplots(3, 1, figsize=(8.8, 8), constrained_layout=True)

    panels = [
        ('time', "Real-Spaced BKY vs SCM Change Timeline (BKY on Top)"),
        ('reduced_time', "Compressed BKY vs SCM Change Timeline (BKY on Top)"),
        ('really_reduced_time', "Very Compressed BKY vs SCM Change Timeline (BKY on Top)"),
    ]

    for axis, (column, title) in zip(ax, panels):
        axis.stem(df[column], df['level'], linefmt="C3-", basefmt="k-")
        axis.get_yaxis().set_visible(False)
        axis.set(title=title)

    save_or_display('BKY vs SCM Change Timeline (BKY on Top)')

def bky_to_scm_per_project(folder, keep_duplicates=True):
    """Average, over all projects, the share of backups that are BKY versus SCM.

    For every project directory under ``folder`` that has at least one backup,
    the fraction of backups whose name contains ``'scm'`` is the project's SCM
    share and the remainder is its BKY share. The per-project shares are then
    averaged with equal weight per project.

    :param folder: The folder which contains one directory per project.
    :param keep_duplicates: Whether to keep duplicate data.
    :return: A ``pd.Series`` indexed by ``'BKY'`` and ``'SCM'`` holding the
             mean per-project share. Both values are NaN when no project has
             any backups.

    """
    usage = {'BKY': [], 'SCM': []}

    for project in get_directories(folder):
        backups = get_backups(project, keep_duplicates=keep_duplicates)

        # Projects without backups have no meaningful share; skip them.
        if not backups.size:
            continue

        total = backups.shape[0]
        # Counted on the fly so the frame from get_backups is left unmodified.
        scm = int(backups['name'].str.contains('scm', regex=False).sum())

        usage['SCM'].append(scm / total)
        usage['BKY'].append((total - scm) / total)

    # Nothing to average: report "no data" instead of dividing by zero.
    if not usage['SCM']:
        return pd.Series({'BKY': float('nan'), 'SCM': float('nan')})

    bky_ratio = sum(usage['BKY']) / len(usage['BKY'])
    scm_ratio = sum(usage['SCM']) / len(usage['SCM'])

    return pd.Series({'BKY': bky_ratio, 'SCM': scm_ratio})

def plot_time_dist(folder, keep_duplicates=True):
    """Plot where in each project's lifetime its backups were saved.

    Every project directory under ``folder`` that has backups is passed to
    ``append_time_ratios``, which adds one ratio per backup (time elapsed
    since the project's start over the project's total time) to the list of
    the backup's file type (BKY or SCM). The collected ratios are summarised
    with ``print_stats`` and drawn with ``plot_dist`` on three stacked axes.

    :param folder: The folder where the backups are located.
    :param keep_duplicates: Whether to keep duplicate data.

    """
    time_ratios = {'BKY': [], 'SCM': []}

    for project in get_directories(folder):
        backups = get_backups(project, keep_duplicates=keep_duplicates)

        # Projects without any backups contribute nothing.
        if not backups.size:
            continue

        append_time_ratios(backups, time_ratios)

    print_stats(time_ratios)

    fig = plt.figure(figsize=(12.8, 12.8), constrained_layout=True)
    ax = fig.subplots(3, 1)

    # NOTE(review): 20 is presumably the number of histogram bins — confirm
    # against plot_dist.
    plot_dist(time_ratios, ax, 20,
              "Relative Time Placement of {} Saves", "BKY & SCM",
              xlabel="Proportion of Project Finished")

    save_or_display('Relative Time Placement of Pair Saves')
    
def append_time_ratios(backups, time_ratios):
    """Record, for every backup, how far into the overall time span it was made.

    Each backup adds one value to the list of its file type: the result of
    ``get_time_since_beginning`` for that backup divided by the result for the
    last backup. Nothing is appended when the value for the last backup is
    falsy (e.g. zero), since no ratio can be formed.

    :param backups: A dataframe representing each backup, containing the
                    file name, time created and the number of each node type.
    :param time_ratios: A dictionary which contains a list of time ratios for
                        BKY and SCM files (of the form {'BKY': [], 'SCM': []}).
                        It is extended in place; its keys must match the
                        values returned by ``get_ext``.

    """
    file_names = list(backups['name'])
    timestamps = list(backups['time'])

    # Index -1 addresses the last backup, i.e. the full span being measured.
    span = get_time_since_beginning(timestamps, -1)
    if not span:
        return

    for position, file_name in enumerate(file_names):
        elapsed = get_time_since_beginning(timestamps, position)
        time_ratios[get_ext(file_name)].append(elapsed / span)
        

def plot_session_time_dist(folder, keep_duplicates=True):
    """Plot where within each work session its backups were saved.

    Every session returned by ``get_project_sessions`` is passed to
    ``append_time_ratios``, so each backup contributes one ratio to the list
    of its file type (BKY or SCM). The collected ratios are summarised with
    ``print_stats``, drawn with ``plot_dist`` on three stacked axes, and the
    figure is handed to ``save_or_display`` like the other plots in this
    notebook.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data.

    """
    time_ratios = {'BKY': [], 'SCM': []}

    sessions = get_project_sessions(folder, keep_duplicates=keep_duplicates)

    # sessions maps each project to an iterable of per-session backup frames.
    for project_sessions in sessions.values():
        for session in project_sessions:
            append_time_ratios(session, time_ratios)

    print_stats(time_ratios)

    fig = plt.figure(figsize=(12.8, 12.8), constrained_layout=True)
    ax = fig.subplots(3, 1)

    plot_dist(time_ratios, ax, 20,
              "Relative Time Placement of {} Saves per Session", "BKY & SCM",
              xlabel="Proportion of Session finished")

    # Without this call the figure bypassed the notebook's save/display switch.
    save_or_display('Relative Time Placement of Pair Saves per Session')
    
def significance_test_bky_scm(folder=None, user_depth=5):
    """Print a paired Wilcoxon signed-rank test of per-user BKY vs SCM backup counts.

    Backups are grouped by user, where the user is the ``user_depth``-th
    component of the backup's ``/``-separated name. For every user the number
    of backups whose name contains ``'bky'`` is paired with the number of
    remaining backups, and the pairs are tested one-sided
    (``alternative='less'``, i.e. BKY counts lower than SCM counts) with
    continuity correction.

    :param folder: The folder to read backups from; defaults to ``AI_DATA``.
    :param user_depth: Index of the user component after splitting a backup
                       name on ``/``. The default of 5 matches the layout the
                       notebook was written for.
    :return: None. The test result is printed.

    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy version.
    from scipy import stats

    backups = get_backups(AI_DATA if folder is None else folder)

    # NOTE(review): 'bky' is matched anywhere in the name, so a directory
    # called e.g. "scm_bky" would mark every file below it as BKY — confirm
    # the data layout or switch to an extension check.
    labelled = backups.assign(
        user=backups['name'].apply(lambda path: path.split("/")[user_depth]),
        is_bky=backups['name'].str.contains('bky', regex=False),
    )

    per_user = labelled.groupby('user')['is_bky']
    bky_nums = per_user.sum().astype(int).tolist()
    totals = per_user.size().tolist()
    scm_nums = [total - n_bky for total, n_bky in zip(totals, bky_nums)]

    print(stats.wilcoxon(bky_nums, scm_nums, correction=True, alternative='less'))
    

In [None]:
# Print the Wilcoxon test comparing per-user BKY and SCM backup counts.
significance_test_bky_scm()

In [None]:
# Draw the three timeline variants for TEST_FOLDER_2 with keep_duplicates=False.
plot_timeline(TEST_FOLDER_2, keep_duplicates=False)

In [None]:
# Bar chart of the mean per-project share of BKY and SCM backups in AI_DATA.
bky_to_scm = bky_to_scm_per_project(AI_DATA, keep_duplicates=False)
ax = bky_to_scm.plot(kind='bar', ec='white')
ax.set(ylabel="Average Usage Per Project")
save_or_display('BKY and SCM Average Usage Per Project')

In [None]:
# Distribution of backup save times relative to each project's duration, for AI_DATA.
plot_time_dist(AI_DATA, keep_duplicates=False)

In [None]:
# Distribution of backup save times relative to each session's duration, for AI_DATA.
plot_session_time_dist(AI_DATA, keep_duplicates=False)