In [None]:
%run scm_bky_utilities.ipynb
import scipy

In [None]:
# Bin count handed to normalize_data(...) for every save distribution below; the
# aggregated-saves plot also derives its tick spacing from it (100 / N_BINS).
N_BINS = 20

In [None]:
def plot_session_counts(folder, keep_duplicates=True):
    """Plot the number of sessions for each project.

    Draws a histogram with one unit-wide bin per integer session count and
    hands the figure to ``save_or_display``.

    :param folder: Where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data points.
    :raises ValueError: If no session counts are found for ``folder``.

    """
    session_counts = get_session_counts(folder, keep_duplicates=keep_duplicates)
    if not session_counts:
        # max() on an empty list raises an opaque ValueError; say what is wrong.
        raise ValueError("No project sessions found in {!r}".format(folder))
    mx = max(session_counts)

    fig = plt.figure(figsize=(12.8, 6.4), constrained_layout=True)
    ax = fig.subplots()
    # Edges 0, 1, ..., mx + 1 give every integer count its own bin. Stopping at
    # mx would merge the counts mx - 1 and mx, because the last histogram bin
    # is closed on both sides.
    bins = np.arange(mx + 2)

    ax.hist(session_counts, bins, alpha=0.5, density=False)
    ax.set(xlabel="Number of Sessions", ylabel="Frequency",
           title="Distribution of Number of Sessions for Projects")

    save_or_display("Distribution of Number of Sessions for Projects")

def get_session_counts(folder, keep_duplicates=True):
    """Count the sessions recorded for every project.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data points.
    :returns: One entry per project, each the number of sessions found for it.
    :rtype: List.

    """
    sessions_by_project = get_project_sessions(folder, keep_duplicates=keep_duplicates)

    return list(map(len, sessions_by_project.values()))

def get_distribution_saves_per_session(folder, interval=TWO_HOURS, rlength=False):
    """Get where in its session each intermediate save happened, as a fraction.

    For every session the save times are sorted, and each save other than the
    first and the last contributes
    ``get_time_since_beginning(times, i) / get_time_since_beginning(times, -1)``.

    :param folder: The folder where the backups are stored.
    :param interval: Passed to ``get_project_sessions`` as ``time_interval``.
    :param rlength: Unused; kept so existing callers keep working.
    :returns: A DataFrame built from the list of fractions, one row per
        intermediate save.
    :rtype: pandas.DataFrame.

    """
    sessions = get_project_sessions(folder, keep_duplicates=True, time_interval=interval)
    percents = []
    for project in sessions.values():
        for session in project:
            times = sorted(session['time'])
            if len(times) < 3:
                # No saves strictly between the first and the last one.
                continue

            delta_t = get_time_since_beginning(times, -1)
            if not delta_t:
                # Duplicates are kept (keep_duplicates=True), so every save in a
                # session can share one timestamp. Such a zero-length session
                # would divide by zero below, so it is skipped.
                continue

            for i in range(1, len(times) - 1):
                percents.append(get_time_since_beginning(times, i) / delta_t)

    return pd.DataFrame(percents)


def plot_dist_sub_counts(folder):
    """Plot the distribution of submission counts as a histogram, irrespective of project.

    Every directory under ``folder`` that holds at least one file whose
    extension contains ``BACKUP`` and whose name contains ``SCM`` contributes
    one data point: its number of such files. The same data is drawn in three
    stacked histograms with different bin edges.

    :param folder: The folder where backups are stored

    """
    sub_counts = []

    for _, _, files in os.walk(folder):
        times = 0

        for f in files:
            file_name, file_ext = os.path.splitext(f)

            if BACKUP in file_ext and SCM in file_name:
                times += 1

        # Directories without any matching file are not submissions at all.
        if times:
            sub_counts.append(times)

    fig = plt.figure(figsize=(12.9, 12.9))
    ax = fig.subplots(3, 1)

    fig.suptitle("Distribution of Submission Counts")

    bins = [np.linspace(0, 150, 150),
            np.linspace(0, 150, 50),
            np.linspace(0, 50, 50)]

    for axis, edges in zip(ax, bins):
        axis.hist(sub_counts, edges, alpha=0.5, density=False)

        # The counted quantity goes on x and how often it occurs on y.
        axis.set(xlabel="Number of Submissions", ylabel="Frequency")

    save_or_display("Distribution of Submission Counts")

In [None]:
# `c` is not defined in this notebook (it comes from the %run'd utilities).
# Presumably to_file=False makes save_or_display show figures inline instead of
# writing them to disk - TODO confirm in scm_bky_utilities.
c.to_file = False
plot_session_counts(AI_DATA, keep_duplicates=True)

In [None]:
# Histograms of the number of matching backup files per directory under AI_DATA.
plot_dist_sub_counts(AI_DATA)

In [None]:
# Histogram of where intermediate saves fall within their session, using the
# function's default interval.
distribution_saves_per_session = get_distribution_saves_per_session(AI_DATA)
ax = distribution_saves_per_session.plot.hist(ec='white', bins=20)
ax.set(xlabel='Proportion of session finished')
save_or_display("Distribution of saves per session")

Explore different session time intervals (in 10-minute steps, from 5 to 115 minutes)

In [None]:
c.to_file = False
# One histogram per interval; the interval is passed as minutes * 60.
for minutes in range(5, 121, 10):
    print(minutes)
    distribution_saves_per_session = get_distribution_saves_per_session(
        AI_DATA, interval=minutes * 60)
    ax = distribution_saves_per_session.plot.hist(ec='white', bins=20)
    ax.set(xlabel='Proportion of session finished')
    save_or_display("Distribution of saves per session")


In [None]:
# Normalized save distributions for the "short" group: intervals of 1-4 minutes.
# NOTE(review): the matching count cell further down sweeps range(1, 6) and the
# medium group starts at 6, so the 5-minute interval is in neither group here -
# confirm whether this should be range(1, 6).
short = [
    normalize_data(get_distribution_saves_per_session(AI_DATA, interval=minutes * 60), N_BINS)
    for minutes in range(1, 5)
]

# Median across intervals for each row index, scaled by 100.
df_concat = pd.concat(short)
by_row_index = df_concat.groupby(df_concat.index)
df_medians_short = by_row_index.median() * 100
df_medians_short

In [None]:
# Normalized save distributions for the "medium" group: intervals of 6-19 minutes.
medium = [
    normalize_data(get_distribution_saves_per_session(AI_DATA, interval=minutes * 60), N_BINS)
    for minutes in range(6, 20)
]

# Median across intervals for each row index, scaled by 100.
df_concat = pd.concat(medium)
by_row_index = df_concat.groupby(df_concat.index)
df_medians_med = by_row_index.median() * 100
df_medians_med

In [None]:
# Normalized save distributions for the "large" group: intervals from 20 up to
# 1430 minutes in steps of 10 (142 calls to get_distribution_saves_per_session).
large = [
    normalize_data(get_distribution_saves_per_session(AI_DATA, interval=minutes * 60), N_BINS)
    for minutes in range(20, 1440, 10)
]

# Median across intervals for each row index, scaled by 100.
df_concat = pd.concat(large)
by_row_index = df_concat.groupby(df_concat.index)
df_medians_large = by_row_index.median() * 100
df_medians_large

In [None]:
# Number of interval distributions collected in each of the three groups.
print(len(short), len(medium), len(large))

In [None]:
# Side-by-side bar chart of the median save distributions of the three groups.
new_df = df_medians_short.copy()
new_df.columns = ['short']
new_df['medium'] = df_medians_med[1]
new_df['large'] = df_medians_large[1]
ax = new_df.plot(kind='bar', width=1, ec='white', align='edge')
ax.set(xlabel="Proportion of session finished")
ax.set(ylabel='Percentage of Saves Out of the \n Total Number of Saves in a Session')
# Derive the tick step from N_BINS (as the aggregated-saves plot does) instead
# of hard-coding 5, so the labels stay in sync if the bin count changes.
ticks = [str(i)[0:4] for i in np.arange(0, 101, int(100 / N_BINS))]
ax.set_xticklabels(ticks)
save_plot("sml", graph_type="")

In [None]:
# Sum, over the 1-5 minute intervals, of len() of what get_project_sessions returns.
# NOTE(review): get_session_counts iterates that result's .values() per project,
# so this looks like a count of projects rather than of sessions - TODO confirm.
l = sum(
    len(get_project_sessions(AI_DATA, keep_duplicates=True, time_interval=minutes * 60))
    for minutes in range(1, 6)
)
print(l)

In [None]:
# Sum, over the 6-19 minute intervals, of len() of what get_project_sessions returns.
# NOTE(review): get_session_counts iterates that result's .values() per project,
# so this looks like a count of projects rather than of sessions - TODO confirm.
l = sum(
    len(get_project_sessions(AI_DATA, keep_duplicates=True, time_interval=minutes * 60))
    for minutes in range(6, 20)
)
print(l)

In [None]:
# Sum, over intervals of 20, 120, ..., 1420 minutes, of len() of what
# get_project_sessions returns.
# NOTE(review): the "large" aggregation cell steps by 10 minutes, this one by
# 100 - confirm which step is intended. As in get_session_counts, the result's
# .values() are per project, so this looks like a project count - TODO confirm.
l = sum(
    len(get_project_sessions(AI_DATA, keep_duplicates=True, time_interval=minutes * 60))
    for minutes in range(20, 1440, 100)
)
print(l)

Explore longer session time intervals (every whole hour from 3 to 23 hours)

In [None]:
# One 100-bin histogram per interval; the interval is passed as hours * 60 * 60.
for hours in range(3, 24):
    print(hours)
    distribution_saves_per_session = get_distribution_saves_per_session(
        AI_DATA, interval=hours * 60 * 60)
    ax = distribution_saves_per_session.plot.hist(ec='white', bins=100)
    ax.set(xlabel='Proportion of session finished')
    save_or_display("Distribution of saves per session")

In [None]:
# Start the collection with the normalized distributions for 10-39 minute intervals.
all_distributions = []
for minutes in range(10, 40):
    print(minutes)
    all_distributions.append(
        normalize_data(
            get_distribution_saves_per_session(AI_DATA, interval=minutes * 60),
            N_BINS,
        )
    )


In [None]:
# Add the 1-9 minute intervals. 10 is excluded on purpose: the sweep over
# range(10, 40) that builds all_distributions already appended it, and a second
# copy would be counted twice in the aggregated median.
# Re-running this cell appends the same distributions again; rebuild
# all_distributions first.
for i in range(1, 10):
    print(i)
    distributions = get_distribution_saves_per_session(AI_DATA, interval=i*60)
    all_distributions.append(normalize_data(distributions, N_BINS))

In [None]:
# Add the 40-minute interval on its own.
forty_minute_saves = get_distribution_saves_per_session(AI_DATA, interval=40*60)
all_distributions.append(normalize_data(forty_minute_saves, N_BINS))

In [None]:
# Sanity check: how many interval distributions have been collected so far.
print(len(all_distributions))

In [None]:
# Median across all collected distributions for each row index, scaled by 100.
df_concat = pd.concat(all_distributions)
by_row_index = df_concat.groupby(df_concat.index)
df_medians = by_row_index.median()*100

ax = df_medians.plot(kind='bar', width=1, ec='white', align='edge')
ax.set(xlabel='Percentage of Session Finished',
       ylabel='Percentage of Saves Out of the \nTotal Number of Saves in a Session')
ax.get_legend().remove()
# Tick labels every 100 / N_BINS percent, truncated to at most four characters.
ticks = [str(i)[0:4] for i in np.arange(0, 101, int(100 / N_BINS))]
ax.set_xticklabels(ticks)
save_plot("Aggregated saves", graph_type="")

In [None]:
# Kruskal-Wallis test comparing column 1 of the "extreme" rows with the "center" rows.
df_concat = pd.concat(all_distributions)
df_concat['i'] = df_concat.index

# NOTE(review): the lower cut-off is 0.5 while the upper one is 0.95, so
# "extremes" covers every row with index <= 0.5 - confirm this was not meant
# to be 0.05.
is_extreme = (df_concat['i'] <= 0.5) | (df_concat['i'] >= 0.95)
is_center = (df_concat['i'] > 0.5) & (df_concat['i'] < 0.95)
extremes = df_concat[is_extreme]
center = df_concat[is_center]

scipy.stats.mstats.kruskalwallis(extremes[1].values, center[1].values)

In [None]:
# Result of get_time_between_saves for AI_DATA (defined outside this notebook);
# the two cells below plot it as histograms.
times = get_time_between_saves(AI_DATA)

In [None]:
# Fine-grained histogram of `times`, with the view restricted to 0-7200 on x.
fig, ax = plt.subplots()
ax.hist(times, bins=13000)
ax.set_xlim(0, 7200)


In [None]:
# Convert to an array so the values can be filtered with a boolean mask.
times = np.array(times)

fig, ax = plt.subplots()
# Zoom in on values below 1000 only.
below_1000 = times < 1000
ax.hist(times[below_1000], bins=50)