In [None]:
%run scm_bky_utilities.ipynb
from sklearn.cluster import KMeans

In [None]:
def plot_pair_dist(folder, keep_duplicates=True):
    """Plot the distribution of backup pair-type ratios across projects.

    For every project under `folder`, fetch the ratio of each backup pair
    type to the project's total number of pairs, group the ratios by pair
    type, print their statistics with `print_stats` and pass them to
    `plot_dist` together with five stacked axes and 20 equal-width bins
    on [0, 1].

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data.

    """
    # Order in which `get_all_pair_dist` is expected to list the ratios of
    # one project (index 0..3).
    pair_types = ('BKY-BKY', 'BKY-SCM', 'SCM-BKY', 'SCM-SCM')
    pair_ratios = empty_pair_dict_list()

    # Honour the caller's `keep_duplicates` choice instead of forcing True.
    pairs_dist = get_all_pair_dist(folder, keep_duplicates=keep_duplicates,
                                   ratio=True)

    for project_ratios in pairs_dist:
        for pair_type, value in zip(pair_types, project_ratios):
            # A ratio of exactly 0 means the pair type never occurred in the
            # project and exactly 1 means it was the only type that occurred;
            # both are left out. Presumably this keeps those degenerate
            # projects from dominating the edge bins -- TODO confirm.
            if value != 0 and value != 1:
                pair_ratios[pair_type].append(value)

    print_stats(pair_ratios)

    fig = plt.figure(figsize=(12, 20), constrained_layout=True)
    ax = fig.subplots(5, 1)
    bins = np.linspace(0, 1.0, 21)

    plot_dist(pair_ratios, ax, bins,
              "Ratio of the Number of Submission Pairs of Type {} to the Total Number of All Pair Types",
              "All Types")
    
def get_all_pair_dist(folder, keep_duplicates=True, ratio=False):
    """Collect the backup pair counts, or pair ratios, of every project.

    The pair counts of each project directory in `folder` are gathered
    first. With `ratio` false they are returned as-is, one dictionary per
    project. With `ratio` true each project is instead turned into a list
    holding every pair count divided by the sum of that project's pair
    counts; a sum of zero is replaced by one so the division is always
    defined. This function is meant to be run on the `STORAGE` folder
    where all data is located.

    :param folder: The folder where the backups are saved.
    :param keep_duplicates: Whether to keep duplicate data.
    :param ratio: Whether to return a list of ratios.
    :returns: A list of ratios or pair counts.
    :rtype: List

    """
    def as_ratios(counts):
        # Floor the denominator at one to avoid dividing by zero.
        denominator = max(sum(counts.values()), 1)
        return [count / denominator for count in counts.values()]

    counts_per_project = (
        get_backup_pairs_per_project(project_dir,
                                     keep_duplicates=keep_duplicates)
        for project_dir in get_directories(folder)
    )
    # Only keep projects whose pair dictionary is truthy.
    collected = [counts for counts in counts_per_project if counts]

    if ratio:
        return [as_ratios(counts) for counts in collected]

    return collected

def get_backup_pairs_per_project(folder, keep_duplicates=True):
    """Count how often each consecutive backup pair type occurs in a project.

    Every backup name is paired with the name that follows it and the
    counter stored under the pair's `TYPE-TYPE` key is incremented.

    :param folder: The folder where the backup files are contained
    :param keep_duplicates: Whether to store duplicate data
    :returns: Dictionary with how often each type backup pair occurs
    :rtype: Dictionary

    """
    counts = empty_pair_dict_count()

    try:
        names = get_backups(folder, keep_duplicates=keep_duplicates)['name']
        last_index = len(names) - 1

        for first in range(last_index):
            key = get_pair_string(names, first)
            counts[key] += 1
    except TypeError:
        # Best effort: whatever was counted before the error is returned.
        # Presumably this covers `get_backups` yielding nothing
        # subscriptable for a folder without backups -- TODO confirm.
        pass

    return counts

def get_pair_string(names, i):
    """Build the `TYPE-TYPE` key for two neighbouring backup files.

    The key is the extension (as reported by `get_ext`) of the file at
    index `i`, a hyphen, and the extension of the file at index `i + 1`.
    According to the pair dictionaries used in this file, `TYPE` is
    either "SCM" or "BKY".

    :param names: The names of the files.
    :param i: The index of the first file of the pair.
    :returns: The string to access a pair count/list in a dictionary.
    :rtype: String

    """
    first_ext = get_ext(names[i])
    second_ext = get_ext(names[i + 1])

    return '-'.join((first_ext, second_ext))

def plot_pair_session_time_dist(folder, keep_duplicates=True):
    """Plot where within a session each type of backup pair occurs.

    Every project is split into sessions; for each session the relative
    time position of every backup pair is recorded under the pair's type.
    The collected ratios are printed with `print_stats` and passed to
    `plot_dist` with five stacked axes and 20 bins.

    :param folder: The folder where the backups are stored
    :param keep_duplicates: Whether to keep duplicate data

    """
    ratios_by_pair = empty_pair_dict_list()
    sessions_by_project = get_project_sessions(folder,
                                               keep_duplicates=keep_duplicates)

    # Flatten {project: [session, ...]} into one stream of sessions.
    every_session = (session
                     for project_sessions in sessions_by_project.values()
                     for session in project_sessions)

    for session in every_session:
        set_pair_string_ratios(list(session['name']),
                               list(session['time']),
                               ratios_by_pair)

    print_stats(ratios_by_pair)

    figure = plt.figure(figsize=(12.8, 14), constrained_layout=True)
    axes = figure.subplots(5, 1)
    plot_dist(ratios_by_pair, axes, 20,
              "Relative Time Placement of {} Saves", "All Pairs",
              xlabel="Percent session finished")
    
def set_pair_string_ratios(names, times, ratios):
    """Append the relative time position of every backup pair to `ratios`.

    For each backup except the last one, the value of
    `get_time_since_beginning` at that backup is divided by its value at
    the last backup, and the quotient is appended to the list stored under
    the key of the pair formed with the following backup. Nothing is added
    when the value at the last backup is falsy (e.g. zero). The dictionary
    should be of the form {'BKY-BKY': [], 'BKY-SCM': [], 'SCM-BKY': [],
    'SCM-SCM': []} and is modified in place.

    :param names: The list of names of the backups.
    :param times: The times the backups were created.
    :param ratios: The dictionary to add the ratios to.

    """
    span = get_time_since_beginning(times, -1)

    if span:
        for index in range(len(times) - 1):
            elapsed = get_time_since_beginning(times, index)
            pair_key = get_pair_string(names, index)
            ratios[pair_key].append(elapsed / span)
        
def plot_pair_times(folder, keep_duplicates=True, num_bins=10):
    """Plot a histogram of how far through a project backup pairs occur.

    Within an entire project, look at how far along (time-wise) backup
    pairs occur. The first backup submission is the "zero" point and the
    last one marks the total time spent on the project; a pair is placed
    at the halfway point between its two backups, expressed as a fraction
    of the total time. Projects whose total time is falsy are skipped.
    Data suggests that more BKY-BKY (logic file) pairs occur toward the
    end of a project and more SCM-SCM (layout file) pairs at the
    beginning. This is consistent with the hypothesis that people start
    off working on layout and work on logic at the end.

    :param folder: The folder where the backup files are located.
    :param keep_duplicates: Whether to keep duplicate data.
    :param num_bins: The number of bins to use for the histogram.

    """
    sessions_by_project = get_project_sessions(folder,
                                               keep_duplicates=keep_duplicates)
    ratios_by_pair = empty_pair_dict_list()

    for project_sessions in sessions_by_project.values():
        times, names = get_all_times_and_names(project_sessions)
        project_length = get_time_since_beginning(times, -1)

        if project_length:
            for index in range(len(times) - 1):
                later = get_time_since_beginning(times, index + 1)
                earlier = get_time_since_beginning(times, index)
                midpoint = (later + earlier) / 2

                pair_key = get_pair_string(names, index)
                ratios_by_pair[pair_key].append(midpoint / project_length)

    figure = plt.figure(figsize=(12.8, 12.8), constrained_layout=True)
    axes = figure.subplots(5, 1)
    bin_edges = np.linspace(0, 1, num_bins + 1)

    print_stats(ratios_by_pair)

    plot_dist(ratios_by_pair, axes, bin_edges,
              "Relative Time Place of {} Saves", "All Pairs",
              xlabel="Amount of Project Finished")
    
def get_pair_times_per_project(folder, keep_duplicates=True):
    """Collect the backup pair time ratios of every project separately.

    Each project contributes one pair dictionary filled by
    `set_pair_string_ratios` from all of the project's backup names and
    times. Projects whose total time is falsy (e.g. zero) are left out.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data.
    :returns: The time ratios for all backup pairs, one dictionary per project.
    :rtype: List

    """
    sessions_by_project = get_project_sessions(folder,
                                               keep_duplicates=keep_duplicates)
    per_project = []

    for project_sessions in sessions_by_project.values():
        project_ratios = empty_pair_dict_list()
        times, names = get_all_times_and_names(project_sessions)

        if get_time_since_beginning(times, -1):
            set_pair_string_ratios(names, times, project_ratios)
            per_project.append(project_ratios)

    return per_project
    
def discretize_pair_times(folder, keep_duplicates=True,
                            bins=10, normalized=False):
    """Turn each project's pair time ratios into a histogram vector.

    For every project, the time ratios of each backup pair type are
    counted into `bins` equal-width bins on [0, 1]. The per-type
    histograms are concatenated in the iteration order of the project's
    pair dictionary, giving one vector of `number of pair types * bins`
    counts per project. If `normalized` is true, every count is divided
    by the total number of ratios of the project; a project without any
    ratios keeps its all-zero vector.

    :param folder: The folder where the backups are stored.
    :param keep_duplicates: Whether to keep duplicate data
    :param bins: The number of bins to use for each backup pair.
    :param normalized: Whether to normalize the vectors.
    :returns: The vectors for each project.
    :rtype: List of lists.

    """
    vectors = []
    pair_times = get_pair_times_per_project(folder,
                                            keep_duplicates=keep_duplicates)

    for project in pair_times:
        vector = [0] * (len(project) * bins)
        total = 0

        for offset, pair in enumerate(project.values()):
            for time_ratio in pair:
                # A ratio of exactly 1.0 would yield index `bins`, i.e. the
                # first bin of the next pair type (or an IndexError for the
                # last one), so it is folded into the final bin instead.
                bin_index = min(int(time_ratio * bins), bins - 1)
                vector[offset * bins + bin_index] += 1
                total += 1

        # Guard against dividing by zero when a project has no ratios.
        if normalized and total:
            vector = [count / total for count in vector]

        vectors.append(vector)

    return vectors


def cluster_vectors(folder, keep_duplicates=True, bins=10):
    """Attempt to cluster discretized backup pair ratio values.

    The normalized histogram vectors of all projects are clustered with
    k-means into eight clusters. For each cluster, the summed Euclidean
    distance of its members to the cluster center and the number of
    members are printed.

    :param folder: The folder where the backups are stored
    :param keep_duplicates: Whether to keep duplicate data
    :param bins: The number of bins to use for each backup pair vector.

    """
    vectors = discretize_pair_times(folder, keep_duplicates=keep_duplicates,
                                    bins=bins, normalized=True)

    n_clusters = 8
    clust = KMeans(n_clusters=n_clusters).fit(vectors)
    centers = clust.cluster_centers_
    labels = clust.labels_
    separated = [[] for _ in range(n_clusters)]

    # `labels` holds one cluster index per input vector, so walk over ALL
    # vectors (not just the first `n_clusters` of them) when grouping.
    for label, vector in zip(labels, vectors):
        separated[label].append(vector)

    for center, members in zip(centers, separated):
        dist = sum([np.linalg.norm(np.asarray(v) - center) for v in members])

        print(dist, len(members))

# TODO: ask how this works and what it is used for.
def get_2D_reps(folder, keep_duplicates=True):
    """Reduce every project's three-bin histogram vector to two entries.

    The pair times are discretized with three bins per pair type and
    normalized. From each resulting vector the entries at index 10 and
    index 2 are kept, in that order. Assuming the pair types are ordered
    BKY-BKY, BKY-SCM, SCM-BKY, SCM-SCM, these are the share of SCM-SCM
    submissions in the middle third and the share of BKY-BKY submissions
    in the last third.

    :param folder: The folder where the backups are stored
    :param keep_duplicates: Whether to store duplicate data
    :returns: The 2D representations.
    :rtype: List of lists

    """
    discretized = discretize_pair_times(folder,
                                        keep_duplicates=keep_duplicates,
                                        bins=3, normalized=True)

    return [[vector[10], vector[2]] for vector in discretized]

def PCA_vectors(folder, keep_duplicates=True, bins=10):
    """Plot a PCA representation of the data.

    The normalized histogram vectors of all projects are projected onto
    their first two principal components and drawn as a dot plot, one
    red dot per project, with the first component on the x axis and the
    second on the y axis.

    :param folder: The folder where the backups are stored
    :param keep_duplicates: Whether to keep duplicate data
    :param bins: How many bins to use for each backup pair in the discretized pairs

    """
    # Imported locally because this file only imports KMeans from sklearn.
    from sklearn.decomposition import PCA

    vectors = discretize_pair_times(folder, keep_duplicates=keep_duplicates,
                                    bins=bins, normalized=True)
    pca = PCA(n_components=2)
    transformed = pca.fit(vectors).transform(vectors)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.subplots()
    # Plot component 1 against component 2. Handing the whole 2-D array to
    # `plot` would draw each component against the row index instead.
    ax.plot(transformed[:, 0], transformed[:, 1], 'ro')
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")

In [None]:
# Plot the pair-type ratio distributions for AI_DATA, requesting that
# duplicate data be dropped. AI_DATA is not defined in this file; presumably
# it comes from scm_bky_utilities.ipynb.
plot_pair_dist(AI_DATA, keep_duplicates=False)

In [None]:
# Plot where within a session each backup pair type occurs, for AI_DATA
# with duplicate data kept.
plot_pair_session_time_dist(AI_DATA, keep_duplicates=True)

In [None]:
# Plot where within a whole project each backup pair type occurs, using
# six histogram bins.
plot_pair_times(AI_DATA, keep_duplicates=True, num_bins=6)

In [None]:
# With a single bin per pair type, each normalized vector is the share of
# every pair type among the project's recorded pair ratios.
print(discretize_pair_times(AI_DATA, keep_duplicates=True, bins=1, normalized=True))

In [None]:
# Cluster the projects on their histogram vectors with five bins per pair type.
cluster_vectors(AI_DATA, keep_duplicates=True, bins=5)

In [None]:
# Build the two-entry representation of every project in AI_DATA.
two_reps = get_2D_reps(AI_DATA, keep_duplicates=True)

# Show the first ten representations, one per line.
print(*two_reps[:10], sep='\n')

fig = plt.figure(figsize=(10, 10))
ax = fig.subplots()
# Draw one dot per project: first entry on the x axis, second on the y axis.
# Passing the list of pairs directly would plot each entry against the
# project index instead of plotting the 2-D points.
ax.plot([rep[0] for rep in two_reps], [rep[1] for rep in two_reps], 'ro')

In [None]:
# Plot the PCA projection of the projects' histogram vectors with ten bins
# per pair type.
PCA_vectors(AI_DATA, keep_duplicates=True, bins=10)