In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
dicts = dicts + [dicts[0]]

In [2]:
# Change filename to the name of the file in model/results you want to plot
filename = 'result_reproduce_fit0_its100000_time20_20_10_36.csv'
data_repr = pd.read_csv(filename)

filename = 'result_no_rec_fit0_its100000_time21_11_16_04.csv'
data_no_rec = pd.read_csv(filename)

data_repr.head(10)


Unnamed: 0,sim,step,fmin,action,fnum,fmean,fstd,fmin.1,fmax,code_size,changes
0,0,0,0.290336,call_method,1,0.290336,0.0,0.290336,0.290336,1,1
1,0,1,0.024148,call_method,1,0.024148,0.0,0.024148,0.024148,2,1
2,0,2,0.942519,call_method,1,0.942519,0.0,0.942519,0.942519,3,1
3,0,3,0.168325,update_method,1,0.168325,0.0,0.168325,0.168325,4,1
4,0,4,0.103051,call_method,1,0.103051,0.0,0.103051,0.103051,5,1
5,0,5,0.22979,call_method,1,0.22979,0.0,0.22979,0.22979,6,1
6,0,6,0.53105,call_method,1,0.53105,0.0,0.53105,0.53105,7,1
7,0,7,0.53105,create_method,2,0.57431,0.04326,0.53105,0.61757,7,1
8,0,8,0.61757,remove_method,1,0.61757,0.0,0.61757,0.61757,0,1
9,0,9,0.042251,call_method,1,0.042251,0.0,0.042251,0.042251,1,1


In [3]:
def find_commits(data, f0, at_final=False):
    """ 
    Returns a list with the timesteps where the fmin >= f0 and at the final timestep
    """
#     commits = [data['step'].iloc[i] for i in range(len(data)) if data['fmin'].iloc[i] >= f0]
    commits = list(data[data['fmin'] >= f0]['step'])

    if at_final and commits[-1] != data['step'].iloc[-1]:
        commits += [data['step'].iloc[-1]]
    
    return commits


def get_pos_neg_lines(data):
    """
    Get absolute positive, negative and total changes in lines in this data
    """
    
    neg_changes = data[data['size_change'] < 0]['size_change'].abs().sum()
    pos_changes = data[data['size_change'] > 0]['size_change'].abs().sum()
    file_change = pos_changes - neg_changes         

    return file_change, pos_changes, neg_changes


def get_pos_neg_changes(data):
    """
    Get absolute positive, negative and total changes in this data
    """
    pos_changes = data[(data['action']!='remove_method') & (data['changes'] >= 0)]['changes'].abs().sum()    
    neg_changes = data[(data['action']=='remove_method') | (data['changes'] < 0)]['changes'].abs().sum()
    file_change = pos_changes - neg_changes         

    return file_change, pos_changes, neg_changes


def get_changes(data, commits):
    """
    Returns list with mean size of steps/lines/changes per commit
    """
    stats_names = ['steps', 'lines_sum', 'lines_pos', 'lines_neg', 'lines_big', 'changes_sum', 'changes_pos', 'changes_neg', 'changes_big']
    stats = {name : [] for name in stats_names}
    
    # Add column with size changes
    data['shift'] = data['code_size'].shift(periods=1, fill_value=0)
    data['size_change'] = data['code_size']-data['shift']
    
    # Iterate over commits
    prev_ind = 0
    for commit in commits:
        data_commit = data.iloc[prev_ind:commit]
        
        stats['steps'].append(commit-prev_ind)
        
        # Get number of changes lines, added and deleted lines
        file_change, change_pos, change_neg = get_pos_neg_lines(data_commit)
        stats['lines_sum'].append(file_change)
        stats['lines_pos'].append(change_pos)
        stats['lines_neg'].append(change_neg)
        stats['lines_big'].append(max(change_pos, change_neg))
        
        # Get the total, positive and negative changes
        file_change, change_pos, change_neg = get_pos_neg_changes(data_commit)
        stats['changes_sum'].append(file_change)
        stats['changes_pos'].append(change_pos)
        stats['changes_neg'].append(change_neg)
        stats['changes_big'].append(max(change_pos, change_neg))
        
        prev_ind = commit
        
    return {name: np.array(stats[name]).mean() for name in stats_names}, stats_names
#     return [np.array(stats[name]).mean() for name in stats_names], stats_names


def get_action_proportions(data, commits):
    """
    Returns a list with the proportion of actions in the commits
    update, call, remove, create"""
    # Initialize dict
    action_names = ['update_method', 'call_method',
                    'remove_method', 'create_method']
    actions = {name : 0 for name in action_names}

    # Count the actions
    for commit in commits:
        actions[data['action'].iloc[commit]] += 1
    
    # Return the list of action proportions
    return [actions[name]/len(commits) for name in action_names]
    
    
    

In [4]:
def get_statistics(data, f0):
    """
    Returns a dict with lists of statistical values per simulation
    
    """
    statistics = {
        'f0': f0,
        'fmax_means': [],
        'fmin_means': [],
        'f_commit_means': [],
        'f_commit_mins': [],
        'f_commit_maxs': [],
        'num_commits': [],
        
        'steps':[], 
        'lines_sum':[],
        'lines_pos':[],
        'lines_neg':[],
        'lines_big':[],
        'changes_sum':[],
        'changes_pos':[],
        'changes_neg':[],
        'changes_big':[],
        
        'action_props': []
    }
    
    for sim in data['sim'].unique():
        data_sim = data[data['sim']== sim]
        data_commits = data_sim[data_sim['fmin'] >= f0]
        commits = list(data_commits['step'])
        
        # mean fmin and fmax
        statistics['fmin_means'].append(data_sim['fmin'].mean())
        statistics['fmax_means'].append(data_sim['fmax'].mean())
        
        # mean, min, max f for a commit
        statistics['f_commit_means'].append(data_commits['fmean'].mean())
        statistics['f_commit_mins'].append(data_commits['fmin'].mean())
        statistics['f_commit_maxs'].append(data_commits['fmax'].mean())
        statistics['num_commits'].append(len(commits))               
        
        # Mean size of steps/lines/changes per commit
        changes_dict, changes_names = get_changes(data, commits)
        for name in changes_names:
            statistics[name].append(changes_dict[name])
        
        # Proportions last action
        statistics['action_props'].append(get_action_proportions(data_sim, commits))
        if sim %1==0:
            print('running sim {} of {}'.format(sim, len(data['sim'].unique())))
        
        
    return statistics


def dicts_per_exp(datas):
    dicts = []
    for data in datas:
        dicts.append(get_statistics(data, .5))
        print('a dict is done')
    return dicts

In [5]:
dicts = dicts_per_exp(datas)

NameError: name 'datas' is not defined

Fmin over all steps

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['fmin_means']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean minimal fitness per simulation')
plt.ylabel('Mean minimal fitness')
plt.savefig('fmin_means.png')

Max fmin > f0

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['f_commit_mins']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean minimal fitness of the commits per simulation')
plt.ylabel('Mean minimal fitness')
plt.savefig('f_commit_mins.png')

Mean size of steps/lines/changes per commit

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['lines_big']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean value of max(insertions, deletions)\nof the commits per simulation')
plt.ylabel('Mean lines changed')
plt.savefig('lines_big.png')

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['lines_sum']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean lines changes in the commits per simulation')
plt.ylabel('Mean lines changed')
plt.savefig('lines_sum.png')

Changes

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['changes_big']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean value of max(insertions, deletions) changes\nof the commits per simulation')
plt.ylabel('Mean changed')
plt.savefig('changes_big.png')

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['changes_sum']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Mean changes in the commits per simulation')
plt.ylabel('Mean changes')
plt.savefig('changes_sum.png')

Number of commits

In [None]:
# Get data
plot_data = []
for i in range(len(dicts)):
    plot_data.append(np.array(dicts[i]['num_commits']))
    
# Plot data
plt.boxplot(plot_data)
plt.xticks(range(1, len(dicts)+1),['Reproduction', 'No recursion', 'Delete statements'])
plt.title('Number of commits per simulation')
plt.ylabel('Number of commits')
plt.savefig('num_commits.png')

Proportions of last actions per commit

In [None]:
# Pplot the real probabilities
plt.scatter(1, .45, c=colors[0], label='update')
plt.scatter(1, .4, c=colors[1], label='call')
plt.scatter(1, .05, c=colors[2], label='remove')
plt.scatter(1, .1, c=colors[3], label='create')
    
colors = ['blue', 'orange', 'green', 'purple']
plot_data = []
for i in range(len(dicts)):
    update, call, remove, create = zip(*dicts[i]['action_props'])
    plt.scatter(i+2, np.array(update).mean(), c=colors[0])
    plt.scatter(i+2, np.array(call).mean(), c=colors[1])
    plt.scatter(i+2, np.array(remove).mean(), c=colors[2])
    plt.scatter(i+2, np.array(create).mean(), c=colors[3])

plt.xticks(range(1, len(dicts)+2),['Probabilities\naction picked', 'Reproduction', 'No recursion', 'Delete statements'])
plt.title('Proportion of the actions at commits per simulation')
plt.ylabel('Proportion of actions')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('action_props.png')

Correlations between fmin and ... action, code size, changes, step

In [204]:
#TODO