# Analysis

In [120]:
import json

# Load the two JSON inputs used by every cell below.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale encoding (the default for open() in text mode).

# `diffs`: iterated below as a sequence of per-commit records that are indexed
# with the keys 'existing_codes', 'codes' and 'files'.
with open('diffs.json', 'r', encoding='utf-8') as f:
    diffs = json.load(f)

# `codes`: iterated below as a sequence of code definitions indexed with 'name'.
with open('codes.json', 'r', encoding='utf-8') as cf:
    codes = json.load(cf)

## Dataset

In [121]:
# Denominator for the "retrieved commits" ratio.
# NOTE(review): presumably the number of commits in the original dataset — confirm.
TOTAL = 538


def format_share(part, whole):
    """Format `part` out of `whole` as 'part/whole (xx.xx%)'.

    When `whole` is 0 the percentage is reported as 'n/a' instead of raising
    ZeroDivisionError, so the summary still prints for an empty dataset.
    """
    if whole == 0:
        return f'{part}/{whole} (n/a)'
    return f'{part}/{whole} ({part / whole * 100:.2f}%)'


# Commits present in diffs.json, and how many of them list 'saving'
# among their 'existing_codes'.
retrieved = len(diffs)
with_saving = sum('saving' in d['existing_codes'] for d in diffs)
print('retrieved commits:', format_share(retrieved, TOTAL))
print('originally coded with `saving`:', format_share(with_saving, retrieved))

# A commit counts as processed once it has at least one entry in 'codes'.
n_processed = sum(len(d['codes']) != 0 for d in diffs)
print('processed commits:', format_share(n_processed, retrieved))

# Mean number of entries in 'files' per retrieved commit (0.0 when there are none).
avg_files = sum(len(d['files']) for d in diffs) / retrieved if retrieved else 0.0
print('files affected (avg):', f'{avg_files:.2f}')

retrieved commits: 499/538 (92.75%)
originally coded with `saving`: 342/499 (68.54%)
processed commits: 260/499 (52.10%)
files affected (avg): 4.42


## Codes

In [122]:
print('codes:', len(codes))

# Number of commits each defined code is attached to, keyed by code name.
counts = {
    c['name']: sum(c['name'] in d['codes'] for d in diffs)
    for c in codes
}

# Column widths for the table: longest code name and widest count.
# `default=0` keeps the cell runnable when `codes` is empty.
L = max((len(c['name']) for c in codes), default=0)
NL = max((len(str(n)) for n in counts.values()), default=0)

for c in codes:
    name = c['name']
    # Reuse the tally computed above instead of rescanning every diff.
    n = counts[name]
    # Share of processed commits carrying this code; 0.0 if nothing is processed yet.
    p = n / n_processed * 100 if n_processed else 0.0
    # Left-align the name and right-align the count using format specs.
    # This avoids nesting quotes inside the f-string, which is only valid
    # syntax on Python 3.12+.
    print(f'{name:<{L + 1}}{n:>{NL + 1}} ({p:5.2f}%)')

codes: 84
not_relevant                              67 (25.77%)
digitalocean                               6 ( 2.31%)
aws                                      187 (71.92%)
gcp                                       38 (14.62%)
azure                                     25 ( 9.62%)
alicloud                                   1 ( 0.38%)
scaleway                                   1 ( 0.38%)
hcloud                                     1 ( 0.38%)
ibm                                        1 ( 0.38%)
smaller_disk                               8 ( 3.08%)
cheaper_disk_type                          6 ( 2.31%)
remove_loadbalancer                        3 ( 1.15%)
cheaper_instance                          57 (21.92%)
new_gen                                   29 (11.15%)
remove_dataflow                            1 ( 0.38%)
reduce_subnets                             1 ( 0.38%)
remove_nat                                 4 ( 1.54%)
add_lifecycle_rule                         6 ( 2.31%)
cheaper_volume    

## Undefined codes

List codes which are attached to commits but not defined in `codes.json`.

In [123]:
# Names of every code defined in codes.json.
listed_codes = {definition['name'] for definition in codes}

# Walk the commits in order and lazily yield each attached code that has no
# definition; a code attached to several commits is reported once per occurrence.
unlisted = (
    attached
    for commit in diffs
    for attached in commit['codes']
    if attached not in listed_codes
)

for attached in unlisted:
    print(f'{attached} is not listed')

remove_external_vpc_endpoint is not listed
reduce_eip is not listed
