# Analysis

In [12]:
import json

# Load the two JSON inputs used by every later cell.
# The encoding is given explicitly: JSON files are UTF-8, while open()'s default
# depends on the platform locale and can mis-decode non-ASCII text.
with open('diffs.json', 'r', encoding='utf-8') as f:
    diffs = json.load(f)

with open('codes.json', 'r', encoding='utf-8') as cf:
    codes = json.load(cf)

## Dataset

In [13]:
# Denominator for the retrieval rate; presumably the number of commits in the
# original dataset -- TODO confirm.
TOTAL = 538


def _ratio(part, whole):
    """Format ``part`` out of ``whole`` as ``'part/whole (xx.xx%)'``.

    A zero ``whole`` is reported as 0.00% instead of raising ZeroDivisionError,
    so the cell still runs when no commits were loaded.
    """
    pct = part / whole * 100 if whole else 0.0
    return f'{part}/{whole} ({pct:.2f}%)'


retrieved = len(diffs)
# Commits whose `existing_codes` entry contains 'saving'.
with_saving = sum('saving' in d['existing_codes'] for d in diffs)
print('retrieved commits:', _ratio(retrieved, TOTAL))
print('originally coded with `saving`:', _ratio(with_saving, retrieved))

# A commit counts as processed once it has at least one entry in `codes`.
n_processed = sum(len(d["codes"]) != 0 for d in diffs)
print('processed commits:', _ratio(n_processed, retrieved))

# Mean number of entries in `files` per commit (0.0 when nothing was loaded).
avg_files = sum(len(d['files']) for d in diffs) / retrieved if retrieved else 0.0
print('files affected (avg):', f'{avg_files:.2f}')

retrieved commits: 499/538 (92.75%)
originally coded with `saving`: 342/499 (68.54%)
processed commits: 308/499 (61.72%)
files affected (avg): 4.42


## Codes

In [14]:
import polars as pl

# One row per defined code, with:
#   count      - number of commits whose `codes` list contains the code's name
#   percentage - count relative to the number of processed commits
# Rows are ordered from the most to the least frequently used code.
df = (
    pl.DataFrame(codes)
    .with_columns(
        count=sum(pl.col('name').is_in(commit['codes']) for commit in diffs)
    )
    .with_columns(percentage=pl.col('count') / n_processed * 100)
    .sort('count', descending=True)
)

print(df)

# counts = {
#     c['name']: sum(c['name'] in d['codes'] for d in diffs)
#     for c in codes
# }

# L = max(len(c['name']) for c in codes)
# NL = max(len(str(n)) for n in counts.values())

# for c in codes:
#     name = c['name']
#     n = sum(name in d['codes'] for d in diffs)
#     p = (n / n_processed * 100)
#     print(f'{name}{' ' * (L+1-len(name))}{' ' * (NL+1-len(str(n)))}{n} ({p:5.2f}%)')

shape: (91, 4)
┌──────────────────────────────┬───────────────────────────────────┬───────┬────────────┐
│ name                         ┆ description                       ┆ count ┆ percentage │
│ ---                          ┆ ---                               ┆ ---   ┆ ---        │
│ str                          ┆ str                               ┆ i32   ┆ f64        │
╞══════════════════════════════╪═══════════════════════════════════╪═══════╪════════════╡
│ aws                          ┆ Specific to Amazon Web Services   ┆ 226   ┆ 73.376623  │
│ not_relevant                 ┆ The commit contents do not refle… ┆ 84    ┆ 27.272727  │
│ cheaper_instance             ┆ Use a cheaper compute instance    ┆ 64    ┆ 20.779221  │
│ gcp                          ┆ Specific to Google Cloud Platfor… ┆ 40    ┆ 12.987013  │
│ new_gen                      ┆ Newer resource generations tend … ┆ 31    ┆ 10.064935  │
│ …                            ┆ …                                 ┆ …     ┆ …       

## Undefined codes

List the codes that are attached to commits but are not defined in `codes.json`. Every occurrence is reported, so a code used by several commits is printed once per commit.

In [15]:
# Names of all codes defined in codes.json.
listed_codes = {entry['name'] for entry in codes}

# Walk the commits in order and yield every attached code that has no definition;
# repeats are kept, so each offending occurrence gets its own line.
unlisted = (
    name
    for commit in diffs
    for name in commit['codes']
    if name not in listed_codes
)
for name in unlisted:
    print(f'{name} is not listed')