# Analysis

In [56]:
import json

# Load the two inputs of the analysis.
#   diffs -- iterated as one record per commit by the cells below, which read
#            its 'existing_codes', 'codes' and 'files' keys.
#   codes -- the code definitions; each entry has at least a 'name'.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() without
# `encoding` decodes with the platform's locale encoding and can fail or
# garble non-ASCII text on systems where that is not UTF-8.
with open('diffs.json', 'r', encoding='utf-8') as f:
    diffs = json.load(f)

with open('codes.json', 'r', encoding='utf-8') as cf:
    codes = json.load(cf)

## Dataset

In [57]:
def _ratio(part, whole):
    """Format ``part`` out of ``whole`` as ``'part/whole (xx.xx%)'``.

    When ``whole`` is zero the percentage is undefined, so ``'part/whole (n/a)'``
    is returned instead of raising ZeroDivisionError.
    """
    if not whole:
        return f'{part}/{whole} (n/a)'
    return f'{part}/{whole} ({part / whole * 100:.2f}%)'


# Denominator of the retrieval rate -- presumably the number of commits that
# were requested (TODO confirm where 538 comes from).
TOTAL = 538
retrieved = len(diffs)
with_saving = sum('saving' in d['existing_codes'] for d in diffs)
print('retrieved commits:', _ratio(retrieved, TOTAL))
print('originally coded with `saving`:', _ratio(with_saving, retrieved))

# A commit is counted as processed once it has at least one code attached.
n_processed = sum(len(d['codes']) != 0 for d in diffs)
print('processed commits:', _ratio(n_processed, retrieved))
n_processed_with_saving = sum(len(d['codes']) != 0 and 'saving' in d['existing_codes'] for d in diffs)
print('processed commits coded with `saving`:', _ratio(n_processed_with_saving, with_saving))

# Mean number of files per commit; NaN (not a crash) when there are no commits.
avg_files = sum(len(d['files']) for d in diffs) / retrieved if retrieved else float('nan')
print('files affected (avg):', f'{avg_files:.2f}')

retrieved commits: 499/538 (92.75%)
originally coded with `saving`: 342/499 (68.54%)
processed commits: 467/499 (93.59%)
processed commits coded with `saving`: 323/342 (94.44%)
files affected (avg): 4.42


## Codes

In [58]:
import polars as pl

# For every code name, add up one boolean membership test per commit: the
# result is the number of commits whose 'codes' list contains that name.
commit_hits = sum(pl.col('name').is_in(commit['codes']) for commit in diffs)

# One row per defined code, most frequently used first. The percentage is
# relative to the processed commits (those with at least one code attached).
df = (
    pl.DataFrame(codes)
    .with_columns(count=commit_hits)
    .with_columns(percentage=pl.col('count') / n_processed * 100)
    .sort('count', descending=True)
)

print(df)

shape: (114, 4)
┌───────────────────────────────────┬───────────────────────────────────┬───────┬────────────┐
│ name                              ┆ description                       ┆ count ┆ percentage │
│ ---                               ┆ ---                               ┆ ---   ┆ ---        │
│ str                               ┆ str                               ┆ i32   ┆ f64        │
╞═══════════════════════════════════╪═══════════════════════════════════╪═══════╪════════════╡
│ aws                               ┆ Specific to Amazon Web Services   ┆ 341   ┆ 73.019272  │
│ not_relevant                      ┆ The commit contents do not refle… ┆ 165   ┆ 35.331906  │
│ cheaper_instance                  ┆ Use a cheaper compute instance    ┆ 71    ┆ 15.203426  │
│ gcp                               ┆ Specific to Google Cloud Platfor… ┆ 58    ┆ 12.4197    │
│ azure                             ┆ Specific to Azure                 ┆ 46    ┆ 9.850107   │
│ …                               

### Cloud-specific Codes

In [59]:
# Names of the provider-specific codes to pull out of the frequency table.
clouds = {
    'alicloud',
    'aws',
    'azure',
    'digitalocean',
    'gcp',
    'hcloud',
    'ibm',
    'oracle',
    'ovh',
    'scaleway',
}
is_cloud_code = pl.col('name').is_in(clouds)
# The description column is dropped so the table shows only name, count and percentage.
print(df.filter(is_cloud_code).drop('description'))

shape: (10, 3)
┌──────────────┬───────┬────────────┐
│ name         ┆ count ┆ percentage │
│ ---          ┆ ---   ┆ ---        │
│ str          ┆ i32   ┆ f64        │
╞══════════════╪═══════╪════════════╡
│ aws          ┆ 341   ┆ 73.019272  │
│ gcp          ┆ 58    ┆ 12.4197    │
│ azure        ┆ 46    ┆ 9.850107   │
│ digitalocean ┆ 12    ┆ 2.569593   │
│ scaleway     ┆ 2     ┆ 0.428266   │
│ hcloud       ┆ 2     ┆ 0.428266   │
│ ibm          ┆ 2     ┆ 0.428266   │
│ oracle       ┆ 2     ┆ 0.428266   │
│ alicloud     ┆ 1     ┆ 0.214133   │
│ ovh          ┆ 1     ┆ 0.214133   │
└──────────────┴───────┴────────────┘


### Undefined Codes

List codes which are attached to commits but not defined in `codes.json`.

In [60]:
# Every name that has a definition in the loaded code list.
listed_codes = {definition['name'] for definition in codes}

# Walk the commits in order and report each attached code that has no
# definition; a code is reported once per occurrence.
unlisted = (
    code
    for commit in diffs
    for code in commit['codes']
    if code not in listed_codes
)
for code in unlisted:
    print(f'{code} is not listed')

cheaper_codebuild_compute is not listed
