# Themes and Patterns

In [None]:
import json
import operator
import polars as pl
from functools import reduce
from collections import defaultdict

# Load the coded diffs from the sibling `1-coding` directory. JSON text is
# UTF-8 by specification, so the encoding is pinned here instead of relying on
# the platform's locale default.
with open('../1-coding/diffs.json', 'r', encoding='utf-8') as f:
    diffs = json.load(f)

# Drop the fields the pattern cells do not use; the cells below unpack every
# remaining row as (url, codes).
diffs_df = pl.DataFrame(diffs).drop('files', 'existing_codes', 'notes')

# Accumulates one {'pattern', 'url', 'codes'} record per matching diff. Each
# pattern cell appends to it and the export cell turns it into a DataFrame.
pattern_occurrences = []

## Budget

In [None]:
# Codes grouped under the 'Budget' pattern.
budget_codes = [
    'add_billing_alarm',
    'add_budget',
    'replace_billing_alarm_budget',
]

# A diff matches when its `codes` list contains at least one budget code:
# one `list.contains` expression per code, OR-ed together.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, budget_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
budget_diffs = diffs_df.filter(expr)
for url, codes in budget_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append(
        {'pattern': 'Budget', 'url': url, 'codes': [code for code in codes if code in budget_codes]}
    )


## Spot instances

In [None]:
# Codes grouped under the 'Spot instances' pattern.
spot_codes = [
    'use_spot_instance',
    'add_preemptible',
    'use_spot_fleet',
    'use_fargate_spot_capacity_provider',
]

# A diff matches when its `codes` list contains at least one spot code.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, spot_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
spot_diffs = diffs_df.filter(expr)
for url, codes in spot_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append(
        {'pattern': 'Spot instances', 'url': url, 'codes': [code for code in codes if code in spot_codes]}
    )

## Lifecycle rules

In [None]:
# Every diff coded `add_lifecycle_rule` is one occurrence of this pattern; the
# recorded code list is always just that single code.
lifecycle_diffs = diffs_df.filter(pl.col('codes').list.contains('add_lifecycle_rule'))

for url, codes in lifecycle_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append(
        {'pattern': 'Object storage lifecycle rules', 'url': url, 'codes': ['add_lifecycle_rule']}
    )

## Expensive instance

In [None]:
# Match diffs coded `cheaper_instance` that are NOT also coded `new_gen`;
# `new_gen` diffs are recorded by the 'Old generation' cell instead.
has_code = pl.col('codes').list.contains
expr = has_code('cheaper_instance') & ~has_code('new_gen')

expensive_instance_diffs = diffs_df.filter(expr)
for url, codes in expensive_instance_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append(
        {'pattern': 'Expensive instance', 'url': url, 'codes': ['cheaper_instance']}
    )

## Old generation

In [None]:
# Every diff coded `new_gen` is one 'Old generation' occurrence.
expr = pl.col('codes').list.contains('new_gen')

new_gen_diffs = diffs_df.filter(expr)
for url, codes in new_gen_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append(
        {'pattern': 'Old generation', 'url': url, 'codes': ['new_gen']}
    )

## Expensive storage type

In [None]:
# Match diffs coded with either storage code, excluding any diff that is also
# coded `new_gen`.
has_code = pl.col('codes').list.contains
expr = (has_code('cheaper_volume') | has_code('cheaper_disk_type')) & ~has_code('new_gen')

# Record each matching diff, keeping only the storage codes it actually carries.
expensive_storage_diffs = diffs_df.filter(expr)
for url, codes in expensive_storage_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append({
        'pattern': 'Expensive storage type',
        'url': url,
        'codes': [code for code in codes if code in ('cheaper_volume', 'cheaper_disk_type')],
    })

## Expensive network resource

In [None]:
# Codes grouped under the 'Expensive network resource' pattern.
network_codes = [
    'remove_nat_gateway',
    'reduce_nat_gateways',
    'use_ec2_as_nat_gateway',
    'remove_nat',
    'remove_eip',
    'reduce_eip',
    'remove_private_subnet',
    'reduce_subnets',
    'reduce_vpc_endpoint_subnets',
    'remove_loadbalancer',
    'remove_alb',
    'use_alb',
    'reduce_nlb',
    'remove_route_table',
    'remove_route',
    'remove_vpn',
    'remove_firewall',
    'remove_transit_gateway',
    'associate_public_ip',
    'remove_waf',
    'add_vpc',
    'add_nat_gateway',
    'add_eip',
    'add_route',
]

# A diff matches when its `codes` list contains at least one network code.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, network_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
network_diffs = diffs_df.filter(expr)
for url, codes in network_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append({
        'pattern': 'Expensive network resource',
        'url': url,
        'codes': [code for code in codes if code in network_codes],
    })

## Overprovisioned resources

In [None]:
# Codes grouped under the 'Overprovisioned resources' pattern.
# NOTE(review): 'increase_volume_size' sits next to size-reduction codes here;
# confirm it is meant to count towards this pattern.
overprovision_codes = [
    'smaller_disk',
    'less_memory',
    'less_cpu_cores',
    'less_cpu',
    'lambda_less_memory',
    'increase_volume_size',
]

# A diff matches when its `codes` list contains at least one of these codes.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, overprovision_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
overprovisioned_diffs = diffs_df.filter(expr)
for url, codes in overprovisioned_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append({
        'pattern': 'Overprovisioned resources',
        'url': url,
        'codes': [code for code in codes if code in overprovision_codes],
    })

## AWS - Expensive DynamoDB

In [None]:
# Codes grouped under the 'AWS - Expensive DynamoDB' pattern.
dynamo_codes = [
    'dynamo_on_demand',
    'dynamo_reduce_rw_capacity',
    'remove_dynamo_global_secondary_indices',
]

# A diff matches when its `codes` list contains at least one DynamoDB code.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, dynamo_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
dynamo_diffs = diffs_df.filter(expr)
for url, codes in dynamo_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append({
        'pattern': 'AWS - Expensive DynamoDB',
        'url': url,
        'codes': [code for code in codes if code in dynamo_codes],
    })

## Expensive monitoring

In [None]:
# Codes grouped under the 'Expensive monitoring' pattern.
monitoring_codes = [
    'remove_status_check',
    'remove_cloudwatch_metrics',
    'remove_cloudwatch_logs',
    'remove_log_analytics',
    'remove_container_insights',
    'reduce_cloudwatch_retention',
    'remove_audit_logs',
    'remove_cloudfront_logs',
    'remove_fluentbit_annotations',
    'remove_fluentbit_labels',
    'reduce_argo_log_level',
    'remove_cloudwatch_dashboard',
    'fastly_compute_compress_s3_logs',
    'remove_prometheus_kube_api_scraping',
    'remove_eks_cloudwatch_agent',
]

# A diff matches when its `codes` list contains at least one monitoring code.
expr = reduce(operator.or_, map(pl.col('codes').list.contains, monitoring_codes))

# Record each matching diff, keeping only the codes that belong to this pattern.
monitoring_diffs = diffs_df.filter(expr)
for url, codes in monitoring_diffs.iter_rows():
    print(url, codes)
    pattern_occurrences.append({
        'pattern': 'Expensive monitoring',
        'url': url,
        'codes': [code for code in codes if code in monitoring_codes],
    })

# Export

In [None]:
# Build the occurrence table and add a constant `technology` column holding
# the list ['terraform'] on every row.
df = pl.DataFrame(pattern_occurrences).with_columns(technology=pl.lit(['terraform']))

# Flat (pattern, url) table as CSV.
df.select('pattern', 'url').write_csv('pattern_occurrences.csv', include_header=True)

# Full occurrence records, one JSON object per row.
df.write_json('theme_occurrences.json', row_oriented=True)

# One record per pattern with its distinct URLs (nested under `terraform`) and
# its distinct codes.
#
# `maintain_order=True` is passed to both `group_by` and `list.unique` so that
# patterns, URLs and codes are written in first-seen order. Without it Polars
# gives no ordering guarantee, and themes.json could differ between runs on
# identical input.
(
    df.explode('codes')
    .group_by('pattern', maintain_order=True)
    .agg('url', 'codes')
    .with_columns(
        url=pl.col('url').list.unique(maintain_order=True),
        codes=pl.col('codes').list.unique(maintain_order=True),
    )
    .select(
        pattern='pattern',
        occurrences=pl.struct(terraform='url'),
        codes='codes',
    )
    .write_json('themes.json', row_oriented=True)
)