# Benchmark

In [None]:
import csv
import json
import subprocess
import os
import timeit
import time
import polars as pl

In [None]:
# Unix time in whole seconds, taken once when this cell runs; it is embedded in
# the names of the result files written further down so runs do not overwrite
# each other.
TIMESTAMP = int(time.time())

In [None]:
# Group the URLs recorded for each pattern: {pattern: [url, ...]}, keeping the
# row order of the CSV.
# newline='' is what the csv module expects of its input file, and the explicit
# encoding keeps the read independent of the platform's default encoding.
with open('../2-pattern-extraction/pattern_occurrences.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    occurrences = {}
    for row in reader:
        occurrences.setdefault(row['pattern'], []).append(row['url'])

# Map every diff URL to the filenames listed for it: {url: [filename, ...]}.
with open('../1-coding/diffs.json', 'r', encoding='utf-8') as f:
    diffs = json.load(f)
    filenames = {
        diff['url']: [changed['filename'] for changed in diff['files']]
        for diff in diffs
    }

## Checkov

In [None]:
# Patterns mapped to the Checkov check IDs that are run for them.
CHECKOV_CHECKS = {
    'Object storage lifecycle rules': ['CKV2_AWS_61'],
    'AWS - Expensive DynamoDB': ['CKV_AWS_801', 'CKV_AWS_802', 'CKV_AWS_803'],
    'Old generation': ['CKV_AWS_804'],
}

# All check IDs flattened into one comma-separated string (the value handed to
# checkov's `--check` option).
CHECKOV_CHECKS_PARAM = ','.join(
    check_id
    for check_ids in CHECKOV_CHECKS.values()
    for check_id in check_ids
)

# Reverse lookup: check ID -> the pattern it belongs to.
CHECKOV_PATTERNS = {}
for pattern_name, check_ids in CHECKOV_CHECKS.items():
    for check_id in check_ids:
        CHECKOV_PATTERNS[check_id] = pattern_name

# Benchmark Checkov: for every occurrence of a pattern that has Checkov checks,
# scan each snapshot directory of that occurrence and record which patterns
# were reported, in which files, whether errors occurred, and how long the
# scan took. One summary dict per occurrence is appended to `checkov_stats`.
checkov_stats = []

N = sum(len(o) for p, o in occurrences.items() if p in CHECKOV_CHECKS)
i = 0

for pattern, occs in occurrences.items():
    if pattern not in CHECKOV_CHECKS:
        continue

    for url in occs:
        print(f'- [{i+1}/{N}] {pattern}: {url}')

        # The URL must have exactly seven '/'-separated parts; the 4th, 5th
        # and 7th together name the snapshot directory.
        _, _, _, owner, name, _, sha = url.split('/')

        snapshot_path = f'snapshots/{owner}-{name}-{sha}'

        summary = {
            'pattern': pattern,
            'url': url,
            'before': []
        }

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # log and the order of the `before` entries reproducible across runs.
        for version in sorted(os.listdir(snapshot_path)):
            if version == 'latest' and url.startswith('https://github.com/ministryofjustice/cloud-platform-environments'):
                print('  * Ignoring `latest`')
                summary['latest'] = None
                continue

            is_before = version.startswith('before')
            if not is_before and version not in ('after', 'latest'):
                # Nothing is recorded for such an entry, so do not spend a
                # scan on it.
                print(f'  * Skipping unrecognised entry `{version}`')
                continue

            version_path = f'{snapshot_path}/{version}/'

            print(f'  * Running checkov against `{version}`')

            # Only the checkov invocation itself is timed.
            start = timeit.default_timer()

            result = subprocess.run([
                'checkov',
                '--evaluate-variables', 'true',
                '--download-external-modules', 'true',
                '--external-modules-download-path', '/tmp/',
                '--directory', version_path,
                '--check', CHECKOV_CHECKS_PARAM,
                '-o', 'json',
                '--framework', 'terraform',
            ], capture_output=True, encoding='utf-8')

            end = timeit.default_timer()

            # A failed scan may leave stdout empty or non-JSON. Record that as
            # an error for this snapshot instead of aborting the whole
            # benchmark and losing the results gathered so far.
            try:
                stdout = json.loads(result.stdout)
            except json.JSONDecodeError:
                stdout = None

            if stdout is None:
                print(f'    INVALID OUTPUT (exit code {result.returncode}):', result.stderr.strip())
                matched_patterns = []
                files = []
                has_errors = True
            elif 'results' not in stdout or 'summary' not in stdout:
                print('    NO RESULTS:', stdout)
                matched_patterns = []
                files = []
                has_errors = False
            else:
                failed_checks = stdout['results']['failed_checks']
                # De-duplicated and sorted so the stored list does not depend
                # on set iteration order.
                matched_patterns = sorted({CHECKOV_PATTERNS[check['check_id']] for check in failed_checks})
                files = [
                    {
                        'pattern': CHECKOV_PATTERNS[check['check_id']],
                        'path': check['file_abs_path'],
                        'lines': check['file_line_range'],
                    }
                    for check in failed_checks
                ]
                has_errors = stdout['summary']['parsing_errors'] != 0

            # `after` is identified by the sha from the URL, a `before-<id>`
            # directory by the part after its first dash, `latest` by its name.
            if version == 'after':
                snapshot_id = sha
            elif is_before:
                snapshot_id = version.split('-')[1]
            else:
                snapshot_id = 'latest'

            entry = {
                'matched': matched_patterns,
                'errors': has_errors,
                'duration': end - start,
                'id': snapshot_id,
                'files': files,
            }

            if is_before:
                summary['before'].append(entry)
            else:
                # `version` is 'after' or 'latest' here.
                summary[version] = entry

            print(f'    Done after {end - start:.2f}s')

        checkov_stats.append(summary)

        i += 1


In [None]:
# Tabulate the collected Checkov summaries, print the table, and save it as a
# timestamped JSON file.
checkov_df = pl.DataFrame(checkov_stats)

with pl.Config(tbl_rows=150, tbl_width_chars=500):
    print(checkov_df)

# Create the output directory if needed, so a finished benchmark run is not
# lost to a missing folder at write time.
os.makedirs('results', exist_ok=True)

# NOTE(review): newer polars releases may no longer accept `row_oriented` —
# confirm against the installed version.
checkov_df.write_json(f'results/checkov_{TIMESTAMP}.json', row_oriented=True)

## TFLint

In [None]:
# Patterns mapped to the TFLint rule names that are run for them.
TFLINT_RULES = {
    'Budget': ['cost_aws_budget', 'cost_google_budget'],
    'Object storage lifecycle rules': ['cost_aws_object_storage_lifecycle_rule'],
    'Old generation': ['cost_aws_old_generation'],
    'AWS - Expensive DynamoDB': ['cost_aws_expensive_dynamodb'],
}

# Derived in one pass over the table above:
#   TFLINT_PATTERNS      rule name -> the pattern it belongs to
#   TFLINT_RULES_PARAMS  one `--only=<rule>` command-line argument per rule
TFLINT_PATTERNS = {}
TFLINT_RULES_PARAMS = []
for pattern_name, rule_names in TFLINT_RULES.items():
    for rule_name in rule_names:
        TFLINT_PATTERNS[rule_name] = pattern_name
        TFLINT_RULES_PARAMS.append(f'--only={rule_name}')

# Benchmark TFLint: for every occurrence of a pattern that has TFLint rules,
# lint each snapshot directory of that occurrence and record which patterns
# were reported, where, whether errors occurred, and how long the run took.
# One summary dict per occurrence is appended to `tflint_stats`.
tflint_stats = []

N = sum(len(o) for p, o in occurrences.items() if p in TFLINT_RULES)
i = 0

for pattern, occs in occurrences.items():
    if pattern not in TFLINT_RULES:
        continue

    for url in occs:
        print(f'- [{i+1}/{N}] {pattern}: {url}')

        # The URL must have exactly seven '/'-separated parts; the 4th, 5th
        # and 7th together name the snapshot directory.
        _, _, _, owner, name, _, sha = url.split('/')

        snapshot_path = f'snapshots/{owner}-{name}-{sha}'

        summary = {
            'pattern': pattern,
            'url': url,
            'before': []
        }

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # log and the order of the `before` entries reproducible across runs.
        for version in sorted(os.listdir(snapshot_path)):
            # Uncomment to leave the `latest` snapshot out of the run.
            # if version == 'latest':
            #     continue

            is_before = version.startswith('before')
            if not is_before and version not in ('after', 'latest'):
                # Nothing is recorded for such an entry, so do not spend a
                # run on it.
                print(f'  * Skipping unrecognised entry `{version}`')
                continue

            version_path = f'{snapshot_path}/{version}/'

            print(f'  * Running tflint against `{version}`')

            print('    terraform get')

            # Runs before the timer starts, so it is not part of the measured
            # duration. A failure here is only reported, not acted upon.
            terraform_result = subprocess.run(['terraform', f'-chdir={version_path}', 'get'], capture_output=True, encoding='utf-8')

            print(f'    terraform get returned with status {terraform_result.returncode}')

            print('    Starting tflint')

            # Only the tflint invocation itself is timed.
            start = timeit.default_timer()

            result = subprocess.run([
                'tflint',
                '--format=json',
                '--call-module-type=all',
                '--enable-plugin=cost',
                *TFLINT_RULES_PARAMS,
                '--recursive'
            ], capture_output=True, encoding='utf-8', cwd=version_path)

            end = timeit.default_timer()

            # A failed run may leave stdout empty or non-JSON. Record that as
            # an error for this snapshot instead of aborting the whole
            # benchmark and losing the results gathered so far.
            try:
                stdout = json.loads(result.stdout)
            except json.JSONDecodeError:
                stdout = None

            if stdout is None:
                print(f'    INVALID OUTPUT (exit code {result.returncode}):', result.stderr.strip())
                issues = []
                has_errors = True
            else:
                issues = stdout['issues']
                has_errors = len(stdout['errors']) != 0

            # NOTE(review): one entry per issue, so a pattern can appear
            # several times here, whereas the Checkov cell de-duplicates its
            # `matched` list — confirm which form the analysis expects.
            matched_patterns = [TFLINT_PATTERNS[issue['rule']['name']] for issue in issues]
            files = [
                {
                    'pattern': TFLINT_PATTERNS[issue['rule']['name']],
                    'path': version_path + issue['range']['filename'],
                    'start': issue['range']['start'],
                    'end': issue['range']['end'],
                }
                for issue in issues
            ]

            # `after` is identified by the sha from the URL, a `before-<id>`
            # directory by the part after its first dash, `latest` by its name.
            if version == 'after':
                snapshot_id = sha
            elif is_before:
                snapshot_id = version.split('-')[1]
            else:
                snapshot_id = 'latest'

            entry = {
                'matched': matched_patterns,
                'errors': has_errors,
                'duration': end - start,
                'id': snapshot_id,
                'files': files,
            }

            if is_before:
                summary['before'].append(entry)
            else:
                # `version` is 'after' or 'latest' here.
                summary[version] = entry

            print(f'    Done after {end - start:.2f}s')

        tflint_stats.append(summary)

        i += 1

In [None]:
# Tabulate the collected TFLint summaries, print the table, and save it as a
# timestamped JSON file.
tflint_df = pl.DataFrame(tflint_stats)

with pl.Config(tbl_rows=150, tbl_width_chars=500):
    print(tflint_df)

# Create the output directory if needed, so a finished benchmark run is not
# lost to a missing folder at write time.
os.makedirs('results', exist_ok=True)

# NOTE(review): newer polars releases may no longer accept `row_oriented` —
# confirm against the installed version.
tflint_df.write_json(f'results/tflint_{TIMESTAMP}.json', row_oriented=True)