# Get before/after snapshots of commits

In [4]:
import json
import os
import shutil
from tempfile import TemporaryDirectory
from time import sleep

from git import Repo

## Load dataset

In [5]:
# Read the list of commit records. Each record is expected to carry a 'url'
# key (that is the only field the cells below access).
# The encoding is pinned because JSON text is UTF-8 and the platform default
# encoding used by open() is not UTF-8 everywhere.
with open('../diffs.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

## Download before/after state

In [6]:
# Maps a commit URL to the message of the exception raised while snapshotting it.
errors = {}

In [7]:
def _snapshot_commit(commit_url, dest):
    """Copy the working tree of a commit and of each of its parents into `dest`.

    The repository is cloned into a temporary directory, then checked out at
    the commit itself (copied to `<dest>/after`) and at every parent commit
    (copied to `<dest>/before-<parent sha>`). Entries named `.git` are skipped.

    Args:
        commit_url: URL of the form `<repo url>/commit/<sha>`.
        dest: existing directory that receives the `after` / `before-*` trees.

    Raises:
        ValueError: if `commit_url` does not contain exactly one `/commit/`.
        Exception: whatever the clone, checkout or copy raises.
    """
    repo_url, sha = commit_url.split('/commit/')
    repo_url += '.git'
    skip_git = shutil.ignore_patterns('.git')

    with TemporaryDirectory() as tmpdir:
        repo = Repo.clone_from(repo_url, tmpdir)
        try:
            repo.git.checkout(sha)
            shutil.copytree(tmpdir, f'{dest}/after', ignore=skip_git)

            for parent in repo.commit(sha).parents:
                repo.git.checkout(parent.hexsha)
                shutil.copytree(tmpdir, f'{dest}/before-{parent.hexsha}', ignore=skip_git)
        finally:
            # Release whatever the Repo object holds on the clone before the
            # temporary directory is deleted.
            repo.close()


# True once the current batch of 10 has attempted at least one clone.
cloned_since_pause = False

for i, item in enumerate(dataset):
    _, _, _, owner, name, _, sha = item['url'].split('/')
    snapshot_path = f'snapshots/{owner}-{name}-{sha}'

    if os.path.exists(snapshot_path):
        print(f'{snapshot_path} exists')
    else:
        print(f'snapshotting {owner}/{name} {sha}')
        cloned_since_pause = True

        # Build the snapshot in a sibling staging directory and rename it into
        # place only when complete. An interrupted run therefore never leaves
        # a half-filled `snapshot_path` that a later run would skip as "exists".
        staging_path = f'{snapshot_path}.partial'
        shutil.rmtree(staging_path, ignore_errors=True)  # leftover from an aborted run
        os.makedirs(staging_path)

        try:
            _snapshot_commit(item['url'], staging_path)
            os.rename(staging_path, snapshot_path)
        except Exception as e:
            print(f'error for {item["url"]}:', e)
            errors[item['url']] = str(e)

            # Persist after every failure so the log survives a later crash.
            with open('errors.json', 'w') as f:
                json.dump(errors, f)
        finally:
            # No-op after a successful rename; otherwise drops the partial
            # data, including on KeyboardInterrupt.
            shutil.rmtree(staging_path, ignore_errors=True)

    if (i+1) % 10 == 0:
        print(f'{i+1}/{len(dataset)} commits processed')

        # Presumably the pause is there to go easy on the git host (TODO
        # confirm). It is skipped when the whole batch was already on disk,
        # so resuming a mostly finished run does not idle 30s per 10 items.
        if cloned_since_pause:
            sleep(30)
            cloned_since_pause = False

snapshots/tkhoa2711-terraform-digitalocean-a86d89369aaf5a20c1e4d8415a8a771aa7de7d10 exists
snapshots/stealthHat-k8s-terraform-681a3f8b4942be495b3f2528fb9ee40d7a4eb08a exists
snapshots/stealthHat-k8s-terraform-4193db798227c6538c61d55a906ed9ac997563f7 exists
snapshots/JamesWoolfenden-terraform-aws-codebuild-container-4a00ffcbf9576d7e5febdbdf94a31d4735fc8035 exists
snapshots/dannysievers-gcp-project-88303c62ab59b1e7a538289112cf19354a8ed05f exists
snapshots/thomastodon-jabujabu-02210a3d3ba4a770c29623825b7f54f3ff33f3c7 exists
snapshots/tooxie-terraform-workshop-002bcce28e46728714fa1e0d20bec6f2559caba2 exists
snapshots/deptno-terraform-aws-modules-49f447bdbb3cf23499e8194e78f852ea1e256d3a exists
snapshots/beaulabs-terraform_aws_ec2_instance-d6df68da5ae58fb5c650c6be15d9d8e676a129db exists
snapshots/aws-observability-aws-otel-test-framework-c928fe0a05d1e9b8f1ecb1a7dacffecc2800e038 exists
10/499 commits processed
snapshots/ken-matsui-poac-infrastructure-02c710b8259f493c475021fc9eac23b871305ae6 e