# Get before/after snapshots of commits

In [None]:
import json
import os
import shutil
from tempfile import TemporaryDirectory
from time import sleep

from git import Repo

## Load dataset

In [None]:
# Load the list of commit records produced by the coding step. Each record is
# expected to carry a 'url' key (read by the snapshot loop further down).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of falling back to the locale's default encoding.
with open('../1-coding/diffs.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

## Download before/after state

In [None]:
# Maps a commit URL to the message of the exception raised while snapshotting it.
errors = {}

In [None]:
def _snapshot_tree(repo, worktree, dest, ref=None):
    """Copy the checked-out files in *worktree* to *dest*, skipping ``.git``.

    If *dest* already exists nothing is done, so re-running the cell only
    fills in the snapshots that are still missing. When *ref* is given it is
    checked out in *repo* before the copy is made.

    The copy is staged in ``<dest>.partial`` and renamed into place only once
    it has finished, so a failed or interrupted copy is never mistaken for a
    complete snapshot on a later run.
    """
    if os.path.exists(dest):
        print(f'* {dest} exists')
        return

    if ref is not None:
        repo.git.checkout(ref)

    staging = f'{dest}.partial'
    # Drop leftovers from an earlier run that died in the middle of a copy.
    shutil.rmtree(staging, ignore_errors=True)
    try:
        shutil.copytree(worktree, staging, ignore=shutil.ignore_patterns('.git'))
        os.rename(staging, dest)
    except Exception:
        shutil.rmtree(staging, ignore_errors=True)
        raise

    print(f'* created {dest}')


# For every commit in the dataset, clone its repository into a temporary
# directory and store up to three kinds of snapshot under
# snapshots/<owner>-<name>-<sha>/:
#   latest/               state right after cloning
#   after/                state at the commit itself
#   before-<parent sha>/  state at each parent of the commit
# Any failure is recorded in `errors` (and mirrored to errors.json) and the
# loop moves on to the next commit.
for i, item in enumerate(dataset):
    url = item['url']

    try:
        # Expected URL shape: <scheme>://<host>/<owner>/<name>/commit/<sha>.
        # Parsing happens inside the try so that one malformed URL is logged
        # as an error instead of aborting the whole run.
        repo_url, sha = url.split('/commit/')
        owner, name = repo_url.split('/')[-2:]
        snapshot_path = f'snapshots/{owner}-{name}-{sha}'

        if os.path.exists(snapshot_path):
            print(f'{snapshot_path} exists')
        else:
            os.makedirs(snapshot_path)
            print(f'snapshotting {owner}/{name} {sha}')

        with TemporaryDirectory() as tmpdir:
            repo = Repo.clone_from(f'{repo_url}.git', tmpdir)
            try:
                # snapshot state after latest commit (fresh clone, nothing
                # checked out explicitly yet)
                _snapshot_tree(repo, tmpdir, f'{snapshot_path}/latest')

                # snapshot state after commit
                _snapshot_tree(repo, tmpdir, f'{snapshot_path}/after', ref=sha)

                # snapshot state before commit (one snapshot per parent commit)
                for parent in repo.commit(sha).parents:
                    _snapshot_tree(
                        repo,
                        tmpdir,
                        f'{snapshot_path}/before-{parent.hexsha}',
                        ref=parent.hexsha,
                    )
            finally:
                # Release the repository's resources before the temporary
                # directory is removed.
                repo.close()
    except Exception as e:
        print(f'error for {url}:', e)
        errors[url] = str(e)

        # Rewrite the error log after every failure so it survives a crash
        # or an interrupted run.
        with open('errors.json', 'w') as f:
            json.dump(errors, f)

    processed = i + 1
    if processed % 10 == 0:
        print(f'-----\n{processed}/{len(dataset)} commits processed\n------')

        # Pause between batches of ten (presumably to go easy on the remote
        # host - TODO confirm); pointless once the last commit is done.
        if processed < len(dataset):
            sleep(30)