# Investigating commits

## Setup

In [75]:
import pytz
from datetime import datetime
import re

In [76]:
# Cutoff instant as a timezone-aware datetime: 30 Nov 2022, 00:00 UTC
cutoff_date = datetime(year=2022, month=11, day=30, tzinfo=pytz.utc)

In [77]:
# Function to filter commits by date and by whether the commit message mentions a .md file
def filter_commits(data, cutoff_date=None):
    """Return the commits made after a cutoff whose message mentions a .md file.

    Args:
        data: Iterable of commit dicts. 'CommitAt' (timestamp string) and
            'Message' are read from each dict.
        cutoff_date: Timezone-aware datetime; only commits strictly later
            than it are kept. Defaults to 2022-11-30 00:00 UTC, the value
            this function previously hard-coded.

    Returns:
        A list of the matching commit dicts, in their original order.
    """
    if cutoff_date is None:
        cutoff_date = datetime(2022, 11, 30, tzinfo=pytz.utc)

    return [
        commit for commit in data
        if is_commit_after_cutoff_date(commit['CommitAt'], cutoff_date)
        and is_commit_modifying_md_files(commit['Message'])
    ]

# Function to check if a commit is after the cutoff date
def is_commit_after_cutoff_date(commit_datetime_str, cutoff_date):
    """Tell whether a commit timestamp falls strictly after ``cutoff_date``.

    The timestamp string is handed to ``parse_commit_datetime``. When that
    yields a falsy result (it returns None for strings it cannot parse),
    that falsy result is returned as-is instead of a boolean.
    """
    parsed = parse_commit_datetime(commit_datetime_str)
    if not parsed:
        return parsed
    return parsed > cutoff_date

# Function to check if a commit message indicates modification of .md files
def is_commit_modifying_md_files(commit_message):
    """Search a commit message for a token that looks like a ``.md`` file name.

    Only the message text is inspected; the files actually touched by the
    commit are not consulted. Matching is case-insensitive.

    Args:
        commit_message: The commit message text; may be None or empty.

    Returns:
        The first ``re.Match`` (group 1 holds the matched token), or None
        when nothing matches or no message was given.
    """
    if not commit_message:
        # re.search raises TypeError on None; a missing message mentions nothing.
        return None
    md_file_pattern = re.compile(r'\b(\S+\.md)\b', re.IGNORECASE)
    return md_file_pattern.search(commit_message)

# Function to parse the commit datetime string
def parse_commit_datetime(datetime_str):
    """Parse a commit timestamp string into a timezone-aware UTC datetime.

    Args:
        datetime_str: Timestamp with a UTC offset, e.g.
            "2023-07-06T11:20:49.000-05:00". The fractional-seconds part
            is optional.

    Returns:
        The parsed instant converted to UTC, or None when the value is not
        a string or matches neither accepted layout.
    """
    # e.g. "CommitAt": "2023-07-06T11:20:49.000-05:00"; the second layout
    # covers the same shape without fractional seconds.
    formats = ('%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S%z')
    for fmt in formats:
        try:
            dt = datetime.strptime(datetime_str, fmt)
        except ValueError:
            continue
        except TypeError:
            # Not a string at all (e.g. None): nothing to parse.
            return None
        return dt.astimezone(pytz.utc)
    return None

## Opening snapshot files

In [81]:
import json
from pprint import pprint

# Snapshot files to load, oldest first
paths = [
    './snapshot_20230727/20230727_200003_commit_sharings.json',
    './snapshot_20230803/20230803_095317_commit_sharings.json',
    './snapshot_20230810/20230810_124807_commit_sharings.json',
    './snapshot_20230817/20230817_131244_commit_sharings.json',
    './snapshot_20230824/20230824_102435_commit_sharings.json',
    './snapshot_20230831/20230831_063412_commit_sharings.json',
    './snapshot_20230907/20230907_110036_commit_sharings.json',
    './snapshot_20230914/20230914_083202_commit_sharings.json',
    './snapshot_20231012/20231012_230826_commit_sharings.json',
]

data = []       # every commit from every snapshot, tagged with its snapshot name
snapshots = []  # not populated in this cell

# Load the JSON data from each path and add the snapshot name to each commit
for path in paths:
    # JSON is UTF-8 by specification; don't rely on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        d = json.load(f)
    snapshot_name = path.split('/')[-1]  # Extract snapshot name from path
    for commit in d['Sources']:
        commit['Snapshot'] = snapshot_name
        data.append(commit)

## Running

In [84]:
import pandas as pd

# Keep only the commits that pass filter_commits (date cutoff + .md mention in the message)
filtered_commits = filter_commits(data)

# Tabulate the surviving commits, restricted to the columns of interest
df = pd.DataFrame(
    filtered_commits,
    columns=['Snapshot', 'RepoName', 'Message', 'CommitAt'],
)

# Dump the whole table as plain text, leaving out the index column
print(df.to_string(index=False))

                            Snapshot                              RepoName                                                                                                                         Message                      CommitAt
20230727_200003_commit_sharings.json                            Hack23/cia                               Update dashboard.md\n\nchatlog https://chat.openai.com/share/67ff0200-dad4-48f2-884e-ccada57974f6 2023-07-09T11:33:06.000+02:00
20230727_200003_commit_sharings.json                    tisztamo/vueyourcv                                          Update README.md\n\nhttps://chat.openai.com/share/be79a950-1231-4e55-aae0-2a90d8962d1d 2023-06-26T13:49:10.000+02:00
20230803_095317_commit_sharings.json                            Hack23/cia                               Update dashboard.md\n\nchatlog https://chat.openai.com/share/67ff0200-dad4-48f2-884e-ccada57974f6 2023-07-09T11:33:06.000+02:00
20230803_095317_commit_sharings.json                    tisztamo/vue