# investigating commits

## setup - importing json and keyword txt

In [46]:
import json
from pprint import pprint

# Load the commit-sharing snapshot. JSON is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform default.
with open('./snapshot_20230727/20230727_200003_commit_sharings.json', encoding='utf-8') as f:
    d = json.load(f)

# The 'Sources' entry holds the commit records used by the rest of the notebook.
data = d['Sources']

In [47]:
# Read keywords from the text file, one keyword per line.
# Blank lines are skipped: an empty keyword joined into a regex alternation
# becomes an empty alternative, which matches every commit message.
with open('keywords.txt', 'r') as file:
    stripped_lines = (line.strip() for line in file)
    keywords = [keyword for keyword in stripped_lines if keyword]

# For checking
# print(keywords)

# filtering

In [48]:
import pytz
from datetime import datetime
import re

In [49]:
# Cutoff instant as a timezone-aware datetime: 30 Nov 2022, 00:00 UTC
cutoff_date = datetime(year=2022, month=11, day=30, tzinfo=pytz.utc)

## utility functions

In [53]:
# Function to filter commits by date and keywords
def filter_commits_by_date_and_keywords(data, keywords, cutoff_date=None):
    """Return the commits that are newer than the cutoff and mention a keyword.

    Args:
        data: Iterable of commit records; each record is indexed with the
            keys 'CommitAt' (timestamp string) and 'Message' (commit text).
        keywords: Iterable of literal keyword strings. Each keyword is
            regex-escaped, so characters such as '+' or '.' are matched
            literally. Empty / whitespace-only keywords are ignored.
        cutoff_date: Optional timezone-aware datetime. Only commits strictly
            after this instant are kept. Defaults to 2022-11-30 00:00 UTC.

    Returns:
        A list of the matching commit records, in their original order.
        If no usable keyword is supplied the result is an empty list
        (nothing can be "relevant" without keywords).
    """
    if cutoff_date is None:
        cutoff_date = datetime(2022, 11, 30, tzinfo=pytz.utc)

    # Escape keywords so they are matched as plain text, and drop empty ones:
    # an empty alternative in the pattern would match every message.
    escaped_keywords = [re.escape(keyword) for keyword in keywords
                        if keyword and keyword.strip()]
    if not escaped_keywords:
        return []

    relevant_keywords_pattern = r'({})'.format('|'.join(escaped_keywords))

    return [commit for commit in data
            if is_commit_after_cutoff_date(commit['CommitAt'], cutoff_date)
            and is_commit_relevant(commit['Message'], relevant_keywords_pattern)]

# Function to check if a commit is after the cutoff date
def is_commit_after_cutoff_date(commit_datetime_str, cutoff_date):
    """Return True if the commit timestamp is strictly later than the cutoff.

    Args:
        commit_datetime_str: Commit timestamp string, parsed with
            parse_commit_datetime.
        cutoff_date: Datetime to compare against. It must be timezone-aware,
            because the parsed commit time is timezone-aware and Python
            refuses to order naive against aware datetimes.

    Returns:
        bool: True when the timestamp parses and is after cutoff_date;
        False when it is not after the cutoff or cannot be parsed.
    """
    commit_datetime = parse_commit_datetime(commit_datetime_str)
    # An unparseable timestamp yields None; report False explicitly so the
    # function always returns a real bool instead of leaking None.
    if commit_datetime is None:
        return False
    return commit_datetime > cutoff_date

# Function to check if a commit message contains relevant keywords
def is_commit_relevant(commit_message, relevant_keywords_pattern):
    """Report whether the pattern occurs anywhere in the commit message.

    The pattern is applied case-insensitively and may match at any position
    in the message. Returns True on a match, False otherwise.
    """
    matcher = re.compile(relevant_keywords_pattern, re.IGNORECASE)
    return matcher.search(commit_message) is not None

# Function to parse the commit datetime string
def parse_commit_datetime(datetime_str):
    """Parse an ISO-8601 style commit timestamp into a UTC datetime.

    Accepts timestamps with a UTC offset, with or without fractional
    seconds, e.g. "2023-07-06T11:20:49.000-05:00" or
    "2023-07-06T11:20:49-05:00".

    Args:
        datetime_str: The timestamp string to parse.

    Returns:
        A timezone-aware datetime converted to UTC, or None when the value
        is not a string or matches none of the accepted formats.
    """
    # Local import: only the `datetime` class is imported at file level.
    # The stdlib UTC tzinfo compares correctly against any other
    # timezone-aware datetime.
    from datetime import timezone

    # e.g. "CommitAt": "2023-07-06T11:20:49.000-05:00",
    accepted_formats = (
        '%Y-%m-%dT%H:%M:%S.%f%z',
        # Same layout without fractional seconds, so such timestamps are
        # parsed instead of being silently dropped.
        '%Y-%m-%dT%H:%M:%S%z',
    )
    for fmt in accepted_formats:
        try:
            dt = datetime.strptime(datetime_str, fmt)
        except (ValueError, TypeError):
            # ValueError: format mismatch; TypeError: non-string input.
            continue
        return dt.astimezone(timezone.utc)
    return None

In [54]:
# Apply the date + keyword filter to the loaded commit records
filtered_commits = filter_commits_by_date_and_keywords(data, keywords)

# Show each surviving commit as three labelled lines followed by a separator
for commit in filtered_commits:
    print(
        f"Repository Name: {commit['RepoName']}",
        f"Commit Message: {commit['Message']}",
        f"Commit Date: {commit['CommitAt']}",
        "---",
        sep="\n",
    )

In [55]:
# Compare the number of records before and after filtering
# (author's note: all commits are beyond 2023)
original_count = len(data)
filtered_count = len(filtered_commits)
print('original length: ', original_count)
print('filtered length: ', filtered_count)

original length:  179
filtered length:  0
