# setup

In [4]:
from dotenv import load_dotenv
import os
import json
import requests
import pandas as pd
import base64
from tqdm import tqdm
import re
import pandas as pd
import pytz
from datetime import datetime

In [9]:
# Load access token
load_dotenv()
gh_token = os.getenv('ACCESS_TOKEN')
print(gh_token)

# GitHub API setup
headers = {'Authorization': f'token {gh_token}'}

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


# Investigating commits

In [18]:
# Convert the date to a datetime object
cutoff_date = datetime(2022, 11, 30, tzinfo=pytz.utc)

In [19]:
# Function to filter commits by date, and check for .md file changes
# Also counts the number of commits per snapshot
snapshot_counts = {}

def filter_commits(data):
    filtered_commits = []
    for commit in data:
        if is_commit_after_cutoff_date(commit['CommitAt'], cutoff_date) and \
           is_commit_modifying_md_files(commit['Message']):
            filtered_commits.append(commit)
            snapshot_counts[commit['Snapshot']]['filtered'] += 1
    return filtered_commits

# Function to check if a commit is after the cutoff date
def is_commit_after_cutoff_date(commit_datetime_str, cutoff_date):
    commit_datetime = parse_commit_datetime(commit_datetime_str)
    return commit_datetime and commit_datetime > cutoff_date

# Function to check if a commit message indicates modification of .md files
def is_commit_modifying_md_files(commit_message):
    md_file_pattern = re.compile(r'\b(\S+\.md)\b', re.IGNORECASE)
    return md_file_pattern.search(commit_message)

# Function to parse the commit datetime string
def parse_commit_datetime(datetime_str):
    try:
        # e.g. "CommitAt": "2023-07-06T11:20:49.000-05:00",
        dt = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S.%f%z')
        return dt.astimezone(pytz.utc)
    except ValueError:
        return None

## Opening snapshot files

In [20]:
import json

# Paths to the JSON files
paths = [
    './snapshot_20230727/20230727_200003_commit_sharings.json',
    './snapshot_20230803/20230803_095317_commit_sharings.json',
    './snapshot_20230810/20230810_124807_commit_sharings.json',
    './snapshot_20230817/20230817_131244_commit_sharings.json',
    './snapshot_20230824/20230824_102435_commit_sharings.json',
    './snapshot_20230831/20230831_063412_commit_sharings.json',
    './snapshot_20230907/20230907_110036_commit_sharings.json',
    './snapshot_20230914/20230914_083202_commit_sharings.json',
    './snapshot_20231012/20231012_230826_commit_sharings.json',
]

data = []

# Load the JSON data from each path and add the snapshot name to each commit
for path in paths:
    with open(path) as f:
        d = json.load(f)
        snapshot_name = path.split('/')[-1]  # Extract snapshot name from path
        commits = d['Sources']
        snapshot_counts[snapshot_name] = {'original': len(commits), 'filtered': 0}
        for commit in commits:
            commit['Snapshot'] = snapshot_name
            data.append(commit)

print('Total original size: ', len(data))


Total original size:  3245


## Running

filter and create dataframe

In [21]:
import pandas as pd

# Filter the commits based on the cutoff date and .md file modifications
filtered_commits = filter_commits(data)

df = pd.DataFrame(filtered_commits, columns=['Snapshot', 'RepoName', 'Message', 'CommitAt', 'URL'])

remove duplicated commits from different snapshots (optional)

In [22]:
# Sort by snapshot date in descending order to prefer the latest snapshot
df['Snapshot'] = pd.to_datetime(df['Snapshot'].str.extract(r'(\d{8})')[0], format='%Y%m%d')
df = df.sort_values(by='Snapshot', ascending=False)

# Remove duplicate commits based on the 'URL' column, keeping the latest snapshot
df = df.drop_duplicates(subset='URL', keep='first')


print the number of filtered commits from each snapshot

In [23]:
# Print the original and filtered commit counts for each snapshot
print("\nOriginal and filtered commit counts per snapshot:")
for snapshot, counts in snapshot_counts.items():
    print(f"{snapshot}: Original={counts['original']}, Filtered={counts['filtered']}")


Original and filtered commit counts per snapshot:
20230727_200003_commit_sharings.json: Original=179, Filtered=2
20230803_095317_commit_sharings.json: Original=215, Filtered=4
20230810_124807_commit_sharings.json: Original=305, Filtered=5
20230817_131244_commit_sharings.json: Original=200, Filtered=7
20230824_102435_commit_sharings.json: Original=200, Filtered=9
20230831_063412_commit_sharings.json: Original=481, Filtered=11
20230907_110036_commit_sharings.json: Original=400, Filtered=14
20230914_083202_commit_sharings.json: Original=571, Filtered=11
20231012_230826_commit_sharings.json: Original=694, Filtered=16


print and export to csv

In [92]:
from tabulate import tabulate

# Print the filtered commits as a nicely formatted table
print(tabulate(df, headers='keys', tablefmt='psql'))

# Export the filtered commits to a CSV file
df.to_csv('filtered_commits.csv', index=False)

+----+---------------------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+-------------------------------+----------------------------------------------------------------------------------------------------------+
|    | Snapshot            | RepoName                              | Message                                                                                                                         | CommitAt                      | URL                                                                                                      |
|----+---------------------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+-------------------------------+----------------------------------------------------------------------------------------------

# adding a list of modified files

loading github api access token

In [13]:
# Load the data
csv_path = './filtered_commits.csv'
df = pd.read_csv(csv_path)

In [14]:
# Function to fetch modified files from a commit
def get_modified_files(repo_name, commit_sha):
    url = f'https://api.github.com/repos/{repo_name}/commits/{commit_sha}'
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        commit_data = response.json()

        # Only retrieve modified files with .md
        modified_files = [file['filename'] for file in commit_data['files'] if '.md' in file['filename']]
        return modified_files
    else:
        print(f"Failed to fetch commit {commit_sha} from repo {repo_name}: {response.status_code}")
        return []

In [15]:
# Process each commit
modified_files_list = []
for _, row in tqdm(df.iterrows()):
    repo_name = row['RepoName']
    commit_url = row['URL']
    commit_sha = commit_url.split('/')[-1]

    modified_files = get_modified_files(repo_name, commit_sha)
    modified_files_list.append(modified_files)

0it [00:00, ?it/s]

16it [00:10,  1.57it/s]


In [16]:
# Add the modified files to the DataFrame
df['ModifiedFiles'] = modified_files_list

# Save the updated DataFrame to a new CSV
df.to_csv('updated_commits_with_modified_files2.csv', index=False)

# retrieving the modified files and saving

In [17]:
# Load the data
csv_path = 'updated_commits_with_modified_files2.csv'
df = pd.read_csv(csv_path)

In [18]:
# Ensure the 'ModifiedFiles' column is read as a list of strings
df['ModifiedFiles'] = df['ModifiedFiles'].apply(eval)

# Define the base directory where the files will be saved
base_dir = 'project_files4'

# Create the base directory if it doesn't exist
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [19]:
# Function to sanitize file paths for Windows
def sanitize_path(path):
    return re.sub(r'[<>:"/\\|?*]', '_', path)

# Function to fetch and save the content of a modified file from a commit
def fetch_and_save_file_content(repo_name, commit_sha, filename):
    file_url = f'https://raw.githubusercontent.com/{repo_name}/{commit_sha}/{filename}'
    response = requests.get(file_url, headers=headers)

    if response.status_code == 200:
        # Sanitize the directory and file paths
        sanitized_repo_name = sanitize_path(repo_name)
        sanitized_commit_sha = sanitize_path(commit_sha)
        sanitized_filename = sanitize_path(filename)

        # Create directories for the file path if they don't exist
        file_path = os.path.join(base_dir, sanitized_repo_name, sanitized_commit_sha, sanitized_filename)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # Save the content to the file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"Saved: {file_path}")
    else:
        print(f"Failed to fetch {filename} from commit {commit_sha} in repo {repo_name}: {response.status_code}")


In [20]:
# Iterate over each row in the DataFrame
for _, row in tqdm(df.iterrows()):
    repo_name = row['RepoName']
    commit_url = row['URL']
    commit_sha = commit_url.split('/')[-1]
    modified_files = row['ModifiedFiles']

    for filename in modified_files:
        # Ugly solution to skip non-markdown files
        if '.md' not in filename:
            continue

        fetch_and_save_file_content(repo_name, commit_sha, filename)

print("All files have been fetched and saved to the local project folder.")

1it [00:00,  1.51it/s]

Saved: project_files4\tisztamo_Junior\6f411731d6d2445f7c91bd2cedd481e67f5ce135\prompt.md
Saved: project_files4\tisztamo_Junior\8ba2f3d4ee07003a2c2a9b4096c09fbeef8fd389\docs_roadmap.md


2it [00:01,  1.04it/s]

Saved: project_files4\tisztamo_Junior\8ba2f3d4ee07003a2c2a9b4096c09fbeef8fd389\prompt.md


3it [00:02,  1.22it/s]

Saved: project_files4\Hajaradnan_Hajaradnan\ec6241ca4370129bce7a021a4f80dff123cde103\README.md


4it [00:03,  1.40it/s]

Saved: project_files4\eshreyareddy_prompting-zomato-reviews\075750d66c6392147b7cccca28f7c225d9fa6545\README.md


5it [00:03,  1.52it/s]

Saved: project_files4\tisztamo_vueyourcv\6d454ee9b01535bb252bcbdf2e0abf13b6bcda59\README.md


6it [00:04,  1.51it/s]

Saved: project_files4\Hack23_cia\7b8639cb17b0da8317152148c1376851cf0832ee\dashboard.md


7it [00:04,  1.60it/s]

Saved: project_files4\tisztamo_Junior\9dac0b6b6797fb6d62cf41369227d0138f4397a5\docs_descriptor.md
Saved: project_files4\tisztamo_Junior\37541a0776f2c369d5ec3c888c39beef1256e0b5\docs_README.md


8it [00:06,  1.20it/s]

Saved: project_files4\tisztamo_Junior\37541a0776f2c369d5ec3c888c39beef1256e0b5\prompt.md
Saved: project_files4\tisztamo_Junior\fd0b5d400c7c86436abe3a2207ce61b9f9cf9c04\docs_README.md
Saved: project_files4\tisztamo_Junior\fd0b5d400c7c86436abe3a2207ce61b9f9cf9c04\docs_README.md.backup


9it [00:07,  1.12s/it]

Saved: project_files4\tisztamo_Junior\fd0b5d400c7c86436abe3a2207ce61b9f9cf9c04\prompt.md
Saved: project_files4\tisztamo_Junior\16f2f68315fd9a6a9a418a435196df592301509a\docs_usage.md


10it [00:09,  1.14s/it]

Saved: project_files4\tisztamo_Junior\16f2f68315fd9a6a9a418a435196df592301509a\prompt.md
Saved: project_files4\tisztamo_Junior\ed97ada1122ea1dee13ad3469b24c4cafb851f6c\README.md
Saved: project_files4\tisztamo_Junior\ed97ada1122ea1dee13ad3469b24c4cafb851f6c\prompt.md


11it [00:10,  1.34s/it]

Saved: project_files4\tisztamo_Junior\ed97ada1122ea1dee13ad3469b24c4cafb851f6c\prompt_format_shell.md
Saved: project_files4\tisztamo_Junior\d6f0d7ef512a35c1d348fbc419c212eb3688a217\docs_config_env_or_cli.md
Saved: project_files4\tisztamo_Junior\d6f0d7ef512a35c1d348fbc419c212eb3688a217\prompt_history_2023_09_29_16_26_Generate docs_config_env_or_cli.md for Junior configurations_change.sh
Saved: project_files4\tisztamo_Junior\d6f0d7ef512a35c1d348fbc419c212eb3688a217\prompt_history_2023_09_29_16_26_Generate docs_config_env_or_cli.md for Junior configurations_prompt.md


12it [00:13,  1.64s/it]

Saved: project_files4\tisztamo_Junior\d6f0d7ef512a35c1d348fbc419c212eb3688a217\prompt_history_2023_09_29_16_26_Generate docs_config_env_or_cli.md for Junior configurations_prompt.yaml
Saved: project_files4\tisztamo_Junior\c4c1c589da9229f9946dfb52d7f732d454ffe7ca\README.md


13it [00:14,  1.53s/it]

Saved: project_files4\tisztamo_Junior\c4c1c589da9229f9946dfb52d7f732d454ffe7ca\prompt.md
Saved: project_files4\tisztamo_Junior\f844aad4f267f749cf65afb618f188a4263d575e\docs_open_jobs.md
Saved: project_files4\tisztamo_Junior\f844aad4f267f749cf65afb618f188a4263d575e\prompt_history_2023_08_29_14_57_Update docs_open_jobs.md with project details_change.sh
Saved: project_files4\tisztamo_Junior\f844aad4f267f749cf65afb618f188a4263d575e\prompt_history_2023_08_29_14_57_Update docs_open_jobs.md with project details_prompt.md


14it [00:16,  1.77s/it]

Saved: project_files4\tisztamo_Junior\f844aad4f267f749cf65afb618f188a4263d575e\prompt_history_2023_08_29_14_57_Update docs_open_jobs.md with project details_prompt.yaml
Saved: project_files4\tisztamo_Junior\2addc5f9963ce394cf10dbd25ac27fad50cd1732\prompt.md


15it [00:17,  1.57s/it]

Saved: project_files4\tisztamo_Junior\2addc5f9963ce394cf10dbd25ac27fad50cd1732\prompt_system.md


16it [00:18,  1.15s/it]

Saved: project_files4\tisztamo_Junior\e2ea4fd627f1ccba0f50694dc0d9e94254657183\docs_descriptor.md
All files have been fetched and saved to the local project folder.





# retrieving commits containing the modified files before chatgpt releases

In [21]:
from datetime import datetime

# Load the data
csv_path = 'updated_commits_with_modified_files2.csv'
df = pd.read_csv(csv_path)

# Ensure the 'ModifiedFiles' column is read as a list of strings
df['ModifiedFiles'] = df['ModifiedFiles'].apply(eval)

# Define the date threshold
date_threshold = datetime(2022, 10, 30)

In [22]:
# Function to fetch commit history of a specific file up to a specific date
def fetch_commit_history_for_file(repo_name, file_path, date_threshold):
    commits = []
    page = 1
    while True:
        # url = f'https://api.github.com/repos/{repo_name}/commits?path={file_path}&until={date_threshold.isoformat()}'
        url = f'https://api.github.com/repos/{repo_name}/commits?path={file_path}'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            break
        page_commits = response.json()
        if not page_commits:
            break
        commits.extend(page_commits)
        page += 1
    print(commits)
    return commits

# Function to find the commit with matching modified files
def find_matching_commit(repo_name, target_files, date_threshold):
    file_commits = {}
    for file in target_files:
        print(f"Fetching commit history for file: {file}")
        file_commits[file] = fetch_commit_history_for_file(repo_name, file, date_threshold)

    # Find the latest commit that modifies all target files
    common_commits = set(file_commits[target_files[0]])
    for file in target_files[1:]:
        common_commits.intersection_update(file_commits[file])

    for commit in sorted(common_commits, key=lambda x: x['commit']['author']['date'], reverse=True):
        sha = commit['sha']
        commit_url = commit['html_url']
        commit_date = commit['commit']['author']['date']
        commit_message = commit['commit']['message']
        return {
            'RepoName': repo_name,
            'Message': commit_message,
            'CommitAt': commit_date,
            'URL': commit_url,
            'ModifiedFiles': target_files
        }
    return None

In [23]:
import time


# Initialize a list to store the results
results = []

# Total number of rows for progress tracking
total_rows = len(df)
start_time = time.time()

# Iterate over each row in the DataFrame
for idx, row in df.iterrows():
    repo_name = row['RepoName']
    target_files = row['ModifiedFiles']

    print(f"Processing {idx + 1}/{total_rows}: {repo_name}")

    # Find the commit with matching modified files
    matching_commit = find_matching_commit(repo_name, target_files, date_threshold)

    if matching_commit:
        results.append(matching_commit)

    # Print progress
    elapsed_time = time.time() - start_time
    print(f"Processed {idx + 1}/{total_rows} in {elapsed_time:.2f} seconds")

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a new CSV file
results_csv_path = './matching_commits_before_chatgpt.csv'
results_df.to_csv(results_csv_path, index=False)

print(f"Matching commits have been saved to {results_csv_path}")

Processing 1/16: tisztamo/Junior
Fetching commit history for file: prompt.md


ConnectionError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/tisztamo/Junior/commits?path=prompt.md (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001E2B42D3DF0>: Failed to establish a new connection: [WinError 10051] A socket operation was attempted to an unreachable network'))