In [4]:
import json
import requests
from collections import defaultdict
import gzip
from io import BytesIO

def download_and_process_hourly_data(date, hour):
    url = f'https://data.gharchive.org/{date}-{hour}.json.gz'
    response = requests.get(url)
    events = []
    
    if response.status_code == 200:
        with gzip.open(BytesIO(response.content), 'r') as file:
            for line in file:
                event = json.loads(line)
                if event['type'] in ['PushEvent', 'PullRequestEvent', 'IssuesEvent']:
                    events.append(event['actor']['login'])
    return events

def count_active_developers(date):
    active_developers = set()
    
    for hour in range(24):  
        hourly_developers = download_and_process_hourly_data(date, f'{hour:02d}')
        active_developers.update(hourly_developers)
    
    return len(active_developers)

date = '2022-07-01'
num_active_developers = count_active_developers(date)
print(num_active_developers)


225716


In [None]:
import json
import requests
from collections import defaultdict
import gzip
from io import BytesIO
from datetime import datetime
import pandas as pd

def download_and_process_hourly_data(date, hour):
    url = f'https://data.gharchive.org/{date}-{hour}.json.gz'
    response = requests.get(url)
    issues = []
    
    if response.status_code == 200:
        with gzip.open(BytesIO(response.content), 'r') as file:
            for line in file:
                event = json.loads(line)
                if event['type'] == 'IssuesEvent':
                    issue = {
                        'created_at': event['created_at'],
                        'closed_at': event['issue']['closed_at'] if event['issue']['closed_at'] else None
                    }
                    issues.append(issue)
    return issues

def collect_issues(date):
    all_issues = []
    
    for hour in range(24):  
        hourly_issues = download_and_process_hourly_data(date, f'{hour:02d}')
        all_issues.extend(hourly_issues)
    
    return all_issues

def process_data(date):
    issues = collect_issues(date)
    df = pd.DataFrame(issues)
    
    # Convert date columns to datetime
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['closed_at'] = pd.to_datetime(df['closed_at'])
    
    # Calculate metrics
    issues_opened = df.shape[0]
    issues_closed = df[df['closed_at'].notna()].shape[0]
    avg_time_to_close = (df['closed_at'] - df['created_at']).dt.total_seconds().mean() / 3600 if issues_closed > 0 else None
    
    return {
        'issues_opened': issues_opened,
        'issues_closed': issues_closed,
        'avg_time_to_close': avg_time_to_close
    }

# Dates to compare
dates = ['2019-07-01', '2022-07-01']
metrics = {}

for date in dates:
    metrics[date] = process_data(date)

# Print results
for date, metric in metrics.items():
    print(f"Metrics for {date}:")
    print(f"Number of issues opened: {metric['issues_opened']}")
    print(f"Number of issues closed: {metric['issues_closed']}")
    print(f"Average time to close issues (hours): {metric['avg_time_to_close']:.2f}" if metric['avg_time_to_close'] else "Average time to close issues: No closed issues")
    print()



In [None]:
import json
import requests
from collections import defaultdict
import gzip
from io import BytesIO
from datetime import datetime
import pandas as pd

def download_and_process_hourly_data(date, hour):
    url = f'https://data.gharchive.org/{date}-{hour}.json.gz'
    response = requests.get(url)
    forks = []
    
    if response.status_code == 200:
        with gzip.open(BytesIO(response.content), 'r') as file:
            for line in file:
                event = json.loads(line)
                if event['type'] == 'ForkEvent':
                    forks.append(event)
    return forks

def collect_forks(date):
    all_forks = []
    
    for hour in range(24):  
        hourly_forks = download_and_process_hourly_data(date, f'{hour:02d}')
        all_forks.extend(hourly_forks)
    
    return all_forks

def process_data(date):
    forks = collect_forks(date)
    df = pd.DataFrame(forks)
    
    # Extract relevant data
    df['created_at'] = pd.to_datetime([event['created_at'] for event in forks])
    
    # Calculate metrics
    num_forks = df.shape[0]
    
    # Compute the growth rate of forks if needed
    # In this context, we're assuming simple year-over-year comparison

    return {
        'num_forks': num_forks,
        'date': date
    }

# Dates to compare
dates = ['2019-07-01', '2022-07-01']
metrics = {}

for date in dates:
    metrics[date] = process_data(date)

# Print results
for date, metric in metrics.items():
    print(f"Metrics for {date}:")
    print(f"Number of forks: {metric['num_forks']}")
    print()

# Calculate growth rate if needed
forks_2019 = metrics['2019-07-01']['num_forks']
forks_2022 = metrics['2022-07-01']['num_forks']
growth_rate = ((forks_2022 - forks_2019) / forks_2019) * 100 if forks_2019 > 0 else None

print(f"Growth Rate of Forks from 2019 to 2022: {growth_rate:.2f}%") if growth_rate is not None else print("No data available for 2019 to calculate growth rate.")

# You can add visualization code here if desired


In [None]:
import json
import requests
import gzip
from io import BytesIO
from datetime import datetime
import pandas as pd

def download_and_process_hourly_data(date, hour):
    url = f'https://data.gharchive.org/{date}-{hour}.json.gz'
    response = requests.get(url)
    pushes = []
    
    if response.status_code == 200:
        with gzip.open(BytesIO(response.content), 'r') as file:
            for line in file:
                event = json.loads(line)
                if event['type'] == 'PushEvent':
                    push_data = {
                        'created_at': event['created_at'],
                        'actor': event['actor']['login'],  # Developer pushing the code
                        'repo_id': event['repo']['id']    # Repository ID to track team
                    }
                    pushes.append(push_data)
    return pushes

def collect_pushes(date):
    all_pushes = []
    
    for hour in range(24):  
        hourly_pushes = download_and_process_hourly_data(date, f'{hour:02d}')
        all_pushes.extend(hourly_pushes)
    
    return all_pushes

def process_data(date):
    pushes = collect_pushes(date)
    df = pd.DataFrame(pushes)
    
    # Convert the 'created_at' field to a datetime format
    df['created_at'] = pd.to_datetime(df['created_at'])
    
    # Calculate pushes per developer and per repository
    pushes_per_dev = df.groupby('actor').size()
    pushes_per_repo = df.groupby('repo_id').size()
    
    # Calculate total pushes and average frequency per developer/repository
    total_pushes = df.shape[0]
    avg_pushes_per_dev = pushes_per_dev.mean()
    avg_pushes_per_repo = pushes_per_repo.mean()
    
    # Resample data for monthly frequency to find push frequency patterns
    monthly_pushes = df.resample('M', on='created_at').size()
    
    return {
        'total_pushes': total_pushes,
        'avg_pushes_per_dev': avg_pushes_per_dev,
        'avg_pushes_per_repo': avg_pushes_per_repo,
        'monthly_pushes': monthly_pushes
    }

# Dates to compare
dates = ['2019-07-01', '2022-07-01']
metrics = {}

for date in dates:
    metrics[date] = process_data(date)

# Print results for total pushes and averages
for date, metric in metrics.items():
    print(f"Metrics for {date}:")
    print(f"Total number of pushes: {metric['total_pushes']}")
    print(f"Average pushes per developer: {metric['avg_pushes_per_dev']:.2f}")
    print(f"Average pushes per repository: {metric['avg_pushes_per_repo']:.2f}")
    print(f"Monthly push distribution: \n{metric['monthly_pushes']}")
    print()

# Analyze push frequency trends over time (optional visualization could be added)
pushes_2019 = metrics['2019-07-01']['monthly_pushes']
pushes_2022 = metrics['2022-07-01']['monthly_pushes']

# Growth in push frequency
growth_rate_pushes = ((metrics['2022-07-01']['total_pushes'] - metrics['2019-07-01']['total_pushes']) / metrics['2019-07-01']['total_pushes']) * 100 if metrics['2019-07-01']['total_pushes'] > 0 else None

print(f"Growth Rate of Pushes from 2019 to 2022: {growth_rate_pushes:.2f}%") if growth_rate_pushes is not None else print("No data available for 2019 to calculate growth rate.")
