In [8]:
import pandas as pd 
import csv
import logging
import requests
import time
import signal
import sys
import os.path
import concurrent.futures
import threading
from datetime import datetime, timedelta
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from requests.exceptions import RetryError
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor

In [9]:
# Functions to fetch GitHub repositories using the GitHub API.
# divide_date_range splits the given date range into smaller consecutive intervals:
def divide_date_range(start_date, end_date, interval):
    """Split an inclusive date range into consecutive, non-overlapping intervals.

    Every interval ends at most ``interval`` days after it starts, and the next
    interval begins one day after the previous one ends, so both bounds of each
    returned pair are meant to be treated as inclusive.

    Args:
        start_date: First day of the range (``datetime``).
        end_date: Last day of the range (``datetime``), inclusive.
        interval: Maximum number of days between an interval's start and its
            end. Must not be negative.

    Returns:
        A chronologically ordered list of ``(interval_start, interval_end)``
        tuples. Empty when ``start_date`` is later than ``end_date``.

    Raises:
        ValueError: If ``interval`` is negative, because the walk through the
            range could then never advance.
    """
    if interval < 0:
        raise ValueError("interval must be a non-negative number of days")

    intervals = []
    current_date = start_date
    # ``<=`` rather than ``<``: when the previous interval ended the day before
    # ``end_date``, the final day still needs its own (single-day) interval.
    # It also makes a range with start_date == end_date yield one interval.
    while current_date <= end_date:
        next_date = min(current_date + timedelta(days=interval), end_date)
        intervals.append((current_date, next_date))
        # Bounds are inclusive, so the next interval starts one day later.
        current_date = next_date + timedelta(days=1)
    return intervals

In [10]:
# fetch_repositories retrieves, page by page, the repositories created within a date range from the GitHub search API:
def _wait_for_rate_limit_reset(response_headers):
    """Sleep until the rate-limit window resets when no requests remain.

    Returns immediately when either rate-limit header is missing or is not an
    integer, or when at least one request is still available.
    """
    try:
        remaining_requests = int(response_headers["X-RateLimit-Remaining"])
        reset_time = int(response_headers["X-RateLimit-Reset"])
    except (KeyError, TypeError, ValueError):
        # No usable rate-limit information on this response: nothing to wait for.
        return

    if remaining_requests > 0:
        return

    # One extra second so the next request lands after the reset, not on it.
    sleep_time = max(reset_time - time.time(), 0) + 1
    logging.info(f"Rate limit reached. Sleeping for {sleep_time} seconds...")
    time.sleep(sleep_time)


def fetch_repositories(start_date, end_date, token, repositories_limit):
    """Fetch repositories created in a date range from the GitHub search API.

    Results are requested most-starred first, 100 per page, until one of the
    following happens:

    * at least ``repositories_limit`` repositories were collected,
    * the API has no more results for the query (empty page, reported total
      reached, or the search API's 1,000-result cap reached),
    * ``end_date <= start_date`` (only the first page is fetched), or
    * too many consecutive requests failed.

    Args:
        start_date: ``datetime`` whose date is the lower bound of ``created:``.
        end_date: ``datetime`` whose date is the upper bound of ``created:``.
        token: GitHub token sent as a ``Bearer`` authorization header.
        repositories_limit: Stop once this many repositories were collected.
            Pages are appended whole, so the result may exceed the limit by
            up to one page.

    Returns:
        A list of repository dicts as returned in the API's ``items`` field.
        On repeated HTTP failures the repositories collected so far are
        returned instead of retrying forever.
    """
    base_url = "https://api.github.com/search/repositories"
    headers = {"Authorization": f"Bearer {token}"}
    per_page = 100
    max_search_results = 1000  # the search API serves at most 1,000 results per query
    max_consecutive_failures = 5
    backoff_factor = 0.5
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    page = 1
    repositories = []
    consecutive_failures = 0

    # Transport-level retries with exponential backoff for transient statuses.
    retries = Retry(total=5, backoff_factor=backoff_factor, status_forcelist=[429, 500, 502, 503, 504])

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount(base_url, HTTPAdapter(max_retries=retries))

        while len(repositories) < repositories_limit:
            params = {
                "q": f"created:{start_date.date()}..{end_date.date()}",
                "sort": "stars",
                "order": "desc",
                "per_page": per_page,
                "page": page,
            }

            response = None
            try:
                response = session.get(base_url, headers=headers, params=params, timeout=request_timeout)
                response.raise_for_status()
                consecutive_failures = 0

                data = response.json()
                items = data.get("items", [])
                repositories.extend(items)

                reported_total = data.get("total_count")
                if reported_total is None:
                    available = max_search_results
                else:
                    available = min(reported_total, max_search_results)

                # Ceiling division: an exact multiple of per_page must not add a page.
                expected = min(available, repositories_limit)
                total_pages = max(-(-expected // per_page), 1)
                logging.info(f"Processed page {page}/{total_pages} | Retrieved repositories: {len(repositories)}")

                if len(repositories) >= repositories_limit or end_date <= start_date:
                    logging.info("We Gonna Stop Now , Byy !!!!!")
                    break

                # Without this check a range holding fewer repositories than the
                # limit would keep requesting empty pages and never terminate.
                if not items or len(repositories) >= available or page * per_page >= max_search_results:
                    logging.info(f"No more results for this range after page {page}.")
                    break

                page += 1

            except (requests.exceptions.HTTPError, requests.exceptions.RetryError) as err:
                # RetryError is what the adapter raises once the statuses in
                # status_forcelist have exhausted their retries; HTTPError covers
                # every other 4xx/5xx reported by raise_for_status().
                consecutive_failures += 1
                logging.error(f"Failed to retrieve repositories from page {page}: {err}")

                if consecutive_failures >= max_consecutive_failures:
                    logging.error(
                        f"Giving up on page {page} after {consecutive_failures} consecutive failures."
                    )
                    break

                # Explicit exponential backoff (0.5s, 1s, 2s, 4s) before retrying
                # the same page.
                time.sleep(backoff_factor * (2 ** (consecutive_failures - 1)))

            # A response may be absent when the request itself raised.
            if response is not None:
                _wait_for_rate_limit_reset(response.headers)

    return repositories

In [11]:
#Get repositories within date range is used to fetch repositories in a given date range :
def get_repositories_within_date_range(start_date, end_date, token, filename, interval,
                                       repositories_limit=950, pause_seconds=20):
    """Fetch repositories for a whole date range, one sub-interval at a time.

    The range is split with ``divide_date_range`` and each sub-interval is
    fetched with ``fetch_repositories``. If the run is interrupted, whether by
    Ctrl-C or by an unexpected error, the repositories collected so far are
    written to ``filename`` so the work is not lost.

    Args:
        start_date: First day of the range (``datetime``).
        end_date: Last day of the range (``datetime``).
        token: GitHub API token passed through to ``fetch_repositories``.
        filename: CSV path used to save progress when the run is interrupted.
        interval: Length in days of each sub-interval.
        repositories_limit: Per-interval cap passed to ``fetch_repositories``.
        pause_seconds: Pause after an interval that reached the cap, to ease
            pressure on the API rate limit.

    Returns:
        The list of all repository dicts collected. After a keyboard
        interruption this is the partial list gathered so far.

    Raises:
        Exception: Any non-keyboard error from fetching is re-raised after the
            partial results have been saved.
    """
    intervals = divide_date_range(start_date, end_date, interval)
    repositories = []

    try:
        for idx, (start, end) in enumerate(intervals):
            logging.info(f"Processing interval {idx+1}/{len(intervals)}: {start.date()} to {end.date()} ---------------------------------------------------------Enjoy-----------------------------")
            interval_repositories = fetch_repositories(start, end, token, repositories_limit)
            repositories.extend(interval_repositories)

            # Compare this interval's count, not the running total: the total
            # passes the limit after the first busy interval and would then
            # trigger the pause after every interval, busy or not.
            is_last_interval = idx == len(intervals) - 1
            if len(interval_repositories) >= repositories_limit and not is_last_interval:
                logging.info(f"Reached the repositories limit {repositories_limit}. Sleeping for {pause_seconds} seconds --------- !")
                time.sleep(pause_seconds)

    except KeyboardInterrupt:
        logging.info("Keyboard interruption detected. Saving progress and exiting --------- !!")
        save_repositories_to_csv(repositories, filename)  # Save progress before exiting

    except Exception:
        # A long crawl should not lose everything to one late failure.
        logging.exception("Unexpected error while fetching repositories. Saving progress before re-raising.")
        if repositories:
            save_repositories_to_csv(repositories, filename)
        raise

    return repositories

In [12]:
#Get Repo Information
def get_repository_info(repository):
    """Extract the fields exported to CSV from one repository record.

    Fields absent from ``repository`` are reported as ``None`` instead of
    raising, and a missing or null ``owner`` yields ``None`` for both ``owner``
    and ``ownertype``, so one incomplete record cannot abort a whole export.

    Args:
        repository: Mapping describing one repository (an element of the
            search API's ``items`` list).

    Returns:
        A dict with the 19 export columns, in export order. The
        ``contributers_url`` column (spelling kept for compatibility with
        existing CSV files) is read from the ``contributors_url`` field.
    """
    # ``or {}`` covers both a missing key and an explicit null owner.
    owner = repository.get("owner") or {}

    repository_info = {
        "id": repository.get("id"),
        "url": repository.get("url"),
        "name": repository.get("name"),
        "owner": owner.get("login"),
        "ownertype": owner.get("type"),
        "created_at": repository.get("created_at"),
        "updated_at": repository.get("updated_at"),
        "pushed_at": repository.get("pushed_at"),
        "language": repository.get("language"),
        "has_issues": repository.get("has_issues"),
        "stargazers_count": repository.get("stargazers_count"),
        "open_issues_count": repository.get("open_issues_count"),
        "description": repository.get("description"),
        "archive_url": repository.get("archive_url"),
        "forks": repository.get("forks"),
        "topics": repository.get("topics"),
        "license": repository.get("license"),
        "allow_forking": repository.get("allow_forking"),
        "contributers_url": repository.get("contributors_url"),
    }
    return repository_info
# Save repositories to CSV file
def save_repositories_to_csv(repositories, filename):
    """Write one CSV row per repository to ``filename``.

    The file is created or overwritten, encoded as UTF-8, and starts with a
    header row. Each repository is reduced to the exported columns by
    ``get_repository_info`` before being written.

    Args:
        repositories: Iterable of repository records.
        filename: Path of the CSV file to write.
    """
    column_names = [
        "id", "url", "name", "owner", "ownertype",
        "created_at", "updated_at", "pushed_at",
        "language", "has_issues", "stargazers_count", "open_issues_count",
        "description", "archive_url", "forks", "topics",
        "license", "allow_forking", "contributers_url",
    ]

    with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()
        # Rows are produced lazily, one record at a time, as they are written.
        csv_writer.writerows(get_repository_info(entry) for entry in repositories)

        logging.info(f"Repositories saved to {filename}")

In [13]:
#Usage Function
def main():
    """Fetch repositories created in a fixed date range and save them as CSV.

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable.
    When it is not set, an error is logged and nothing is fetched.
    """
    # Local import so this function does not depend on how ``os`` happens to be
    # imported elsewhere.
    import os

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    # SECURITY: credentials must never be committed with the code. A token that
    # was ever hard-coded here should be treated as leaked and revoked.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        logging.error("GITHUB_TOKEN environment variable is not set. Exiting...")
        return

    # Set the start and end dates for the range
    start_date = datetime(2015, 1, 1)
    end_date = datetime(2023, 5, 1)

    # Set the filename for saving the CSV file
    filename = "repositories2.csv"

    # Set the interval for fetching repositories (in days)
    interval = 30

    repositories = get_repositories_within_date_range(start_date, end_date, token, filename, interval)

    if repositories:
        save_repositories_to_csv(repositories, filename)
    else:
        logging.info("No repositories found within the specified date range. Exiting...")

# Entry point: run the fetch-and-save pipeline only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

2024-01-04 18:52:59,310 - INFO - Processing interval 1/99: 2015-01-01 to 2015-01-31 ---------------------------------------------------------Enjoy-----------------------------
2024-01-04 18:53:01,947 - INFO - Processed page 1/10 | Retrieved repositories: 100
2024-01-04 18:53:04,229 - INFO - Processed page 2/10 | Retrieved repositories: 200
2024-01-04 18:53:06,626 - INFO - Processed page 3/10 | Retrieved repositories: 300
2024-01-04 18:53:08,959 - INFO - Processed page 4/10 | Retrieved repositories: 400
2024-01-04 18:53:11,680 - INFO - Processed page 5/10 | Retrieved repositories: 500
2024-01-04 18:53:13,989 - INFO - Processed page 6/10 | Retrieved repositories: 600
2024-01-04 18:53:16,727 - INFO - Processed page 7/10 | Retrieved repositories: 700
2024-01-04 18:53:19,063 - INFO - Processed page 8/10 | Retrieved repositories: 800
2024-01-04 18:53:21,604 - INFO - Processed page 9/10 | Retrieved repositories: 900
2024-01-04 18:53:24,469 - INFO - Processed page 10/10 | Retrieved repositorie