In [1]:
import requests
import time
from pymongo import MongoClient
from tqdm import tqdm
from dotenv import load_dotenv
import os
from requests.exceptions import ConnectionError, Timeout, HTTPError
from http.client import RemoteDisconnected

# === Load environment variables ===
# NOTE(review): this is an absolute, machine-specific path to the .env file;
# it is kept as-is so the notebook keeps working on the author's machine.
load_dotenv(dotenv_path=r"C:\Users\madhu\OneDrive\Desktop\Project - Git Hub Recommendation\Git-Hub-Recommendation\password.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")
MONGO_URI = os.getenv("MONGO_URI")

# Fail fast: every later step needs both credentials.
if not GITHUB_TOKEN:
    raise ValueError("❌ GITHUB_API_KEY not found in environment variables.")
if not MONGO_URI:
    raise ValueError("❌ MONGO_URI not found in environment variables.")

# === Setup GitHub headers ===
# Sent with every GitHub API request made by the helpers below.
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# === MongoDB setup ===
client = MongoClient(MONGO_URI)
db = client["github_db"]
# Collection names must be non-empty; user documents live in "user_data".
collection = db["user_data"]

# === Helper Functions ===
def get_json(url, retries=3, backoff=5):
    """Fetch ``url`` with the shared GitHub headers and return the decoded JSON.

    Args:
        url: Absolute URL to GET.
        retries: Maximum number of attempts (rate-limit waits count as attempts).
        backoff: Seconds to sleep between attempts after a connection error.

    Returns:
        The parsed JSON body of the first successful response.

    Raises:
        Exception: If no attempt succeeds. The last underlying error, when
            there is one, is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            if response.status_code in (403, 429):  # primary or secondary rate limit
                # Prefer the server's own hint over a fixed pause; fall back to
                # 60 seconds when neither header is usable.
                wait = 60
                retry_after = response.headers.get("Retry-After")
                remaining = response.headers.get("X-RateLimit-Remaining")
                reset_at = response.headers.get("X-RateLimit-Reset")
                if retry_after and retry_after.isdigit():
                    wait = int(retry_after)
                elif remaining == "0" and reset_at and reset_at.isdigit():
                    # Reset is an epoch timestamp; +1 absorbs small clock skew.
                    wait = max(int(reset_at) - int(time.time()), 0) + 1
                if is_last_attempt:
                    # Nothing left to retry, so sleeping would only delay the failure.
                    print("⚠️ Rate limit reached. No attempts left.")
                    break
                print(f"⚠️ Rate limit reached. Sleeping for {wait} seconds.")
                time.sleep(wait)
                continue
            response.raise_for_status()
            return response.json()
        except (ConnectionError, RemoteDisconnected, Timeout) as e:
            last_error = e
            print(f"⚠️ Connection error on attempt {attempt+1}/{retries}: {e}")
            if not is_last_attempt:
                time.sleep(backoff)
        except HTTPError as e:
            # Other HTTP errors are treated as permanent: stop retrying.
            last_error = e
            print(f"⚠️ HTTP error: {e}")
            break
    raise Exception(f"❌ Failed to fetch {url} after {retries} attempts.") from last_error

def get_users_batch(since=0, per_page=100):
    """Fetch one page of the GitHub ``/users`` listing.

    ``since`` and ``per_page`` are interpolated into the query string and the
    decoded JSON from ``get_json`` is returned unchanged.
    """
    return get_json(
        f"https://api.github.com/users?since={since}&per_page={per_page}"
    )

def get_user_details(username):
    """Return the decoded JSON of ``/users/{username}`` as given by ``get_json``."""
    return get_json(f"https://api.github.com/users/{username}")

def get_list(url, key='login'):
    """Collect ``item[key]`` for every item in the JSON sequence at ``url``.

    Only what a single ``get_json(url)`` call returns is read; no further
    pages are requested here.
    """
    values = []
    for entry in get_json(url):
        values.append(entry[key])
    return values

def get_starred_or_subs(url):
    """Return the ``html_url`` of every item in the JSON sequence at ``url``.

    Only what a single ``get_json(url)`` call returns is read.
    """
    links = []
    for entry in get_json(url):
        links.append(entry['html_url'])
    return links

def get_languages(username):
    """Sum per-language values across the repositories listed for ``username``.

    For each repository returned by ``/users/{username}/repos`` the repo's
    ``languages_url`` is fetched and its ``{language: number}`` mapping is
    added into a running total (presumably bytes of code per GitHub's API;
    confirm against the API docs).

    Returns:
        dict mapping language name to the summed value.
    """
    totals = {}
    repo_list = get_json(f"https://api.github.com/users/{username}/repos")
    for repository in repo_list:
        breakdown = get_json(repository['languages_url'])
        for name in breakdown:
            totals[name] = totals.get(name, 0) + breakdown[name]
    return totals

def get_total_commits(username):
    """Count commits across the non-fork repositories listed for ``username``.

    Each repository's commits endpoint is requested with ``per_page=1``. With
    one commit per page, the page number in the ``Link: rel="last"`` header
    equals the number of commits, so the count is obtained without downloading
    the history. When there is no ``last`` link (zero or one commit) the
    length of the returned list is used instead.

    NOTE(review): this counts every commit in the endpoint's default listing,
    whoever authored it, and only covers the repositories returned by a single
    ``/users/{username}/repos`` call.

    Returns:
        int: Sum of the per-repository commit counts.
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import parse_qs, urlparse

    repos = get_json(f"https://api.github.com/users/{username}/repos")
    total = 0
    for repo in repos:
        if repo.get('fork'):
            continue
        url = repo['commits_url'].replace("{/sha}", "")
        response = requests.get(
            url, headers=HEADERS, params={'per_page': 1}, timeout=10
        )
        if response.status_code != 200:
            # Error responses carry a JSON object rather than a commit list;
            # they contribute nothing to the total.
            continue
        last_url = response.links.get('last', {}).get('url')
        if last_url:
            page_values = parse_qs(urlparse(last_url).query).get('page')
            if page_values and page_values[0].isdigit():
                total += int(page_values[0])
                continue
        commits = response.json()
        if isinstance(commits, list):
            total += len(commits)
    return total

# === Core Function to Fetch and Store User Data ===
def collect_and_store_user(username):
    """Gather one user's GitHub data and upsert it into ``collection``.

    The profile comes from ``get_user_details``; followers, following, starred
    repositories, subscriptions, organizations, languages and total commits
    come from the helper functions. The document is written with ``$set`` and
    ``upsert=True``, matched on ``Login``.

    Any exception raised along the way is caught and printed, so the function
    always returns ``None`` and never propagates an error to the caller.
    """
    # (stored field name, key in the profile payload), in storage order.
    profile_fields = (
        ("Name", "name"),
        ("Bio", "bio"),
        ("Public Repositories", "public_repos"),
        ("Followers Count", "followers"),
        ("Following Count", "following"),
        ("Created At", "created_at"),
        ("Updated At", "updated_at"),
        ("Avatar URL", "avatar_url"),
        ("Profile URL", "html_url"),
    )
    try:
        profile = get_user_details(username)
        record = {"Login": username}
        for label, api_field in profile_fields:
            record[label] = profile.get(api_field)
        record["Followers List"] = get_list(f"https://api.github.com/users/{username}/followers")
        record["Following List"] = get_list(f"https://api.github.com/users/{username}/following")
        record["Starred Repositories"] = get_starred_or_subs(f"https://api.github.com/users/{username}/starred")
        record["Subscriptions"] = get_starred_or_subs(f"https://api.github.com/users/{username}/subscriptions")
        record["Organizations"] = get_list(f"https://api.github.com/users/{username}/orgs")
        record["Languages"] = get_languages(username)
        record["Total Commits"] = get_total_commits(username)
        collection.update_one({"Login": username}, {"$set": record}, upsert=True)
        print(f"✅ Data stored for {username}")
    except Exception as err:
        print(f"❌ Error for {username}: {err}")

# === Batch Processing ===
def batch_process_users(start_since=0, batches=1, per_page=10):
    """Walk the GitHub user listing in pages and store every user found.

    Args:
        start_since: ``since`` value passed to the first ``get_users_batch`` call.
        batches: Maximum number of pages to process.
        per_page: Page size passed to ``get_users_batch``.

    Stops early when a page comes back empty. After each page, the ``id`` of
    its last user becomes the ``since`` value for the next page.
    """
    cursor = start_since
    for _batch in range(batches):
        page = get_users_batch(cursor, per_page)
        if not page:
            return
        progress = tqdm(page, desc="Processing batch")
        for account in progress:
            collect_and_store_user(account['login'])
        cursor = page[-1]['id']

# === Example Usage ===
# Process the first 2 batches of 25 users each (up to 50 users in total).
batch_process_users(start_since=0, batches=2, per_page=25)


Processing batch:   4%|▍         | 1/25 [00:30<12:14, 30.59s/it]

✅ Data stored for mojombo


Processing batch:   8%|▊         | 2/25 [01:08<13:24, 34.98s/it]

✅ Data stored for defunkt


Processing batch:  12%|█▏        | 3/25 [01:23<09:24, 25.65s/it]

✅ Data stored for pjhyett


Processing batch:  16%|█▌        | 4/25 [02:00<10:32, 30.11s/it]

✅ Data stored for wycats


Processing batch:  20%|██        | 5/25 [02:27<09:41, 29.09s/it]

✅ Data stored for ezmobius


Processing batch:  24%|██▍       | 6/25 [03:05<10:13, 32.29s/it]

✅ Data stored for ivey


Processing batch:  28%|██▊       | 7/25 [03:34<09:16, 30.94s/it]

✅ Data stored for evanphx


Processing batch:  32%|███▏      | 8/25 [04:02<08:34, 30.28s/it]

✅ Data stored for vanpelt


Processing batch:  36%|███▌      | 9/25 [04:33<08:03, 30.23s/it]

✅ Data stored for wayneeseguin


Processing batch:  40%|████      | 10/25 [04:56<07:00, 28.01s/it]

✅ Data stored for brynary


Processing batch:  44%|████▍     | 11/25 [05:32<07:08, 30.61s/it]

✅ Data stored for kevinclark


Processing batch:  48%|████▊     | 12/25 [06:04<06:41, 30.87s/it]

✅ Data stored for technoweenie


Processing batch:  52%|█████▏    | 13/25 [06:41<06:35, 32.93s/it]

✅ Data stored for macournoyer


Processing batch:  56%|█████▌    | 14/25 [07:02<05:21, 29.27s/it]

✅ Data stored for takeo


Processing batch:  60%|██████    | 15/25 [07:38<05:11, 31.16s/it]

✅ Data stored for caged


Processing batch:  64%|██████▍   | 16/25 [08:03<04:24, 29.35s/it]

✅ Data stored for topfunky


Processing batch:  68%|██████▊   | 17/25 [08:29<03:46, 28.37s/it]

✅ Data stored for anotherjesse


Processing batch:  72%|███████▏  | 18/25 [08:36<02:33, 21.96s/it]

✅ Data stored for roland


Processing batch:  76%|███████▌  | 19/25 [09:10<02:33, 25.57s/it]

✅ Data stored for lukas


Processing batch:  80%|████████  | 20/25 [09:14<01:36, 19.25s/it]

✅ Data stored for fanvsfan


Processing batch:  84%|████████▍ | 21/25 [09:48<01:34, 23.68s/it]

✅ Data stored for tomtt


Processing batch:  88%|████████▊ | 22/25 [09:53<00:53, 17.95s/it]

✅ Data stored for railsjitsu


Processing batch:  92%|█████████▏| 23/25 [10:03<00:31, 15.65s/it]

✅ Data stored for nitay


Processing batch:  96%|█████████▌| 24/25 [10:40<00:21, 21.86s/it]

✅ Data stored for kevwil


Processing batch: 100%|██████████| 25/25 [11:10<00:00, 26.81s/it]

✅ Data stored for KirinDave



Processing batch:   4%|▍         | 1/25 [00:33<13:24, 33.53s/it]

✅ Data stored for jamesgolick


Processing batch:   8%|▊         | 2/25 [01:09<13:24, 34.97s/it]

✅ Data stored for atmos


Processing batch:  12%|█▏        | 3/25 [01:17<08:19, 22.70s/it]

✅ Data stored for errfree


Processing batch:  16%|█▌        | 4/25 [01:52<09:34, 27.36s/it]

✅ Data stored for mojodna


Processing batch:  20%|██        | 5/25 [02:28<10:14, 30.75s/it]

✅ Data stored for bmizerany


Processing batch:  24%|██▍       | 6/25 [02:57<09:33, 30.19s/it]

✅ Data stored for jnewland


Processing batch:  28%|██▊       | 7/25 [03:14<07:42, 25.67s/it]

✅ Data stored for joshknowles


Processing batch:  32%|███▏      | 8/25 [03:24<05:50, 20.59s/it]

✅ Data stored for hornbeck


Processing batch:  36%|███▌      | 9/25 [03:47<05:41, 21.37s/it]

✅ Data stored for jwhitmire


Processing batch:  40%|████      | 10/25 [04:12<05:38, 22.56s/it]

✅ Data stored for elbowdonkey


Processing batch:  44%|████▍     | 11/25 [04:42<05:48, 24.92s/it]

✅ Data stored for reinh


Processing batch:  48%|████▊     | 12/25 [05:17<06:03, 27.99s/it]

✅ Data stored for knzai


Processing batch:  52%|█████▏    | 13/25 [05:53<06:04, 30.35s/it]

✅ Data stored for bs


Processing batch:  56%|█████▌    | 14/25 [06:30<05:57, 32.49s/it]

✅ Data stored for rsanheim


Processing batch:  60%|██████    | 15/25 [06:54<04:58, 29.86s/it]

✅ Data stored for schacon


Processing batch:  64%|██████▍   | 16/25 [07:14<04:02, 26.93s/it]

✅ Data stored for uggedal


Processing batch:  68%|██████▊   | 17/25 [07:39<03:29, 26.17s/it]

✅ Data stored for bruce


Processing batch:  72%|███████▏  | 18/25 [08:04<03:01, 25.87s/it]

✅ Data stored for sam


Processing batch:  76%|███████▌  | 19/25 [08:32<02:38, 26.45s/it]

✅ Data stored for mmower


Processing batch:  80%|████████  | 20/25 [08:53<02:03, 24.78s/it]

✅ Data stored for abhay


Processing batch:  84%|████████▍ | 21/25 [09:16<01:38, 24.50s/it]

✅ Data stored for rabble


Processing batch:  88%|████████▊ | 22/25 [09:39<01:12, 24.03s/it]

✅ Data stored for benburkert


Processing batch:  92%|█████████▏| 23/25 [10:08<00:50, 25.36s/it]

✅ Data stored for indirect


Processing batch:  96%|█████████▌| 24/25 [10:41<00:27, 27.86s/it]

✅ Data stored for fearoffish


Processing batch: 100%|██████████| 25/25 [11:16<00:00, 27.04s/it]

✅ Data stored for ry



