# Data Extraction

In [5]:
import os
import requests
import time
from pymongo import MongoClient
from tqdm import tqdm
from dotenv import load_dotenv

# Load environment variables.
# The .env file location is machine-specific, so it can be overridden by
# setting DOTENV_PATH in the process environment; when that is unset or
# empty, the original hard-coded path is used as the default.
_DEFAULT_DOTENV_PATH = r"C:\Users\madhu\OneDrive\Desktop\Project - Git Hub Recommendation\Git-Hub-Recommendation\password.env"
load_dotenv(dotenv_path=os.getenv("DOTENV_PATH") or _DEFAULT_DOTENV_PATH)
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")
MONGO_URI = os.getenv("MONGO_URI")

# Fail fast: both settings are required by the code below (HEADERS embeds the
# token and MongoClient is built from the URI).
if not GITHUB_TOKEN:
    raise Exception("❌ GitHub API key is missing!")
print("✅ GitHub API key loaded successfully. Ready to access GitHub API.")

if not MONGO_URI:
    raise Exception("❌ MongoDB URI is missing!")
print("✅ MongoDB URI loaded successfully. Ready to access MongoDB.")

# Headers sent with every GitHub API request made through this module.
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# MongoDB setup: one client, one database, one collection that receives the
# per-user documents written by collect_and_store_user().
client = MongoClient(MONGO_URI)
db = client["github_recommendation_db"]
collection = db["Users__data"]

# Helper functions
def _rate_limit_wait(headers, default=60):
    """Return how many seconds to sleep before retrying a rate-limited call.

    Preference order: the ``Retry-After`` header, then the time remaining
    until ``X-RateLimit-Reset`` when ``X-RateLimit-Remaining`` is ``"0"``,
    then ``default``. Malformed or missing header values fall through to
    the next option. The result is never less than one second.
    """
    retry_after = headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(int(retry_after), 1)
        except ValueError:
            pass  # not an integer number of seconds; try the next hint
    if headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_epoch = int(headers["X-RateLimit-Reset"])
        except (KeyError, ValueError):
            pass  # no usable reset time; use the default
        else:
            # +1 so the retry lands just after the reset instant.
            return max(reset_epoch - int(time.time()) + 1, 1)
    return default


def get_json(url, max_retries=3, timeout=30):
    """GET ``url`` with the module's GitHub headers and return the parsed JSON.

    A 403 or 429 response is treated as a rate-limit signal: the function
    sleeps for the period suggested by the response headers (60 seconds when
    the headers give no hint) and tries again, up to ``max_retries`` times.

    Args:
        url: Absolute URL to request.
        max_retries: Maximum number of retries after a 403/429 response.
        timeout: Per-request timeout in seconds passed to ``requests.get``,
            so a stalled connection cannot hang the whole run.

    Returns:
        The decoded JSON body of the final response.

    Raises:
        requests.HTTPError: If the final response has an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    for attempt in range(max_retries + 1):
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code not in (403, 429) or attempt == max_retries:
            break
        wait = _rate_limit_wait(response.headers)
        print(f"⚠️ Rate limit hit. Waiting for {wait} seconds...")
        time.sleep(wait)
    response.raise_for_status()
    return response.json()

def get_users_batch(since=0, per_page=100):
    """Fetch one page of the GitHub ``/users`` listing.

    ``since`` and ``per_page`` are forwarded verbatim as the query-string
    parameters of the same names; the decoded JSON from ``get_json`` is
    returned unchanged.
    """
    query = f"since={since}&per_page={per_page}"
    return get_json("https://api.github.com/users?" + query)

def get_user_details(username):
    """Return the decoded JSON served at ``/users/<username>``."""
    profile_url = "https://api.github.com/users/{}".format(username)
    return get_json(profile_url)

def get_list(url, key='login'):
    """Collect ``item[key]`` from every item of the JSON array at ``url``.

    A single ``get_json`` call is made, so only what that one response
    contains is read. A missing ``key`` on any item raises ``KeyError``.
    """
    values = []
    for entry in get_json(url):
        values.append(entry[key])
    return values

def get_starred_or_subs(url):
    """Return the ``html_url`` of every item in the JSON array at ``url``.

    Used by this module for the ``/starred`` and ``/subscriptions``
    endpoints. Only the one response fetched by ``get_json`` is read.
    """
    payload = get_json(url)
    html_urls = []
    for entry in payload:
        html_urls.append(entry['html_url'])
    return html_urls

def get_languages(username):
    """Sum the per-language figures over the user's listed repositories.

    Fetches ``/users/<username>/repos`` once, then each repository's
    ``languages_url``, and adds the numeric value reported for every
    language into a single ``{language: total}`` dict.
    """
    totals = {}
    repo_listing = get_json(f"https://api.github.com/users/{username}/repos")
    for repository in repo_listing:
        breakdown = get_json(repository['languages_url'])
        for language in breakdown:
            amount = breakdown[language]
            if language in totals:
                totals[language] = totals[language] + amount
            else:
                totals[language] = 0 + amount
    return totals

def get_total_commits(username):
    """Count the commits in the user's listed, non-fork repositories.

    For each repository the commits endpoint is queried with ``per_page=1``.
    With one commit per page, the page number in the response's
    ``rel="last"`` Link header equals the number of commits, so the count is
    obtained with one request per repository. (Previously the length of that
    one-item page was summed, which capped every repository at 1.)

    Only the repositories returned by the single ``/repos`` request are
    considered. Repositories whose commits request returns an error status
    or a non-list payload contribute nothing, as before.

    Args:
        username: GitHub login whose repositories are inspected.

    Returns:
        Total number of commits across the inspected repositories.
    """
    # Local import keeps this block self-contained (stdlib only).
    from urllib.parse import parse_qs, urlparse

    repos = get_json(f"https://api.github.com/users/{username}/repos")
    total = 0
    for repo in repos:
        if repo.get('fork'):
            continue
        url = repo['commits_url'].replace("{/sha}", "")
        response = requests.get(
            url, headers=HEADERS, params={'per_page': 1}, timeout=30
        )
        if not response.ok:
            # Best-effort: an error response has no commit list to count.
            continue
        commits = response.json()
        if not isinstance(commits, list):
            continue
        last_link = response.links.get("last")
        if last_link:
            page_values = parse_qs(urlparse(last_link["url"]).query).get("page")
            if page_values and page_values[0].isdigit():
                total += int(page_values[0])
                continue
        # No "last" link means everything fit on this page (0 or 1 commits).
        total += len(commits)
    return total

def collect_and_store_user(username):
    """Gather one user's GitHub data and upsert it into ``collection``.

    The profile is fetched first, then the follower/following lists, starred
    repositories, subscriptions, organizations, language totals and commit
    total, in that order. The resulting document is written with ``$set`` and
    ``upsert=True``, keyed on ``Login``.

    Any exception raised along the way is caught and reported with a printed
    message; nothing is re-raised and nothing is returned.
    """
    try:
        print(f"🔍 Collecting data for user: {username}")
        api_root = f"https://api.github.com/users/{username}"
        profile = get_user_details(username)

        # Scalar profile fields: (stored field name, key in the API payload).
        record = {"Login": username}
        profile_fields = (
            ("Name", "name"),
            ("Bio", "bio"),
            ("Public Repositories", "public_repos"),
            ("Followers Count", "followers"),
            ("Following Count", "following"),
            ("Created At", "created_at"),
            ("Updated At", "updated_at"),
            ("Avatar URL", "avatar_url"),
            ("Profile URL", "html_url"),
        )
        for stored_name, api_key in profile_fields:
            record[stored_name] = profile.get(api_key)

        # Derived fields, each backed by its own API request(s).
        record["Followers List"] = get_list(f"{api_root}/followers")
        record["Following List"] = get_list(f"{api_root}/following")
        record["Starred Repositories"] = get_starred_or_subs(f"{api_root}/starred")
        record["Subscriptions"] = get_starred_or_subs(f"{api_root}/subscriptions")
        record["Organizations"] = get_list(f"{api_root}/orgs")
        record["Languages"] = get_languages(username)
        record["Total Commits"] = get_total_commits(username)

        selector = {"Login": username}
        collection.update_one(selector, {"$set": record}, upsert=True)
        print(f"✅ Data stored/updated for user: {username}")
    except Exception as err:
        print(f"❌ Error collecting data for {username}: {err}")

def batch_process_users(start_since=0, batches=1, per_page=10):
    """Process up to ``batches`` consecutive pages of the user listing.

    Each page is fetched with ``get_users_batch`` and every listed login is
    passed to ``collect_and_store_user`` under a tqdm progress bar. The ``id``
    of the last user on a page becomes the ``since`` value for the next page.
    Processing stops early when a page comes back empty.
    """
    cursor = start_since
    for _batch_number in range(batches):
        page = get_users_batch(cursor, per_page)
        if not page:
            print("⚠️ No more users found. Ending batch processing.")
            return
        for entry in tqdm(page, desc="Processing batch"):
            collect_and_store_user(entry['login'])
        last_entry = page[-1]
        cursor = last_entry['id']

def fetch_user_if_not_exists(username):
    """Collect and store ``username`` unless a document for it already exists.

    Existence is checked with ``collection.find_one`` on the ``Login`` field.
    """
    existing = collection.find_one({"Login": username})
    if not existing:
        print(f"🔍 Fetching data for missing user: {username}")
        collect_and_store_user(username)
        return
    print(f"✅ User {username} already exists in DB.")

# === Entry Point ===
if __name__ == "__main__":
    # Script entry: run the collection only when executed directly.
    print("🚀 Starting GitHub user data collection...")
    # Two pages of 50 users each, starting from the beginning of the listing.
    batch_process_users(start_since=0, batches=2, per_page=50)
    print("🎉 Data collection complete!")



✅ GitHub API key loaded successfully. Ready to access GitHub API.
✅ MongoDB URI loaded successfully. Ready to access MongoDB.
🚀 Starting GitHub user data collection...


Processing batch:   0%|          | 0/50 [00:00<?, ?it/s]

🔍 Collecting data for user: mojombo


Processing batch:   2%|▏         | 1/50 [00:32<26:24, 32.34s/it]

✅ Data stored/updated for user: mojombo
🔍 Collecting data for user: defunkt


Processing batch:   4%|▍         | 2/50 [01:02<24:49, 31.03s/it]

✅ Data stored/updated for user: defunkt
🔍 Collecting data for user: pjhyett


Processing batch:   6%|▌         | 3/50 [01:18<18:53, 24.11s/it]

✅ Data stored/updated for user: pjhyett
🔍 Collecting data for user: wycats


Processing batch:   8%|▊         | 4/50 [01:57<23:05, 30.12s/it]

✅ Data stored/updated for user: wycats
🔍 Collecting data for user: ezmobius


Processing batch:  10%|█         | 5/50 [02:27<22:35, 30.11s/it]

✅ Data stored/updated for user: ezmobius
🔍 Collecting data for user: ivey


Processing batch:  12%|█▏        | 6/50 [03:11<25:32, 34.84s/it]

✅ Data stored/updated for user: ivey
🔍 Collecting data for user: evanphx


Processing batch:  14%|█▍        | 7/50 [03:44<24:22, 34.00s/it]

✅ Data stored/updated for user: evanphx
🔍 Collecting data for user: vanpelt


Processing batch:  16%|█▌        | 8/50 [04:14<22:58, 32.83s/it]

✅ Data stored/updated for user: vanpelt
🔍 Collecting data for user: wayneeseguin


Processing batch:  18%|█▊        | 9/50 [04:43<21:43, 31.79s/it]

✅ Data stored/updated for user: wayneeseguin
🔍 Collecting data for user: brynary


Processing batch:  20%|██        | 10/50 [05:04<18:49, 28.23s/it]

✅ Data stored/updated for user: brynary
🔍 Collecting data for user: kevinclark


Processing batch:  22%|██▏       | 11/50 [05:33<18:37, 28.66s/it]

✅ Data stored/updated for user: kevinclark
🔍 Collecting data for user: technoweenie


Processing batch:  24%|██▍       | 12/50 [06:07<19:11, 30.30s/it]

✅ Data stored/updated for user: technoweenie
🔍 Collecting data for user: macournoyer


Processing batch:  26%|██▌       | 13/50 [06:39<18:52, 30.62s/it]

✅ Data stored/updated for user: macournoyer
🔍 Collecting data for user: takeo


Processing batch:  28%|██▊       | 14/50 [06:58<16:14, 27.06s/it]

✅ Data stored/updated for user: takeo
🔍 Collecting data for user: caged


Processing batch:  30%|███       | 15/50 [07:40<18:34, 31.85s/it]

✅ Data stored/updated for user: caged
🔍 Collecting data for user: topfunky


Processing batch:  32%|███▏      | 16/50 [08:21<19:31, 34.45s/it]

✅ Data stored/updated for user: topfunky
🔍 Collecting data for user: anotherjesse


Processing batch:  34%|███▍      | 17/50 [09:00<19:45, 35.92s/it]

✅ Data stored/updated for user: anotherjesse
🔍 Collecting data for user: roland


Processing batch:  36%|███▌      | 18/50 [09:09<14:45, 27.68s/it]

✅ Data stored/updated for user: roland
🔍 Collecting data for user: lukas


Processing batch:  38%|███▊      | 19/50 [09:47<15:59, 30.95s/it]

✅ Data stored/updated for user: lukas
🔍 Collecting data for user: fanvsfan


Processing batch:  40%|████      | 20/50 [09:53<11:36, 23.21s/it]

✅ Data stored/updated for user: fanvsfan
🔍 Collecting data for user: tomtt


Processing batch:  42%|████▏     | 21/50 [10:31<13:27, 27.86s/it]

✅ Data stored/updated for user: tomtt
🔍 Collecting data for user: railsjitsu


Processing batch:  44%|████▍     | 22/50 [10:36<09:47, 20.97s/it]

✅ Data stored/updated for user: railsjitsu
🔍 Collecting data for user: nitay


Processing batch:  46%|████▌     | 23/50 [10:47<08:05, 18.00s/it]

✅ Data stored/updated for user: nitay
🔍 Collecting data for user: kevwil


Processing batch:  48%|████▊     | 24/50 [11:27<10:34, 24.41s/it]

✅ Data stored/updated for user: kevwil
🔍 Collecting data for user: KirinDave


Processing batch:  50%|█████     | 25/50 [11:58<11:04, 26.59s/it]

✅ Data stored/updated for user: KirinDave
🔍 Collecting data for user: jamesgolick


Processing batch:  52%|█████▏    | 26/50 [12:27<10:51, 27.13s/it]

✅ Data stored/updated for user: jamesgolick
🔍 Collecting data for user: atmos


Processing batch:  54%|█████▍    | 27/50 [12:59<10:57, 28.60s/it]

✅ Data stored/updated for user: atmos
🔍 Collecting data for user: errfree


Processing batch:  56%|█████▌    | 28/50 [13:06<08:11, 22.36s/it]

✅ Data stored/updated for user: errfree
🔍 Collecting data for user: mojodna


Processing batch:  58%|█████▊    | 29/50 [13:38<08:46, 25.05s/it]

✅ Data stored/updated for user: mojodna
🔍 Collecting data for user: bmizerany


Processing batch:  60%|██████    | 30/50 [14:12<09:14, 27.70s/it]

✅ Data stored/updated for user: bmizerany
🔍 Collecting data for user: jnewland


Processing batch:  62%|██████▏   | 31/50 [14:47<09:30, 30.04s/it]

✅ Data stored/updated for user: jnewland
🔍 Collecting data for user: joshknowles


Processing batch:  64%|██████▍   | 32/50 [15:09<08:13, 27.43s/it]

✅ Data stored/updated for user: joshknowles
🔍 Collecting data for user: hornbeck


Processing batch:  66%|██████▌   | 33/50 [15:25<06:47, 23.99s/it]

✅ Data stored/updated for user: hornbeck
🔍 Collecting data for user: jwhitmire


Processing batch:  68%|██████▊   | 34/50 [15:55<06:57, 26.08s/it]

✅ Data stored/updated for user: jwhitmire
🔍 Collecting data for user: elbowdonkey


Processing batch:  70%|███████   | 35/50 [16:30<07:08, 28.54s/it]

✅ Data stored/updated for user: elbowdonkey
🔍 Collecting data for user: reinh


Processing batch:  72%|███████▏  | 36/50 [17:06<07:10, 30.76s/it]

✅ Data stored/updated for user: reinh
🔍 Collecting data for user: knzai


Processing batch:  74%|███████▍  | 37/50 [17:37<06:43, 31.04s/it]

✅ Data stored/updated for user: knzai
🔍 Collecting data for user: bs


Processing batch:  76%|███████▌  | 38/50 [18:11<06:23, 31.94s/it]

✅ Data stored/updated for user: bs
🔍 Collecting data for user: rsanheim


Processing batch:  78%|███████▊  | 39/50 [18:48<06:07, 33.40s/it]

✅ Data stored/updated for user: rsanheim
🔍 Collecting data for user: schacon


Processing batch:  80%|████████  | 40/50 [19:20<05:30, 33.02s/it]

✅ Data stored/updated for user: schacon
🔍 Collecting data for user: uggedal


Processing batch:  82%|████████▏ | 41/50 [19:45<04:34, 30.45s/it]

✅ Data stored/updated for user: uggedal
🔍 Collecting data for user: bruce


Processing batch:  84%|████████▍ | 42/50 [20:18<04:10, 31.30s/it]

✅ Data stored/updated for user: bruce
🔍 Collecting data for user: sam


Processing batch:  86%|████████▌ | 43/50 [20:57<03:54, 33.46s/it]

✅ Data stored/updated for user: sam
🔍 Collecting data for user: mmower


Processing batch:  88%|████████▊ | 44/50 [21:39<03:37, 36.23s/it]

✅ Data stored/updated for user: mmower
🔍 Collecting data for user: abhay


Processing batch:  90%|█████████ | 45/50 [22:10<02:53, 34.61s/it]

✅ Data stored/updated for user: abhay
🔍 Collecting data for user: rabble


Processing batch:  92%|█████████▏| 46/50 [22:46<02:20, 35.11s/it]

✅ Data stored/updated for user: rabble
🔍 Collecting data for user: benburkert


Processing batch:  94%|█████████▍| 47/50 [23:20<01:43, 34.55s/it]

✅ Data stored/updated for user: benburkert
🔍 Collecting data for user: indirect


Processing batch:  96%|█████████▌| 48/50 [23:48<01:05, 32.73s/it]

✅ Data stored/updated for user: indirect
🔍 Collecting data for user: fearoffish


Processing batch:  98%|█████████▊| 49/50 [24:19<00:32, 32.29s/it]

✅ Data stored/updated for user: fearoffish
🔍 Collecting data for user: ry


Processing batch: 100%|██████████| 50/50 [24:46<00:00, 29.73s/it]

✅ Data stored/updated for user: ry



Processing batch:   0%|          | 0/50 [00:00<?, ?it/s]

🔍 Collecting data for user: engineyard


Processing batch:   2%|▏         | 1/50 [00:26<21:51, 26.76s/it]

✅ Data stored/updated for user: engineyard
🔍 Collecting data for user: jsierles


Processing batch:   4%|▍         | 2/50 [00:51<20:36, 25.75s/it]

✅ Data stored/updated for user: jsierles
🔍 Collecting data for user: tweibley


Processing batch:   6%|▌         | 3/50 [01:06<16:09, 20.63s/it]

✅ Data stored/updated for user: tweibley
🔍 Collecting data for user: peimei


Processing batch:   8%|▊         | 4/50 [01:17<13:02, 17.01s/it]

✅ Data stored/updated for user: peimei
🔍 Collecting data for user: brixen


Processing batch:  10%|█         | 5/50 [01:46<15:50, 21.13s/it]

✅ Data stored/updated for user: brixen
🔍 Collecting data for user: tmornini


Processing batch:  12%|█▏        | 6/50 [02:10<16:11, 22.08s/it]

✅ Data stored/updated for user: tmornini
🔍 Collecting data for user: outerim


Processing batch:  14%|█▍        | 7/50 [02:40<17:42, 24.70s/it]

✅ Data stored/updated for user: outerim
🔍 Collecting data for user: daksis


Processing batch:  16%|█▌        | 8/50 [02:52<14:29, 20.71s/it]

✅ Data stored/updated for user: daksis
🔍 Collecting data for user: sr


Processing batch:  18%|█▊        | 9/50 [03:30<17:45, 26.00s/it]

✅ Data stored/updated for user: sr
🔍 Collecting data for user: lifo


Processing batch:  20%|██        | 10/50 [03:53<16:44, 25.11s/it]

✅ Data stored/updated for user: lifo
🔍 Collecting data for user: rsl


Processing batch:  22%|██▏       | 11/50 [04:26<17:59, 27.67s/it]

✅ Data stored/updated for user: rsl
🔍 Collecting data for user: imownbey


Processing batch:  24%|██▍       | 12/50 [04:59<18:26, 29.11s/it]

✅ Data stored/updated for user: imownbey
🔍 Collecting data for user: dylanegan


Processing batch:  26%|██▌       | 13/50 [05:30<18:20, 29.74s/it]

✅ Data stored/updated for user: dylanegan
🔍 Collecting data for user: jm


Processing batch:  28%|██▊       | 14/50 [06:06<18:56, 31.58s/it]

✅ Data stored/updated for user: jm
🔍 Collecting data for user: kmarsh


Processing batch:  30%|███       | 15/50 [06:42<19:14, 32.99s/it]

✅ Data stored/updated for user: kmarsh
🔍 Collecting data for user: jvantuyl


Processing batch:  32%|███▏      | 16/50 [07:16<18:55, 33.39s/it]

✅ Data stored/updated for user: jvantuyl
🔍 Collecting data for user: BrianTheCoder


Processing batch:  34%|███▍      | 17/50 [07:49<18:14, 33.17s/it]

✅ Data stored/updated for user: BrianTheCoder
🔍 Collecting data for user: freeformz


Processing batch:  36%|███▌      | 18/50 [08:28<18:40, 35.03s/it]

✅ Data stored/updated for user: freeformz
🔍 Collecting data for user: hassox


Processing batch:  38%|███▊      | 19/50 [09:03<18:01, 34.90s/it]

✅ Data stored/updated for user: hassox
🔍 Collecting data for user: automatthew


Processing batch:  40%|████      | 20/50 [09:34<16:50, 33.68s/it]

✅ Data stored/updated for user: automatthew
🔍 Collecting data for user: queso


Processing batch:  42%|████▏     | 21/50 [09:57<14:48, 30.63s/it]

✅ Data stored/updated for user: queso
🔍 Collecting data for user: lancecarlson


Processing batch:  44%|████▍     | 22/50 [10:24<13:44, 29.44s/it]

✅ Data stored/updated for user: lancecarlson
🔍 Collecting data for user: drnic


Processing batch:  46%|████▌     | 23/50 [10:50<12:47, 28.44s/it]

✅ Data stored/updated for user: drnic
🔍 Collecting data for user: lukesutton


Processing batch:  48%|████▊     | 24/50 [11:09<11:09, 25.77s/it]

✅ Data stored/updated for user: lukesutton
🔍 Collecting data for user: danwrong


Processing batch:  50%|█████     | 25/50 [11:39<11:11, 26.85s/it]

✅ Data stored/updated for user: danwrong
🔍 Collecting data for user: HamptonMakes


Processing batch:  52%|█████▏    | 26/50 [12:06<10:45, 26.90s/it]

✅ Data stored/updated for user: HamptonMakes
🔍 Collecting data for user: jfrost


Processing batch:  54%|█████▍    | 27/50 [12:18<08:34, 22.37s/it]

✅ Data stored/updated for user: jfrost
🔍 Collecting data for user: mattetti


Processing batch:  56%|█████▌    | 28/50 [12:48<09:01, 24.63s/it]

✅ Data stored/updated for user: mattetti
🔍 Collecting data for user: ctennis


Processing batch:  58%|█████▊    | 29/50 [12:58<07:11, 20.53s/it]

✅ Data stored/updated for user: ctennis
🔍 Collecting data for user: lawrencepit


Processing batch:  60%|██████    | 30/50 [13:23<07:12, 21.62s/it]

✅ Data stored/updated for user: lawrencepit
🔍 Collecting data for user: marcjeanson


Processing batch:  62%|██████▏   | 31/50 [13:56<08:00, 25.27s/it]

✅ Data stored/updated for user: marcjeanson
🔍 Collecting data for user: grempe


Processing batch:  64%|██████▍   | 32/50 [14:33<08:37, 28.74s/it]

✅ Data stored/updated for user: grempe
🔍 Collecting data for user: peterc


Processing batch:  66%|██████▌   | 33/50 [15:10<08:51, 31.25s/it]

✅ Data stored/updated for user: peterc
🔍 Collecting data for user: ministrycentered


Processing batch:  68%|██████▊   | 34/50 [15:31<07:30, 28.14s/it]

✅ Data stored/updated for user: ministrycentered
🔍 Collecting data for user: afarnham


Processing batch:  70%|███████   | 35/50 [16:00<07:04, 28.28s/it]

✅ Data stored/updated for user: afarnham
🔍 Collecting data for user: up_the_irons


Processing batch:  72%|███████▏  | 36/50 [16:32<06:53, 29.52s/it]

✅ Data stored/updated for user: up_the_irons
🔍 Collecting data for user: cristibalan


Processing batch:  74%|███████▍  | 37/50 [17:05<06:34, 30.36s/it]

✅ Data stored/updated for user: cristibalan
🔍 Collecting data for user: heavysixer


Processing batch:  76%|███████▌  | 38/50 [17:44<06:37, 33.13s/it]

✅ Data stored/updated for user: heavysixer
🔍 Collecting data for user: brosner


Processing batch:  78%|███████▊  | 39/50 [18:22<06:20, 34.59s/it]

✅ Data stored/updated for user: brosner
🔍 Collecting data for user: danielmorrison


Processing batch:  80%|████████  | 40/50 [18:47<05:15, 31.55s/it]

✅ Data stored/updated for user: danielmorrison
🔍 Collecting data for user: danielharan


Processing batch:  82%|████████▏ | 41/50 [19:20<04:48, 32.08s/it]

✅ Data stored/updated for user: danielharan
🔍 Collecting data for user: kvnsmth


Processing batch:  84%|████████▍ | 42/50 [19:44<03:57, 29.74s/it]

✅ Data stored/updated for user: kvnsmth
🔍 Collecting data for user: collectiveidea


Processing batch:  86%|████████▌ | 43/50 [20:17<03:35, 30.79s/it]

✅ Data stored/updated for user: collectiveidea
🔍 Collecting data for user: canadaduane


Processing batch:  88%|████████▊ | 44/50 [20:52<03:11, 31.91s/it]

✅ Data stored/updated for user: canadaduane
🔍 Collecting data for user: corasaurus-hex


Processing batch:  90%|█████████ | 45/50 [21:25<02:40, 32.20s/it]

✅ Data stored/updated for user: corasaurus-hex
🔍 Collecting data for user: dstrelau


Processing batch:  92%|█████████▏| 46/50 [21:52<02:02, 30.63s/it]

✅ Data stored/updated for user: dstrelau
🔍 Collecting data for user: sunny


Processing batch:  94%|█████████▍| 47/50 [22:19<01:28, 29.49s/it]

✅ Data stored/updated for user: sunny
🔍 Collecting data for user: dkubb


Processing batch:  96%|█████████▌| 48/50 [22:50<01:00, 30.03s/it]

✅ Data stored/updated for user: dkubb
🔍 Collecting data for user: jnicklas


Processing batch:  98%|█████████▊| 49/50 [23:20<00:30, 30.05s/it]

✅ Data stored/updated for user: jnicklas
🔍 Collecting data for user: richcollins


Processing batch: 100%|██████████| 50/50 [23:47<00:00, 28.54s/it]

✅ Data stored/updated for user: richcollins
🎉 Data collection complete!



