## 1. Imports

In [1]:
###
import requests
import time
import pandas as pd
!pip install python-dotenv



## 2.  Configuration and Token

In [2]:

import requests
import time
import pandas as pd
import os
from dotenv import load_dotenv
from typing import List, Dict


load_dotenv()

token = os.getenv("GITHUB_TOKEN")
if not token:
    raise EnvironmentError("❌ GitHub token not found. Please set GITHUB_TOKEN in your environment.")
else:
    print(f"✅ GITHUB_TOKEN loaded: {token[:10]}...")

USERNAME = "OSMFHtech"
START_YEAR = 2008
END_YEAR = 2024

HEADERS = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json"
}

def test_github_connection():
    try:
        response = requests.get("https://api.github.com/rate_limit", headers=HEADERS)
        if response.status_code == 200:
            rate_limit_info = response.json()
            remaining = rate_limit_info['rate']['remaining']
            print(f"✅ GitHub API connection successful! Remaining API calls: {remaining}")
            return True
        else:
            print(f"❌ GitHub API connection failed with status code: {response.status_code}")
            return False
    except Exception as e:
        print(f"❌ Error testing GitHub connection: {str(e)}")
        return False

test_github_connection()


✅ GITHUB_TOKEN loaded: ghp_O0vr0f...
✅ GitHub API connection successful! Remaining API calls: 4987


True

## 3. Repository Fetching with Pagination

In [3]:
def get_repositories(user):
    repos = []
    page = 1
    per_page = 100
    while True:
        url = f"https://api.github.com/users/{user}/repos?per_page={per_page}&page={page}&sort=created"
        response = requests.get(url, headers=HEADERS)

        if response.status_code != 200:
            print(f"Error fetching repos: {response.status_code} - {response.text}")
            break

        data = response.json()
        if not data:
            print("No more repos found, ending pagination.")
            break

        for repo in data:
            created_year = int(repo["created_at"][:4])
            if START_YEAR <= created_year <= END_YEAR:
                repos.append({
                    "name": repo["name"],
                    "created_at": repo["created_at"],
                    "languages_url": repo["languages_url"]
                })

        print(f"Fetched page {page}, total repos collected: {len(repos)}")
        page += 1
        time.sleep(1)  # To avoid GitHub API rate limits

    return repos



## 4. Get Languages for Each Repository

In [4]:
def get_languages(languages_url):
    print(f"🌐 Fetching languages from: {languages_url}")
    response = requests.get(languages_url, headers=HEADERS)
    if response.status_code == 200:
        return list(response.json().keys())
    else:
        print(f"⚠️  Failed to get languages: {response.status_code} - {response.text}")
        return []


## 5. Full Data Extraction Function

In [5]:
def fetch_repo_data(user):
    repos = get_repositories(user)
    for repo in repos:
        repo["languages"] = get_languages(repo["languages_url"]) or []
    return repos


## 6. Run, Create DataFrame, and Save

In [6]:
# Fetch and save data
try:
    print(f"🔄 Fetching repository data for user: {USERNAME}")
    repos_info = fetch_repo_data(USERNAME)

    if not repos_info:
        raise ValueError(f"❌ No repository data returned for '{USERNAME}'. Please check:\n"
                        f"1. Username is correct\n"
                        f"2. GitHub token is valid\n"
                        f"3. User has public repositories")

    # Create DataFrame and validate
    df = pd.DataFrame(repos_info)

    if df.empty:
        raise ValueError("❌ DataFrame is empty after conversion")

    if "languages" not in df.columns:
        raise KeyError("❌ 'languages' column is missing. API response format may have changed")

    # Process languages
    df["languages"] = df["languages"].apply(lambda x: ", ".join(x) if isinstance(x, list) else str(x))
    df = df[["name", "created_at", "languages"]]

    # Ensure output directory exists
    import os
    output_dir = os.path.join(os.path.dirname(os.getcwd()), "data")
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "github_repos.csv")
    df.to_csv(output_path, index=False)

    print(f"✅ Successfully saved {len(df)} repositories to {output_path}")
    print("\n📊 Preview of the data:")
    display(df.head())

    # Print summary statistics
    print(f"\n📈 Summary:")
    print(f"Total repositories: {len(df)}")
    print(f"Date range: {df['created_at'].min()} to {df['created_at'].max()}")
    print(f"Unique languages: {len(set(','.join(df['languages']).split(',')))}")

except ValueError as ve:
    print(str(ve))
except KeyError as ke:
    print(str(ke))
except Exception as e:
    print(f"❌ Unexpected error: {str(e)}")
    if 'df' in locals():
        print("\nAvailable columns:", df.columns.tolist())

🔄 Fetching repository data for user: OSMFHtech
Fetched page 1, total repos collected: 7
No more repos found, ending pagination.
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/SWARC-Introduction/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/Maintenance-Monitor/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/StRiNgMoDiFy2/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/Calculator/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/SLMExam/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/Tutorium/languages
🌐 Fetching languages from: https://api.github.com/repos/OSMFHtech/Projekt-Calc-API/languages
✅ Successfully saved 7 repositories to C:\Users\OSM_LOQ\Desktop\FH Technikum\4.semester\Big Data Engineering\BigDataEngineeringProject\data\github_repos.csv

📊 Preview of the data:


Unnamed: 0,name,created_at,languages
0,SWARC-Introduction,2023-10-17T13:47:32Z,
1,Maintenance-Monitor,2023-07-01T08:23:13Z,Java
2,StRiNgMoDiFy2,2023-06-28T17:58:29Z,Java
3,Calculator,2023-06-28T13:37:42Z,Java
4,SLMExam,2023-06-28T10:31:36Z,Java



📈 Summary:
Total repositories: 7
Date range: 2023-05-31T17:14:46Z to 2023-10-17T13:47:32Z
Unique languages: 2


###  Summary

- This notebook uses the GitHub REST API to extract repository metadata.
- Output is saved to `../data/github_repos.csv`.
- This CSV can now be used by other components in our pipeline (e.g., Kafka producer).

Next steps:
- Push this notebook and output CSV to the shared GitHub repo.
- Use this file as the "REST API data source" for the project specification.
import requests
import os
from typing import List, Dict

def fetch_repo_data(username: str) -> List[Dict]:
    """
    Fetch repository data for a given GitHub username with improved error handling
    """
    github_token = os.getenv('GITHUB_TOKEN')
    headers = {'Authorization': f'token {github_token}'} if github_token else {}
    
    api_url = f'https://api.github.com/users/{username}/repos'
    
    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # Raise exception for HTTP errors
        
        if response.status_code == 200:
            repos = response.json()
            if not repos:
                print(f"No repositories found for user: {username}")
                return []
                
            repos_info = []
            for repo in repos:
                repo_info = {
                    'name': repo['name'],
                    'stars': repo['stargazers_count'],
                    'forks': repo['forks_count'],
                    'language': repo['language'],
                    'created_at': repo['created_at']
                }
                repos_info.append(repo_info)
            return repos_info
            
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {str(e)}")
        return []
    except ValueError as e:
        print(f"Error parsing JSON response: {str(e)}")
        return []
    
    return []

# Usage
USERNAME = "your_github_username"  # Replace with actual username
repos_info = fetch_repo_data(USERNAME)

if repos_info:
    import pandas as pd
    df = pd.DataFrame(repos_info)
    print(f"Found {len(repos_info)} repositories")
    print(df.head())
else:
    print("No repository data found. Please check:")
    print("1. GitHub username is correct")
    print("2. GitHub token is properly set")
    print("3. Network connectivity")