Code for users.csv

In [None]:
import requests
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry

import os

# GitHub API endpoint and credentials
GITHUB_API_URL = "https://api.github.com"

# Read the token from the GITHUB_TOKEN environment variable instead of pasting
# it into the notebook, so the secret is never saved or shared with the file.
# In a notebook it can be set beforehand with `%env GITHUB_TOKEN=<token>`.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()

HEADERS = {
    "Accept": "application/vnd.github.v3+json"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"
else:
    # Without a token no Authorization header is sent at all (sending a
    # placeholder value would make every request fail authentication).
    print("Warning: GITHUB_TOKEN is not set; requests will be sent unauthenticated.")

# Session with retry and backoff: up to 5 retries, with exponential backoff,
# for responses whose status is one of the codes listed in status_forcelist.
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
# The retry adapter is mounted for https:// URLs only.
session.mount("https://", adapter)
session.headers.update(HEADERS)

def fetch_sydney_users(min_followers=100, max_pages=10):
    """Fetch GitHub users in Sydney with more than the specified number of followers.

    Pages through the user search endpoint (100 results per page) and, for
    every hit, fetches the full profile via ``fetch_user_details``.

    Args:
        min_followers: Only users with strictly more followers than this are
            searched for (query qualifier ``followers:>N``).
        max_pages: Maximum number of result pages to request. The default of
            10 pages x 100 results matches the 1,000-result ceiling that the
            GitHub search API documents per query.

    Returns:
        A list of user-detail dicts (possibly empty). Paging stops early on a
        non-200 search response, an undecodable body, a failed search request,
        or an empty/short page; whatever was collected so far is returned.
    """
    per_page = 100
    users = []

    for page in range(1, max_pages + 1):
        if page > 1:
            # Pause between consecutive search requests.
            time.sleep(1)

        try:
            response = session.get(
                f"{GITHUB_API_URL}/search/users",
                # Let requests build and escape the query string.
                params={
                    "q": f"location:Sydney followers:>{min_followers}",
                    "page": page,
                    "per_page": per_page,
                },
                timeout=10,
            )
        except RequestException as e:
            print(f"Request failed: {e}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        try:
            # A missing or null "items" key is treated as an empty page.
            items = response.json().get("items") or []
        except (ValueError, AttributeError) as e:
            print(f"Request failed: {e}")
            break

        if not items:
            break

        for user in items:
            try:
                user_details = fetch_user_details(user["login"], user["url"])
            except (RequestException, ValueError) as e:
                # One unreachable profile must not discard the rest of the crawl.
                print(f"Request failed for {user['login']}: {e}")
                continue
            if user_details:
                users.append(user_details)

        if len(items) < per_page:
            # A short page is the last one; skip the extra empty-page request.
            break

    return users

def fetch_user_details(login, user_detail_url):
    """Fetch detailed information for a specific user.

    Args:
        login: The user's login; used only in the failure message.
        user_detail_url: URL of the user's profile resource, requested through
            the shared ``session`` with a 10-second timeout.

    Returns:
        A dict with the keys ``login``, ``name``, ``company``, ``location``,
        ``email``, ``hireable``, ``bio``, ``public_repos``, ``followers``,
        ``following`` and ``created_at``, or ``None`` when the response status
        is not 200 (a message is printed in that case).

    Raises:
        Exceptions raised by ``session.get`` are not caught here and propagate
        to the caller.
    """
    response = session.get(user_detail_url, timeout=10)

    if response.status_code != 200:
        try:
            error_message = response.json().get('message', 'No additional error info')
        except (ValueError, AttributeError):
            # Error bodies are not guaranteed to be a JSON object.
            error_message = 'No additional error info'
        print(f"Failed to fetch details for {login}: {response.status_code} - {error_message}")
        return None

    user_data = response.json()

    # Normalise the company: trim whitespace first so a leading '@' is really
    # at the start of the string, drop the '@', trim again (covers "@ acme"),
    # then upper-case. A missing/null company becomes "".
    company = (user_data.get("company") or "").strip().lstrip("@").strip().upper()

    return {
        "login": user_data.get("login", ""),
        "name": user_data.get("name", ""),
        "company": company,
        "location": user_data.get("location", ""),
        "email": user_data.get("email", ""),
        "hireable": user_data.get("hireable", ""),
        "bio": user_data.get("bio", ""),
        "public_repos": user_data.get("public_repos", 0),
        "followers": user_data.get("followers", 0),
        "following": user_data.get("following", 0),
        "created_at": user_data.get("created_at", "")
    }

# Column order of users.csv; mirrors the keys of the dicts returned by
# fetch_user_details. Passing it explicitly guarantees a header row even when
# no users were fetched, so users.csv can always be read back.
USER_COLUMNS = [
    "login", "name", "company", "location", "email", "hireable", "bio",
    "public_repos", "followers", "following", "created_at",
]

# Fetch users and save to 'users.csv'
sydney_users = fetch_sydney_users()
df_users = pd.DataFrame(sydney_users, columns=USER_COLUMNS)
df_users.to_csv("users.csv", index=False)

if sydney_users:
    print("Detailed user data saved to users.csv successfully.")
else:
    # Do not report success for an empty crawl (e.g. after an API error).
    print("Warning: no users were fetched; users.csv contains only the header row.")


Additional cleaning of users.csv

In [13]:
# Additional cleaning pass over users.csv
import pandas as pd

# Read back the user table written by the collection step.
users_df = pd.read_csv('users.csv')

# Normalise the company column in one chained pass:
# trim surrounding whitespace, drop leading '@' characters, upper-case.
users_df['company'] = (
    users_df['company']
    .str.strip()
    .str.lstrip('@')
    .str.upper()
)

# Overwrite users.csv with the normalised table (index column omitted).
users_df.to_csv('users.csv', index=False)

print("Company names cleaned and saved to users.csv.")


Company names cleaned and saved to users.csv.


Download users.csv

In [15]:
from google.colab import files

# Pass users.csv (relative to the current working directory) to the
# `files.download` helper imported from google.colab above.
# NOTE(review): the google.colab import presumably exists only inside a
# Colab runtime, so this cell is expected to fail elsewhere -- confirm.
files.download("users.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Code for repositories.csv

In [None]:
import requests
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry

import os

# GitHub API endpoint and credentials
GITHUB_API_URL = "https://api.github.com"

# Read the token from the GITHUB_TOKEN environment variable instead of pasting
# it into the notebook, so the secret is never saved or shared with the file.
# In a notebook it can be set beforehand with `%env GITHUB_TOKEN=<token>`.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()

HEADERS = {
    "Accept": "application/vnd.github.v3+json"
}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"
else:
    # Without a token no Authorization header is sent at all (sending a
    # placeholder value would make every request fail authentication).
    print("Warning: GITHUB_TOKEN is not set; requests will be sent unauthenticated.")

# Session with retry and backoff: up to 5 retries, with exponential backoff,
# for responses whose status is one of the codes listed in status_forcelist.
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
# The retry adapter is mounted for https:// URLs only.
session.mount("https://", adapter)
session.headers.update(HEADERS)

def fetch_user_repositories(login):
    """Fetch public repositories for a given user.

    Pages through ``/users/{login}/repos`` (100 repositories per page) using
    the shared ``session``.

    Args:
        login: GitHub login whose repositories are listed.

    Returns:
        A list of dicts, one per repository, with the keys ``login``,
        ``full_name``, ``created_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki`` and
        ``license_name`` (the licence's ``key``, or "" when the repository
        has no licence). On a non-200 response, an undecodable body or a
        failed request, a message is printed and the repositories collected
        so far are returned.
    """
    per_page = 100
    repos = []
    page = 1

    while True:
        try:
            response = session.get(
                f"{GITHUB_API_URL}/users/{login}/repos",
                # Let requests build and escape the query string.
                params={"page": page, "per_page": per_page},
                timeout=10,
            )
        except RequestException as e:
            print(f"Request failed for {login}: {e}")
            break

        if response.status_code != 200:
            print(f"Error fetching repos for {login}: {response.status_code} - {response.text}")
            break

        try:
            repo_data = response.json()
        except ValueError as e:
            # A 200 response whose body is not valid JSON.
            print(f"Request failed for {login}: {e}")
            break

        if not repo_data:
            break

        for repo in repo_data:
            # "license" is null for repositories without a detected licence.
            license_info = repo.get("license") or {}
            repos.append({
                "login": login,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_info.get("key", "")
            })

        time.sleep(1)  # To avoid hitting the rate limit

        if len(repo_data) < per_page:
            # A short page is the last one; do not spend another request
            # just to receive an empty list.
            break
        page += 1

    return repos

# Column order of repositories.csv; mirrors the keys of the dicts returned by
# fetch_user_repositories. Passing it explicitly guarantees a header row even
# when no repositories were fetched.
REPOSITORY_COLUMNS = [
    "login", "full_name", "created_at", "stargazers_count", "watchers_count",
    "language", "has_projects", "has_wiki", "license_name",
]

# Read users from 'users.csv'
users_df = pd.read_csv("users.csv")

# Initialize a list to store repository data
all_repos = []

# Fetch the repositories of every user. Only the login column is needed, so
# iterate over it directly; rows with a missing login are skipped rather than
# being turned into a request for a user literally named "nan".
for login in users_df["login"].dropna():
    print(f"Fetching repositories for user: {login}")
    all_repos.extend(fetch_user_repositories(login))

# Save repositories to 'repositories.csv'
df_repositories = pd.DataFrame(all_repos, columns=REPOSITORY_COLUMNS)
df_repositories.to_csv("repositories.csv", index=False)
print("Repository data saved to repositories.csv successfully.")


Code to Download repositories.csv from Google Colab

In [17]:
from google.colab import files

# Name of the file to download; 'repositories.csv' is the path the
# repository-collection cell writes to. Change it to download another file.
file_name = 'repositories.csv'  # Change this to your file name if needed

# Pass the file to the `files.download` helper imported from google.colab above.
# NOTE(review): presumably only works inside a Colab runtime -- confirm.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>