In [1]:
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Target page: the nightly.changelog.com page for yesterday's date.
current_date = datetime.now()
yesterday_date = current_date - timedelta(days=1)

# The page path is the date written as YYYY/MM/DD.
formatted_date = yesterday_date.strftime("%Y/%m/%d")

base_url = "https://nightly.changelog.com"
input_url = f"{base_url}/{formatted_date}"

# CSV export path, stamped with the same (zero-padded) date as the scraped page.
filename = f"exports/export_changelog_{yesterday_date.year}_{yesterday_date.month:02d}_{yesterday_date.day:02d}.csv"

# Author URLs whose repositories are removed from the results later on.
exclusion_list = [
    "https://github.com/thechangelog",
    "https://github.com/trending",
    "https://github.com/NVIDIA",
]




In [2]:
def scrape_urls(website_url, timeout=10):
    """Return the ``href`` value of every anchor tag on a web page.

    Args:
        website_url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        A list of ``href`` strings exactly as written in the page (relative
        links are not resolved). An empty list is returned, and the error is
        printed, when the request fails or the server answers with an error
        status.
    """
    try:
        response = requests.get(website_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error accessing {website_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True restricts the search to anchors that actually carry an href.
    return [a.get('href') for a in soup.find_all('a', href=True)]

def check_redirect_to_github(url):
    """Follow redirects from ``url`` and report whether they end on github.com.

    Args:
        url: Absolute URL to probe with a HEAD request.

    Returns:
        ``(True, final_url)`` when the host of the final URL is github.com or
        one of its subdomains; ``(False, None)`` otherwise, including when the
        request fails.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
    except requests.RequestException:
        # Best effort: unreachable or unsupported links simply do not count.
        return False, None

    # Compare the parsed host instead of searching the whole URL, so that
    # "github.com" appearing in a path, query string or look-alike domain
    # is not mistaken for GitHub.
    host = (urlparse(response.url).hostname or "").lower()
    if host == "github.com" or host.endswith(".github.com"):
        return True, response.url
    return False, None

def is_github_repo(url):
    """Classify ``url`` as a GitHub repository address.

    A repository address has the form ``https://github.com/<owner>/<name>``
    with exactly two path segments after the host.

    Returns:
        ``(True, author_url, repo_url)`` for a repository address, where
        ``author_url`` is ``https://github.com/<owner>`` and ``repo_url`` is
        the input itself; ``(False, None, None)`` for anything else.
    """
    found = re.match(r"^https://github\.com/([^/]+)/([^/]+)$", url)
    if found is None:
        return False, None, None

    owner = found.group(1)
    return True, f"https://github.com/{owner}", url

# Step 1: Collect every anchor href from the page at input_url
# (scrape_urls returns an empty list if the request fails)
print("Scraping URLs...")
urls = scrape_urls(input_url)

# Report how many links were collected; the list is not filtered or de-duplicated here
urls_count = len(urls)
print(f"Number of items in the initial url list: {urls_count}")

Scraping URLs...
Number of items in the initial url list: 94


In [3]:
# Step 2: Check for redirection to github.com
print("Checking for redirects to github.com...")

# Column layout of the results table. Only "Repo" and "Author" are filled in
# by this cell; the remaining columns start out empty.
result_columns = ["Repo", "Repo_name", "Repo_desc" , "Star_count" , "Author", "Name", "Type", "Website", "Bio", "Location"]

repo_records = []
for url in urls:
    if not url.startswith("http"):
        url = requests.compat.urljoin(input_url, url)  # Handle relative URLs
    is_redirect, redirected_url = check_redirect_to_github(url)
    if is_redirect:
        is_repo, author, repo = is_github_repo(redirected_url)
        if is_repo:
            repo_records.append({"Repo": repo, "Author": author})

# Build the frame once from the collected records instead of concatenating a
# one-row frame per hit (quadratic, and noisy on newer pandas versions).
# dtype=object keeps every column able to hold strings as well as numbers.
github_redirects = pd.DataFrame(repo_records, columns=result_columns, dtype=object)

row_count = len(github_redirects)
print(f"Number of repos: {row_count}")
github_redirects.head(50)

Checking for redirects to github.com...
Number of repos: 42


Unnamed: 0,Repo,Repo_name,Repo_desc,Star_count,Author,Name,Type,Website,Bio,Location
0,https://github.com/thechangelog/nightly,,,,https://github.com/thechangelog,,,,,
1,https://github.com/henrythe9th/AI-Crash-Course,,,,https://github.com/henrythe9th,,,,,
2,https://github.com/StellarSand/privacy-settings,,,,https://github.com/StellarSand,,,,,
3,https://github.com/trending/python,,,,https://github.com/trending,,,,,
4,https://github.com/PriorLabs/TabPFN,,,,https://github.com/PriorLabs,,,,,
5,https://github.com/trending/kotlin,,,,https://github.com/trending,,,,,
6,https://github.com/rifsxd/KernelSU-Next,,,,https://github.com/rifsxd,,,,,
7,https://github.com/trending/typescript,,,,https://github.com/trending,,,,,
8,https://github.com/PollensAI/Pollens,,,,https://github.com/PollensAI,,,,,
9,https://github.com/trending/javascript,,,,https://github.com/trending,,,,,


In [4]:
# Remove repositories whose author URL is on the exclusion list. A boolean
# mask avoids dropping rows from the frame while it is being iterated, and
# .copy() gives later cells an independent frame to write into.
excluded_mask = github_redirects['Author'].isin(exclusion_list)
github_redirects = github_redirects.loc[~excluded_mask].copy()

row_count = len(github_redirects)
print(f"Number of repos after exclusion: {row_count}")

print(f"Repos list after exclusion:")
github_redirects.head(50)

Number of repos after exclusion: 22
Repos list after exclusion:


Unnamed: 0,Repo,Repo_name,Repo_desc,Star_count,Author,Name,Type,Website,Bio,Location
1,https://github.com/henrythe9th/AI-Crash-Course,,,,https://github.com/henrythe9th,,,,,
2,https://github.com/StellarSand/privacy-settings,,,,https://github.com/StellarSand,,,,,
4,https://github.com/PriorLabs/TabPFN,,,,https://github.com/PriorLabs,,,,,
6,https://github.com/rifsxd/KernelSU-Next,,,,https://github.com/rifsxd,,,,,
8,https://github.com/PollensAI/Pollens,,,,https://github.com/PollensAI,,,,,
10,https://github.com/ASITHA-MD/ASITHA-BOT,,,,https://github.com/ASITHA-MD,,,,,
12,https://github.com/polarsignals/kubezonnet,,,,https://github.com/polarsignals,,,,,
14,https://github.com/jackplus-xyz/binary.nvim,,,,https://github.com/jackplus-xyz,,,,,
16,https://github.com/lio-mengxiang/eslint9-quick...,,,,https://github.com/lio-mengxiang,,,,,
18,https://github.com/SILENTLOVER40/SILENT-SOBX-M...,,,,https://github.com/SILENTLOVER40,,,,,


In [None]:
# GitHub API token. Prefer the GITHUB_TOKEN environment variable so the secret
# does not have to be stored in the notebook; the placeholder is the fallback.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "INSERT TOKEN HERE")

def get_github_author_info(author_url):
    """Fetch profile details for a GitHub user or organization.

    Args:
        author_url: Profile address of the form ``https://github.com/<login>``.

    Returns:
        A 5-tuple ``(name, type, website, bio, location)``. Missing or empty
        website, bio and location values are replaced by the placeholders
        ``"No website"``, ``"No bio"`` and ``"No location"``. When the request
        fails the error is printed and
        ``(None, None, "No website", "No bio", "No location")`` is returned,
        so callers can always unpack five values.
    """
    username = author_url.split("https://github.com/")[-1].rstrip("/")
    api_url = f"https://api.github.com/users/{username}"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {GITHUB_TOKEN}",
    }
    try:
        # A timeout keeps one unresponsive call from stalling the whole loop.
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        user_data = response.json()
    except requests.RequestException as e:
        print(f"Error fetching data for {username}: {e}")
        return None, None, "No website", "No bio", "No location"

    user_name = user_data.get("name")
    user_type = user_data.get("type")
    # `or` also covers fields that are present but null or empty.
    blog = user_data.get("blog") or "No website"
    bio = user_data.get("bio") or "No bio"
    location = user_data.get("location") or "No location"
    return user_name, user_type, blog, bio, location
    

def query_github_repo(repo_url):
    """Fetch name, description and star count for a GitHub repository.

    Args:
        repo_url: Address of the form ``https://github.com/<owner>/<repo>``.
            Extra path segments after the repository name are ignored.

    Returns:
        ``(repo_name, repo_description, star_count)``. A missing or null
        description becomes ``"No description available"``. When the request
        fails the error is printed and ``(None, None, None)`` is returned.

    Raises:
        ValueError: If ``repo_url`` does not start with ``https://github.com/``
            or does not contain both an owner and a repository segment.
    """
    if not repo_url.startswith("https://github.com/"):
        raise ValueError("Invalid GitHub repository URL")

    # Extract the owner and repo name from the URL
    parts = repo_url.split("https://github.com/")[-1].split("/")
    if len(parts) < 2:
        raise ValueError("Incomplete GitHub repository URL")

    owner, repo = parts[:2]

    # Construct the API call
    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {GITHUB_TOKEN}",
    }
    try:
        # A timeout keeps one unresponsive call from stalling the whole loop.
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        repo_data = response.json()
    except requests.RequestException as e:
        print(f"Error querying GitHub API: {e}")
        return None, None, None

    repo_name = repo_data.get("name", "Unknown Repo")
    # The key can be present with a null value, which a .get() default would
    # not replace, so fall back with `or` instead.
    repo_description = repo_data.get("description") or "No description available"
    star_count = repo_data.get("stargazers_count", 0)
    return repo_name, repo_description, star_count


# Fill in the author and repository columns of every row from the GitHub API.
for index, row in github_redirects.iterrows():
    user_name, user_type, blog, bio, location = get_github_author_info(row['Author'])
    repo_name, repo_desc, star_count = query_github_repo(row['Repo'])

    # Column -> value for this row, written cell by cell below.
    enriched = {
        'Repo_name': repo_name,
        'Repo_desc': repo_desc,
        'Star_count': star_count,
        'Name': user_name,
        'Type': user_type,
        'Website': blog,
        'Bio': bio,
        'Location': location,
    }
    for column, value in enriched.items():
        github_redirects.at[index, column] = value
    print(star_count)

# Star counts were stored as generic objects; convert them to numbers
# (values that cannot be parsed become NaN).
github_redirects['Star_count'] = pd.to_numeric(github_redirects['Star_count'], errors='coerce')

df = github_redirects.copy()
df.head(40)


1031
877
1702
117
301
25
32
23
23
16
11
18681
3912
1979
771
2199
21122
966
42722
12277
1522
58022


Unnamed: 0,Repo,Repo_name,Repo_desc,Star_count,Author,Name,Type,Website,Bio,Location,Start_count
1,https://github.com/henrythe9th/AI-Crash-Course,AI-Crash-Course,AI Crash Course to help busy builders catch up...,,https://github.com/henrythe9th,Henry Shi,User,www.super.com,Co-Founder of Super.com. Recently Reactivated ...,San Francisco,1031.0
2,https://github.com/StellarSand/privacy-settings,privacy-settings,Guide to privacy settings for most major softw...,,https://github.com/StellarSand,StellarSand,User,No website,No bio,Planet 3,877.0
4,https://github.com/PriorLabs/TabPFN,TabPFN,⚡ TabPFN: Foundation Model for Tabular Data ⚡,,https://github.com/PriorLabs,Prior Labs,Organization,https://www.priorlabs.ai,No bio,No location,1702.0
6,https://github.com/rifsxd/KernelSU-Next,KernelSU-Next,A Kernel based root solution for Android,,https://github.com/rifsxd,Rifat Azad,User,No website,Tired.....,No location,117.0
8,https://github.com/PollensAI/Pollens,Pollens,Buzz Together in Real Time: Your BeeSync for C...,,https://github.com/PollensAI,Pollens,Organization,pollens.app,Buzz Together in Real Time: Your BeeSync for C...,No location,301.0
10,https://github.com/ASITHA-MD/ASITHA-BOT,ASITHA-BOT,Asitha-md whatsapp bot,,https://github.com/ASITHA-MD,,User,No website,No bio,No location,25.0
12,https://github.com/polarsignals/kubezonnet,kubezonnet,Monitor cross-zone network traffic in Kubernetes.,,https://github.com/polarsignals,Polar Signals,Organization,https://polarsignals.com/,No bio,No location,32.0
14,https://github.com/jackplus-xyz/binary.nvim,binary.nvim,A minmal Neovim colorscheme using only two col...,,https://github.com/jackplus-xyz,Jack Huang,User,https://jackplus.xyz,A software engineer who has a dream to liberat...,No location,23.0
16,https://github.com/lio-mengxiang/eslint9-quick...,eslint9-quick-config-tool,it's a cli tool to help you update eslint 9.x ...,,https://github.com/lio-mengxiang,孟祥_成都,User,https://juejin.cn/user/96412752684744/posts,keep making progress，永远保持一颗学习的心 微信：a2298613245,成都,23.0
18,https://github.com/SILENTLOVER40/SILENT-SOBX-M...,SILENT-SOBX-MD-V2,✧〖THE WORLD BEST WHATSAPP BOT CREATED BY SILEN...,,https://github.com/SILENTLOVER40,SILENTLOVER432,User,https://github.com/SILENTLOVER40/SILENT-SOBX-MD,"""✰°☆ ⃝ ᎠoᴎT wAꙅTɘ Uᴙ Timɘ To impᴙɘꙅꙅ mɘ ...",PAKISTAN,16.0


In [8]:
# Order the rows by account type before exporting.
github_redirects = github_redirects.sort_values(by='Type', ascending=True)

# to_csv does not create missing folders, so make sure the export directory
# exists before writing.
export_dir = os.path.dirname(filename)
if export_dir:
    os.makedirs(export_dir, exist_ok=True)

github_redirects.to_csv(filename, index=False)
print(input_url)

https://nightly.changelog.com/2025/01/09


In [11]:
def get_rate_limit():
    """Print the GitHub API rate-limit status for the configured token.

    Queries ``https://api.github.com/rate_limit`` with ``GITHUB_TOKEN`` and
    prints the limit, the remaining requests and the reset value exactly as
    returned in the response's ``rate`` object. If the request fails, the
    error is printed instead. Nothing is returned.
    """
    api_url = "https://api.github.com/rate_limit"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {GITHUB_TOKEN}",
    }
    try:
        # A timeout keeps the cell from hanging on an unresponsive connection.
        response = requests.get(api_url, headers=headers, timeout=10)
        response.raise_for_status()
        rate_limit_data = response.json()
    except requests.RequestException as e:
        print(f"Error fetching rate limit data: {e}")
        return

    # Extract rate limit information
    rate = rate_limit_data['rate']
    remaining = rate['remaining']
    limit = rate['limit']
    reset_time = rate['reset']

    print(f"Rate Limit: {limit}, Remaining: {remaining}, Resets at: {reset_time}")

# Call the function: prints the limit, the remaining requests and the reset value,
# or an error message if the request fails
get_rate_limit()

Rate Limit: 5000, Remaining: 5000, Resets at: 1736508224
