# acquire.py

In [16]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup

# The scraping helpers below take the request headers as an argument; get_blog_articles builds the headers it uses.


def get_soup(url, headers):
    """
    Download a webpage and hand back its parsed HTML.

    Parameters:
        url (str): Address of the page to request.
        headers (dict): HTTP headers sent with the GET request.

    Returns:
        BeautifulSoup: The response body parsed with the 'html.parser' backend.
    """
    page = requests.get(url, headers=headers)
    raw_html = page.content
    return BeautifulSoup(raw_html, 'html.parser')

#-----------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------
#                                       CODEUP SCRAPER
#-----------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------

def get_blog_from_page(soup, headers):
    """
    Extracts articles from a single page of a blog site.

    Every <h3> heading that wraps a link is treated as one article: the
    heading text becomes the title and the linked page is fetched for the
    content.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the blog page.
        headers (dict): HTTP headers forwarded to each article request.

    Returns:
        list: List of dictionaries, each containing the title and content of a blog article.
    """
    articles = []
    for heading in soup.find_all("h3"):
        link = heading.find("a")
        # Headings that do not wrap a link are skipped.
        if link is None:
            continue
        article_url = link.get("href")
        # An anchor without an href gives nothing to fetch, so skip it
        # instead of passing None on to the article request.
        if not article_url:
            continue
        articles.append({
            'title': heading.get_text(),
            'content': get_blog_content(article_url, headers),
        })
    return articles

#-----------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------

def get_blog_content(article_url, headers):
    """
    Download one blog article and return the text of its paragraphs.

    Parameters:
        article_url (str): Address of the article page to request.
        headers (dict): HTTP headers sent with the GET request.

    Returns:
        str: Text of every <p> inside the first ".entry-content" element,
            joined with single spaces.

    Raises:
        IndexError: If the page has no element matching ".entry-content".
    """
    page = requests.get(article_url, headers=headers)
    parsed = BeautifulSoup(page.content, 'html.parser')
    # Only the first matching container is used.
    body = parsed.select(".entry-content")[0]
    paragraphs = [p.get_text() for p in body.find_all("p")]
    return ' '.join(paragraphs)

#-----------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------

def get_blog_next_page_url(soup):
    """
    Identifies the URL for the next page of articles, if available.

    Looks for the first link inside the first <div class="alignleft">.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the current blog page.

    Returns:
        str or None: The URL for the next page of articles, or None if the
            div or the link inside it is not present.
    """
    nav = soup.find("div", class_="alignleft")
    # find() returns None when the div is missing; without this guard the
    # chained .find("a") raised AttributeError instead of returning None.
    if nav is None:
        return None
    next_page = nav.find("a")
    return next_page.get("href") if next_page else None

#-----------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------

def get_blog_articles(url):
    """
    Scrapes and aggregates articles from multiple pages of a blog site.

    Starting at `url`, each page is fetched, its articles are collected, and
    the next-page link is followed until no further link is found. Progress
    is printed for every page.

    Parameters:
        url (str): The initial URL to start scraping from.

    Returns:
        list: List of dictionaries, each containing the title and content of a blog article.
    """
    headers = {'User-Agent': 'Codeup Data Science'}
    blog_articles = []
    page_number = 1
    while True:
        print(f"Getting page #{page_number}...")
        soup = get_soup(url, headers)
        # get_blog_from_page takes (soup, headers); passing page_number as an
        # extra positional argument raised TypeError on the first page.
        blog_articles.extend(get_blog_from_page(soup, headers))
        print(f"Completed page #{page_number}.")
        next_page_url = get_blog_next_page_url(soup)
        if next_page_url is None:
            print("Complete")
            break
        url = next_page_url
        page_number += 1
    return blog_articles



In [3]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup
from env import github_token, github_username

In [8]:
# GitHub repository search for "robotics" with the filter stars:>200
# (URL-encoded in the query string as stars%3A%3E200).
url = 'https://github.com/search?q=robotics+stars%3A%3E200&type=repositories'
# Token-based headers, currently disabled:
# headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}


In [9]:
# GitHub repository search for "robotics" with the filter stars:>200.
url = 'https://github.com/search?q=robotics+stars%3A%3E200&type=repositories'
# Browser-style request headers: a desktop Chrome User-Agent plus language,
# encoding, connection and no-cache hints.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}


In [10]:
# Request the search URL with the headers defined above, then parse the
# response body with the 'html.parser' backend.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

In [11]:
soup

{"payload":{"header_redesign_enabled":false,"results":[{"id":"54376220","archived":false,"color":"#3572A5","followers":19990,"has_funding_file":true,"hl_name":"AtsushiSakai/Python<em>Robotics</em>","hl_trunc_description":"Python sample codes for <em>robotics</em> algorithms.","language":"Python","mirror":false,"owned_by_organization":false,"public":true,"repo":{"repository":{"id":54376220,"name":"PythonRobotics","owner_id":3813847,"owner_login":"AtsushiSakai","updated_at":"2023-10-18T13:02:18.783Z","has_issues":true}},"sponsorable":true,"topics":["slam","robotics","localization","python","mapping","control","algorithm","robot","animation","path-planning","autonomous-driving","autonomous-vehicles","ekf","hacktoberfest","cvxpy","autonomous-navigation"],"type":"Public","help_wanted_issues_count":4,"good_first_issue_issues_count":0,"starred_by_current_user":false},{"id":"46938122","archived":false,"color":null,"followers":3543,"has_funding_file":false,"hl_name":"kiloreux/awesome-<em>roboti

In [27]:
soup

{"payload":{"header_redesign_enabled":false,"results":[{"id":"54376220","archived":false,"color":"#3572A5","followers":19990,"has_funding_file":true,"hl_name":"AtsushiSakai/Python<em>Robotics</em>","hl_trunc_description":"Python sample codes for <em>robotics</em> algorithms.","language":"Python","mirror":false,"owned_by_organization":false,"public":true,"repo":{"repository":{"id":54376220,"name":"PythonRobotics","owner_id":3813847,"owner_login":"AtsushiSakai","updated_at":"2023-10-18T13:02:18.783Z","has_issues":true}},"sponsorable":true,"topics":["slam","robotics","localization","python","mapping","control","algorithm","robot","animation","path-planning","autonomous-driving","autonomous-vehicles","ekf","hacktoberfest","cvxpy","autonomous-navigation"],"type":"Public","help_wanted_issues_count":4,"good_first_issue_issues_count":0,"starred_by_current_user":false},{"id":"46938122","archived":false,"color":null,"followers":3543,"has_funding_file":false,"hl_name":"kiloreux/awesome-<em>roboti

In [21]:
# Collect every <h3> element from the parsed response; the bare name on the
# last line displays the result in the notebook.
links = soup.find_all("h3")
links

[]

In [None]:
# Build a title/content record for every heading in `links` that wraps a link.
articles = []
for article in links:
    anchor = article.find("a")
    if anchor:
        articles.append({
            'title': article.get_text(),
            'content': get_blog_content(anchor.get("href"), headers),
        })
# `return` is only valid inside a function and made this cell a SyntaxError;
# a bare expression displays the result in the notebook instead.
articles