In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def scrape_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        website_content = response.content
        soup = BeautifulSoup(website_content, 'html.parser')
        link_tags = soup.find_all('link', rel='stylesheet')
        css_content = []
        for link in link_tags:
            css_url = link.get('href')
            if not bool(urlparse(css_url).netloc):
                css_url = urljoin(url, css_url)
            css_response = requests.get(css_url)
            if css_response.status_code == 200:
                css_content.append(css_response.text)
        return soup.prettify(), css_content

url = 'https://www.uscis.gov/'
html, css = scrape_website(url)
print("HTML:\n", html)
print("CSS:\n", css)

HTML:
 <!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <noscript>
   <style>
    form.antibot * :not(.antibot-message) { display: none !important; }
   </style>
  </noscript>
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-12572986-2">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "UA-12572986-2", {"groups":"default","anonymize_ip":true,"page_placeholder":"PLACEHOLDER_page_path","allow_ad_personalization_signals":false});
  </script>
  <link href="https://www.uscis.gov/" rel="canonical"/>
  <link href="https://www.uscis.gov/" rel="shortlink"/>
  <meta content="USCIS" property="og:site_name"/>
  <meta content="article" property="og:type"/>
  <meta content="https://www.uscis.gov/home" property="og:url"/>
  <meta content="Home" property="og:title"/>
  <meta cont

In [2]:
def extract_important_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    important_tags = ['a', 'li', 'p']
    important_content = [tag.get_text(strip=True) for tag in soup.find_all(important_tags)]
    image_tags = soup.find_all('img')
    image_sources = [tag.get('src') for tag in image_tags if tag.get('src')]
    combined_content = important_content + image_sources

    return combined_content

In [3]:
print(extract_important_content(html))

['Skip to main content', "Here's how you know", 'Español', 'Multilingual Resources', 'Official websites use .govA.govwebsite belongs to an official government organization in the United States.', "Secure .gov websites use HTTPSAlock(A locked padlock) orhttps://means you've safely connected to the .gov website. Share sensitive information only on official, secure websites.", '', 'Sign In', 'Sign In', 'Create Account', 'Create Account', 'Sign In', 'Create Account', 'Topics', 'Topics', 'Forms', 'Forms', 'Newsroom', 'Newsroom', 'Citizenship', 'Citizenship', 'Green Card', 'Green Card', 'Laws', 'Laws', 'Tools', 'Tools', 'Contact us', 'Contact us', 'Multilingual Resources', 'Multilingual Resources', 'On July 7, 2023, DHS announced the implementation of new family reunification parole (FRP) processes for Colombia, El Salvador, Guatemala, and Honduras. The new FRP processes are for nationals from Colombia, El Salvador, Guatemala, and Honduras whose family members are U.S. citizens or lawful per

In [1]:
# Complete Script w/ Compression

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin, urlparse
#import openai

#openai.api_key = 'your-api-key' 

def scrape_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        website_content = response.content
        soup = BeautifulSoup(website_content, 'html.parser')
        important_tags = ['a', 'li', 'p']
        important_content = [tag.get_text(strip=True) for tag in soup.find_all(important_tags)]
        image_tags = soup.find_all('img')
        image_sources = [tag.get('src') for tag in image_tags if tag.get('src')]
        combined_content = important_content + image_sources
        return combined_content

def compress_content(content):
    compressed_content = ' '.join(content)
    return compressed_content

def generate_content_with_gpt3(prompt):
    response = openai.Completion.create(
        engine="text-davinci-003",  # use "text-curie-003" for more tokens
        prompt=prompt,
        temperature=0.5,
        max_tokens=100
    )
    return response.choices[0].text.strip()

url = 'https://www.amazon.com/s?k=kaweco+mechanical+pencil&crid=11ZKCINA82HMT&sprefix=%2Caps%2C214&ref=nb_sb_ss_recent_1_0_recent'
content = scrape_website(url)
compressed_content = compress_content(content)
#generated_content = generate_content_with_gpt3(compressed_content)
print(compressed_content)

TypeError: can only join an iterable

In [4]:
import tiktoken

def count_tokens(text):
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    num_tokens = len(encoding.encode(text))
    return num_tokens

print(count_tokens(compressed_content))

0


In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver

def scrape_website(url):
    driver = webdriver.Chrome()  # or webdriver.Chrome(), depending on your installed browser
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    return soup

def get_product_info(soup):
    product_info = []
    for product in soup.find_all('div', {'class': 'sg-col-inner'}):
        info = {}
        title = product.find('span', {'class': 'a-size-medium'})
        link = product.find('a', {'class': 'a-link-normal'})
        image = product.find('img', {'class': 's-image'})
        if title:
            info['title'] = title.text
        if link and link.get('href'):
            info['link'] = urljoin(url, link.get('href'))
        if image and image.get('src'):
            info['image'] = image.get('src')
        if info:
            product_info.append(info)
    return product_info

url = 'https://www.amazon.com/s?k=kaweco+mechanical+pencil&crid=11ZKCINA82HMT&sprefix=%2Caps%2C214&ref=nb_sb_ss_recent_1_0_recent'
soup = scrape_website(url)
product_info = get_product_info(soup)
for product in product_info:
    print(product)


{'title': 'You got ideas. We got pens.You got ideas. We got pens.', 'link': 'https://aax-us-iad.amazon.com/x/c/RHclAUGidRLDd8OSWuvO_04AAAGJZamMzwEAAAH2AQBvbm9fdHhuX2JpZDEgICBvbm9fdHhuX2ltcDEgICDUguus/https://www.amazon.com/stores/page/20CCFF74-C6F6-4420-9C79-1B3431BE5508/?_encoding=UTF8&store_ref=SB_A00126013MH5UI7TMLFO9&pd_rd_plhdr=t&aaxitk=2b3ce955bcee71cebf2010db5bf846cb&hsa_cr_id=6424767650201&lp_asins=B08S7C631F%2CB08S44VHXD&lp_query=kaweco%20mechanical%20pencil&lp_slot=auto-sparkle-hsa-tetris&ref_=sbx_be_s_sparkle_mcd_bkgd&pd_rd_w=hBuzQ&content-id=amzn1.sym.cd95889f-432f-43a7-8ec8-833616493f4a%3Aamzn1.sym.cd95889f-432f-43a7-8ec8-833616493f4a&pf_rd_p=cd95889f-432f-43a7-8ec8-833616493f4a&pf_rd_r=0T1B0AB5STDM2DW3B7CK&pd_rd_wg=5acNA&pd_rd_r=e698e3db-acf3-475d-b9d9-c723ca32f242', 'image': 'https://m.media-amazon.com/images/I/3181KZu3A3L._AC_UL400_.jpg'}
{'link': 'https://www.amazon.com/Kaweco-Special-Mechanical-Pencil-Brass/dp/B06XCLY7Y5/ref=sr_1_1?crid=11ZKCINA82HMT&keywords=kaweco+m

In [None]:
#GPT-4 Markdown, hashes are compressed links

# Main Content

## Sign In
- [Create Account](#)

## Topics
- [Forms](#)
- [Newsroom](#)
- [Citizenship](#)
- [Green Card](#)
- [Laws](#)
- [Tools](#)

## Forms
- All Forms
- [Explore my Options](#)
- [How to Change Your Address](#)

## Filing Guidance
- [How to Change Your Address](#)

### How to Change Your Address
We strongly encourage you to update your address with USCIS to ensure you receive all correspondence and benefits from us in a timely manner and avoid possible delays related to your case. Changing your address with the U.S. Postal Service will not change your address with USCIS. Please update your information with both USCIS and USPS.

You can change your address in two ways:
1. Through your existing [USCIS online account](#) if you filed your form online
2. Filing Form AR-11, Alien’s Change of Address Card, online using the [Change of Address page](#) (as long as you didn’t file one of the forms in the sections below).

If you previously filed and/or have a pending or approved form in the chart below, you must mail your Form AR-11, Alien’s Change of Address Card, to the Vermont Service Center address listed in the chart below.

## Public Service Announcements
- [Change of Address 30 second PSA](#)
- [Change of Address 60 second PSA](#)
- [Change of Address PSA Scripts](#)