In [1]:
#!/usr/bin/env python
# coding: utf-8

import requests
from bs4 import BeautifulSoup
import csv

def fetch_content(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` when the server answers with
        status 200, otherwise ``None`` (a diagnostic is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, etc. are reported the
        # same way as a bad status code so callers only have to test for None.
        print(f'Failed to retrieve {url}. Error: {exc}')
        return None

    if response.status_code == 200:
        return response.content

    print(f'Failed to retrieve {url}. Status code: {response.status_code}')
    return None

def save_to_csv(data, filename, headers=None):
    """Write *data* to *filename* as a UTF-8 CSV file, overwriting it.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the CSV file to create.
        headers: Optional header row. It is written first when truthy and
            skipped when ``None`` or empty.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file)
        leading_rows = [headers] if headers else []
        csv_writer.writerows(leading_rows)
        csv_writer.writerows(data)

def scrape_vokasi_uns():
    """Fetch the vokasi.uns.ac.id home page and print its raw body.

    Nothing is printed here when the download fails or the body is empty;
    ``fetch_content`` reports failures itself.
    """
    page_bytes = fetch_content('https://vokasi.uns.ac.id/')
    if not page_bytes:
        return
    print(page_bytes)

def scrape_bukalapak():
    """Fetch the bukalapak.com home page and print details of its <title>.

    Prints the title tag itself, the tag's name, and the name of its parent
    element. If the page could not be fetched nothing is printed here; if
    the page has no <title> element a short notice is printed instead of
    raising ``AttributeError``.
    """
    url = 'https://www.bukalapak.com/'
    content = fetch_content(url)
    if not content:
        return

    soup = BeautifulSoup(content, 'html.parser')
    title = soup.title
    if title is None:
        # BeautifulSoup returns None for a missing tag; accessing .name on it
        # would crash, so report the situation and stop.
        print(f'No <title> element found at {url}')
        return

    print(title)
    print(title.name)
    print(title.parent.name)

def scrape_proxyway_images():
    """Collect every <img> on proxyway.com/news and save it to a CSV file.

    Each image becomes a ``[src, alt]`` row (missing attributes are
    ``None``). The rows are written to ``data_gambar.csv`` with a
    ``src,alt`` header and then printed one per line. Does nothing when the
    page could not be fetched.
    """
    page = fetch_content('https://proxyway.com/news')
    if not page:
        return

    document = BeautifulSoup(page, 'html.parser')
    rows = [[tag.get('src'), tag.get('alt')] for tag in document.select('img')]

    save_to_csv(rows, 'data_gambar.csv', headers=['src', 'alt'])
    for row in rows:
        print(row)

def scrape_proxyway_subtitles():
    """Collect the text of every <h2> on proxyway.com/news.

    The heading texts are written to ``subjudul.csv`` (one per row, no
    header) and then printed. Does nothing when the page could not be
    fetched.
    """
    page = fetch_content('https://proxyway.com/news')
    if not page:
        return

    document = BeautifulSoup(page, 'html.parser')
    # Each heading is wrapped in a one-element list so it forms a CSV row.
    heading_rows = []
    for heading in document.find_all('h2'):
        heading_rows.append([heading.text])

    save_to_csv(heading_rows, 'subjudul.csv')
    for (heading_text,) in heading_rows:
        print(heading_text)

def scrape_proxyway_descriptions():
    """Collect post-excerpt texts from proxyway.com/news.

    Looks at every <div> whose ``data-widget_type`` attribute equals
    ``theme-post-excerpt.default`` and takes the stripped text of its first
    nested ``elementor-widget-container`` <div>, skipping widgets without
    one. The texts are printed and then written to ``keterangan.csv`` (one
    per row, no header). Does nothing when the page could not be fetched.
    """
    page = fetch_content('https://proxyway.com/news')
    if not page:
        return

    document = BeautifulSoup(page, 'html.parser')
    excerpt_widgets = document.find_all(
        'div', attrs={'data-widget_type': 'theme-post-excerpt.default'}
    )
    containers = (
        widget.find('div', class_='elementor-widget-container')
        for widget in excerpt_widgets
    )
    excerpts = [
        container.get_text(strip=True) for container in containers if container
    ]

    for excerpt in excerpts:
        print(excerpt)
    save_to_csv([[excerpt] for excerpt in excerpts], 'keterangan.csv')

if __name__ == "__main__":
    # Run every scraper once, in a fixed order.
    scrapers = (
        scrape_vokasi_uns,             # print raw content of vokasi.uns.ac.id
        scrape_bukalapak,              # print <title> details of bukalapak.com
        scrape_proxyway_images,        # images from proxyway.com/news -> CSV
        scrape_proxyway_subtitles,     # <h2> headings from proxyway.com/news -> CSV
        scrape_proxyway_descriptions,  # post excerpts from proxyway.com/news -> CSV
    )
    for run_scraper in scrapers:
        run_scraper()


Failed to retrieve https://vokasi.uns.ac.id/. Status code: 429
<title>Situs Belanja Online dan Jual Beli Mudah Terpercaya | Bukalapak</title>
title
head
['https://proxyway.com/wp-content/uploads/2023/04/proxyway.svg?ver=1681290142', 'Proxyway']
['https://proxyway.com/wp-content/uploads/2023/04/Newsfeed-image-min.png', 'Adam sitting in a chair reading a newspaper']
['https://secure.gravatar.com/avatar/40de7ce27e119cfc9a04eef5e77cc6c2?s=96&r=g', 'Adam Dubois']
['https://proxyway.com/wp-content/uploads/2020/07/oxylabs-logo-png.png?ver=1704718753', 'Oxylabs logo']
['https://secure.gravatar.com/avatar/40de7ce27e119cfc9a04eef5e77cc6c2?s=96&r=g', 'Adam Dubois']
['https://proxyway.com/wp-content/uploads/2022/05/bright-data-logo.png?ver=1704718964', 'Bright Data logo']
['https://secure.gravatar.com/avatar/40de7ce27e119cfc9a04eef5e77cc6c2?s=96&r=g', 'Adam Dubois']
['https://proxyway.com/wp-content/uploads/2022/07/rayobyte-logo.png?ver=1704718347', 'rayobyte logo']
['https://secure.gravatar.com/a