Mengirim Permintaan HTTP dengan Requests

In [9]:
import requests

# Send an HTTP GET request to the dataset page.
url = 'https://www.kaggle.com/datasets/zeesolver/spotfy'
# An explicit timeout is passed because requests does not time out by
# default; without it the cell could hang forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Report the HTTP status code returned by the server
print(f'Status Kode: {response.status_code}')

# Show the raw HTML body of the page
print(response.text)


Status Kode: 200


<!DOCTYPE html>
<html lang="en">

<head>
  <title>Spotify Songs Album | Kaggle</title>
  <meta charset="utf-8" />
    <meta name="robots" content="index, follow" />
  <meta name="description" content="A Sonic Journey Through Melodies" />
    <meta name="keywords" content="arts and entertainment,music,data visualization,exploratory data analysis,websites" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=5.0, minimum-scale=1.0">
  <meta name="theme-color" content="#008ABC" />
  <script nonce="aMshxvxutkYMoNULdLHCLA==" type="text/javascript">
    window["pageRequestStartTime"] = 1716477628724;
    window["pageRequestEndTime"] = 1716477628797;
    window["initialPageLoadStartTime"] = new Date().getTime();
  </script>
  <script nonce="aMshxvxutkYMoNULdLHCLA==" id="gsi-client" src="https://accounts.google.com/gsi/client" async defer></script>
  <script nonce="aMshxvxutkYMoNULdLHCLA==">window.KAGGLE_JUPYTERLAB_PATH = "/static/assets

Mem-parsing HTML dengan BeautifulSoup dan Menangani Permintaan yang Gagal


In [10]:
from bs4 import BeautifulSoup

# Start from a known state so that a failed request cannot leave `soup`
# undefined (fresh kernel) or stale (left over from an earlier run).
soup = None

try:
    # Explicit timeout: requests does not time out by default.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
except requests.exceptions.HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except Exception as err:
    print(f'Other error occurred: {err}')
else:
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Continue with parsing and data extraction


# Show the parsed HTML only when the request succeeded; after a failure
# `soup` is still None and the error has already been reported above.
if soup is not None:
    print(soup.prettify())


<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Spotify Songs Album | Kaggle
  </title>
  <meta charset="utf-8"/>
  <meta content="index, follow" name="robots"/>
  <meta content="A Sonic Journey Through Melodies" name="description"/>
  <meta content="arts and entertainment,music,data visualization,exploratory data analysis,websites" name="keywords"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=5.0, minimum-scale=1.0" name="viewport"/>
  <meta content="#008ABC" name="theme-color">
   <script nonce="G7BHfFCVvY0ByoFbeT16mQ==" type="text/javascript">
    window["pageRequestStartTime"] = 1716477634222;
    window["pageRequestEndTime"] = 1716477634327;
    window["initialPageLoadStartTime"] = new Date().getTime();
   </script>
   <script async="" defer="" id="gsi-client" nonce="G7BHfFCVvY0ByoFbeT16mQ==" src="https://accounts.google.com/gsi/client">
   </script>
   <script nonce="G7BHfFCVvY0ByoFbeT16mQ==">
    window.KAGGLE_JUPYTERLAB_PATH = "/static/assets/jup

Mengekstrak Data dari HTML

In [11]:
# Collect every anchor (<a>) tag from the parsed page
links = soup.find_all(name='a')

# Print the href attribute of each anchor (None when the tag has no href)
for link in links:
    print(link.get('href'))


Mengekstrak Elemen Spesifik

In [12]:
# Collect every paragraph (<p>) tag from the parsed page
paragraphs = soup.find_all(name='p')

# Print the text content of each paragraph
for para in paragraphs:
    print(para.text)


Mengekstrak Heading Spesifik

In [13]:
# Look up the first <h1> tag; find() yields None when the page has none
heading1 = soup.find(name='h1')

# Print the heading text only if an <h1> was actually found
if heading1 is not None:
    print(heading1.get_text())


Mengekstrak Elemen dengan Kelas atau ID Tertentu

In [14]:
# Collect all elements carrying the CSS class 'example-class'
elements_with_class = soup.find_all(class_='example-class')

# Print the text content of each matching element
for elem in elements_with_class:
    print(elem.text)

# Look up the element whose id attribute is 'example-id'
element_with_id = soup.find(id='example-id')

# Print its text content, skipping the case where no such element exists
if element_with_id is not None:
    print(element_with_id.text)


Menyimpan Data yang Diekstrak ke dalam File


In [15]:
# Save every non-empty link target to a text file, one URL per line.
# UTF-8 is requested explicitly: without it open() uses the platform's
# locale encoding, which can fail on non-ASCII characters in a URL.
with open('urls.txt', 'w', encoding='utf-8') as file:
    for link in links:
        href = link.get('href')
        if href:  # skip anchors with a missing or empty href
            file.write(href + '\n')
