In [1]:
import requests

In [2]:
# Fetch the landing page of the scraping sandbox. The explicit timeout keeps the
# cell from blocking forever if the server never answers (requests has no default timeout).
response = requests.get("https://books.toscrape.com/", timeout=10)

In [3]:
# HTTP status code the server returned for the request
response.status_code

200

| Code | Meaning                      |
| ---- | ---------------------------- |
| 200  | OK                           |
| 301  | Moved Permanently (redirect) |
| 403  | Forbidden                    |
| 404  | Not Found                    |
| 500  | Server Error                 |


In [4]:
# Dump every response header, one "name: value" pair per line
for header_name, header_value in response.headers.items():
    print(header_name, header_value, sep=": ")

Date: Thu, 03 Jul 2025 07:24:07 GMT
Content-Type: text/html
Content-Length: 51294
Connection: keep-alive
Last-Modified: Wed, 08 Feb 2023 21:02:32 GMT
ETag: "63e40de8-c85e"
Accept-Ranges: bytes
Strict-Transport-Security: max-age=0; includeSubDomains; preload


Notice that the Content-Type is text/html: this site serves an HTML page rather than structured data as an API would, so calling `response.json()` will raise an error.

In [5]:
from json import JSONDecodeError

# The body is HTML, so decoding it as JSON fails.
# requests documents requests.exceptions.JSONDecodeError (requests >= 2.27) as the
# error raised by .json(); it wraps the errors of whichever JSON backend is in use,
# which may not derive from the standard-library json.JSONDecodeError.
# Catching both keeps the demo working regardless of the backend.
try:
    response.json()
except (requests.exceptions.JSONDecodeError, JSONDecodeError) as e:
    print(e)

Expecting value: line 1 column 1 (char 0)


In [6]:
print(response.text)  # Prints the page's raw HTML, similar to what a browser's inspect / view-source tool shows

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
    <head>
        <title>
    All products | Books to Scrape - Sandbox
</title>

        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
        <meta name="created" content="24th Jun 2016 09:29" />
        <meta name="description" content="" />
        <meta name="viewport" content="width=device-width" />
        <meta name="robots" content="NOARCHIVE,NOCACHE" />

        <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
        <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->

        
            <link rel="shortcut icon" href="static/oscar/favicon.

Let's use the requests library to get some data from an API.

In [7]:
url = "https://api.agify.io/?name=michael"

# The timeout prevents the cell from hanging indefinitely if the API does not respond.
response = requests.get(url, timeout=10)

# Only parse the body as JSON when the request succeeded
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print("Failed to retrieve data", response.status_code)

{'count': 298219, 'name': 'michael', 'age': 64}


In [8]:
# Raw response body as a string, i.e. the JSON text before it is parsed
response.text

'{"count":298219,"name":"michael","age":64}'

In [9]:
# Show the API response headers as "name: value" lines
for header in response.headers.items():
    print("{}: {}".format(*header))

Server: nginx/1.16.1
Date: Thu, 03 Jul 2025 07:24:08 GMT
Content-Type: application/json; charset=utf-8
Content-Length: 60
Connection: keep-alive
vary: accept-encoding
content-encoding: gzip
cache-control: max-age=0, private, must-revalidate
x-request-id: GE6s7z2hLQR_DB2Za1zB
access-control-allow-credentials: true
access-control-allow-origin: *
access-control-expose-headers: x-rate-limit-limit,x-rate-limit-remaining,x-rate-limit-reset
x-rate-limit-limit: 100
x-rate-limit-remaining: 98
x-rate-limit-reset: 59752


In [10]:
import os
import random
from pathlib import Path
from urllib.parse import urlencode
from dotenv import load_dotenv
from icecream import ic

Let's fetch some fake browser headers from headers.scrapeops.io/v1/browser-headers using our API key.

In [11]:
# Look for a "configs" directory, starting at the parent of the working directory
# and climbing at most MAX_LEVELS directories. Slicing the ancestor list bounds the
# walk and naturally stops at the filesystem root. (A Path never compares equal to
# the str returned by `.root`, so that comparison cannot serve as a stop condition.)
MAX_LEVELS = 5
start_dir = Path().resolve().parent

for parent_dir in [start_dir, *start_dir.parents][:MAX_LEVELS]:
    config_dir = parent_dir / "configs"
    if config_dir.exists():
        # override=True lets values from scraper.env replace variables already set
        load_dotenv(dotenv_path=config_dir / "scraper.env", override=True)
        break
else:
    # Reached only when no candidate directory contained "configs"
    raise FileNotFoundError("Config directory not found")


# Fail early with a clear message instead of sending a placeholder key to the API
api_key = os.environ.get("SCRAPEOPS_API_KEY")
if not api_key:
    raise RuntimeError("SCRAPEOPS_API_KEY is not set; define it in configs/scraper.env")

response = requests.get(
    url='https://headers.scrapeops.io/v1/browser-headers',
    params={'api_key': api_key},
    timeout=10,  # do not block forever if the service is unreachable
)
response.raise_for_status()  # surface HTTP error statuses before parsing the body

# Parse the body once and reuse the parsed list below
result = response.json()["result"]
print('Result: ', "\n\n".join(map(str, result[0:2])), end="\n\n")

# Each entry is a full header set; we can keep only its user-agent if we want
user_agents_list = [d.get("user-agent", "Missing-UA") for d in result]
print(user_agents_list)

Result:  {'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'sec-fetch-site': 'same-site', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1', 'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-US'}

{'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'accept': 'text/html,application/xhtml

According to the ScrapeOps documentation, though, we can request only the user agents directly.

In [12]:
# Ask ScrapeOps for user-agent strings only, using the `api_key` loaded earlier
response = requests.get(
    url='https://headers.scrapeops.io/v1/user-agents',
    params={'api_key': api_key},
    timeout=10,  # do not block forever if the service is unreachable
)
# Surface HTTP error statuses here rather than failing later on a missing "result" key
response.raise_for_status()
result = response.json()["result"]
print(result)

['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', 'Mozilla/5.0 (iPad; CPU OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.154 Mobile/15E148 Safari/604.1', 'Mozilla/5.0 (Macintosh; Intel Mac O

In [13]:
# Draw two user agents at random (with replacement) and show one per line
sampled_agents = [random.choice(result) for _ in range(2)]
print(*sampled_agents, sep="\n")

Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36


The user agents and browser headers will be used in the bookscraper project, where we will set up a middleware that rotates our request information so that our requests are less likely to be flagged by the website.

Finally, if we want to build the URL string ourselves, the best practice is to use `urllib.parse.urlencode`. For example, it takes care of:

1) encoding spaces as `+`
2) joining the query parameters with `&`

In [14]:
# Build the query string from a dict rather than concatenating it by hand
base_url = "https://example.com/search"
params = dict(api_key="hjgsdasg2635871", name="John Doe", age=19)
base_url + "?" + urlencode(params)

'https://example.com/search?api_key=hjgsdasg2635871&name=John+Doe&age=19'