# Webscraping

In [1]:
# Import necessary libraries
import requests # make HTTP requests to fetch web pages content
from bs4 import BeautifulSoup # parse HTML and XML docs for easier data extraction
import csv # write scraped data into CSV file
import time # introduce delays between requests to avoid server overload
import re #regular expression
import random # vary time delays to simulate human-like behaviour
import psycopg2 #py package to interact with PostgreSQL
from psycopg2 import sql
import os
from dotenv import load_dotenv

**Webscraping Page: Jumia**

_Products to scrape:_  
1. TVs
    * Smart
    * Digital
2. Cookers

In [28]:
# Jumia catalog listings to scrape
url1 = "https://www.jumia.co.ke/televisions/#catalog-listing"  # televisions listing
url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing"  # cookers listing

# HTTP headers passed along with the requests: a desktop-browser User-Agent string
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
}


Check URL status: confirm that each catalog URL responds with HTTP 200 before scraping.

In [29]:
# Sending GET requests to each URL and check response
def check_response(urls, headers, timeout=10):
    """
    Send a GET request to every URL and print the HTTP status of each one.

    Every URL is checked even when an earlier one fails, so the printed
    report covers the whole list.

    args:
        urls: iterable of URL strings to request
        headers: dict of HTTP headers passed to each request
        timeout: seconds to wait for the server before the request is
            abandoned; without it a stalled server would block forever

    returns:
        0 if every URL answered with status 200, otherwise 1
    """
    failed = False

    for url in urls:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Handle GET request exceptions (timeout, connection/http errors etc.)
            print(f"An error occured: {e}")
            failed = True
            continue

        print(f"url: {response.url} - status: {response.status_code}")

        if response.status_code != 200:
            print(f"Failed to retrieve page. Status code: {response.status_code}")
            failed = True

    return 1 if failed else 0

# URLs whose availability is verified before scraping
urls = [url1, url2]
responses = check_response(urls, headers=headers)  # 0 = no failure, 1 = a request failed

# Summarise the outcome
if responses != 0:
    print("Some pages failed to load.")
else:
    print("All pages retrieved successfully.")


url: https://www.jumia.co.ke/televisions/#catalog-listing - status: 200
url: https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing - status: 200
All pages retrieved successfully.


Because the catalog listings are paginated, the scraper has to navigate through every page and retrieve the data from each one.
The last page number is identified first, and then all pages from 1 up to that number are iterated.

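Pagination is controlled by the `page` query parameter of the listing URL, e.g. `https://www.jumia.co.ke/televisions/?page=2#catalog-listing`.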

In [30]:
# Define function to get the last page number
def get_last_page_number(urls, headers):
    """
    Get the last page number in a catalog listing for pagination iteration.

    args:
        urls: URL of the catalog listing page to inspect (a single URL
            string, despite the plural parameter name)
        headers: User-Agent headers sent with the request

    returns:
        The page number parsed from the 'Last Page' link, or 1 when the link
        is missing or its href does not contain a parsable '?page=' value.
    """
    # Fetch the listing page and parse its content with BeautifulSoup
    response = requests.get(urls, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find link for last page using its aria-label attribute
    last_page_link = soup.find('a', attrs={'aria-label': 'Last Page'})

    if not (last_page_link and 'href' in last_page_link.attrs):
        # Always return an int so callers can build a page range from the result
        print("Last page link not found.")
        return 1

    last_page_url = last_page_link['href']
    try:
        # href looks like '...?page=<N>#catalog-listing'; keep only <N>
        page_number = last_page_url.split('?page=')[1].split('#')[0]
        return int(page_number)
    except (IndexError, ValueError) as e:
        print(f"Error extracting last page number: {e}")
        return 1  # Default to 1 if error occurs


In [7]:
# Function to fetch and scrape product details from a single page
def scrape_product_details(url, headers):
    """
    Fetch one catalog page and extract the details of every product on it.

    The page is downloaded with a single request; the per-product loop only
    parses the HTML already in memory, so it does not sleep between products.

    args:
        url: URL of the catalog page to scrape
        headers: dict of HTTP headers sent with the request

    returns:
        A list of dicts, one per product, with the keys product_name, price,
        old_price, discount, rating, item_id, item_brand and product_url.
        Values that are absent from the page are reported as "N/A".
    """
    from urllib.parse import urljoin

    def text_or_na(parent, tag, css_class):
        # Stripped text of the first matching child, or "N/A" when it is absent
        element = parent.find(tag, class_=css_class)
        return element.get_text(strip=True) if element else "N/A"

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    catalog_divs = soup.find_all('div', attrs={'data-catalog': 'true'})

    products = []
    for catalog_div in catalog_divs:
        product = catalog_div.find('a', class_='core')
        if not product:
            continue

        # Rating block: star text plus a review count written in parentheses
        stars = "N/A"
        reviews_count = "N/A"
        rating = product.find('div', class_='rev')
        if rating:
            stars = text_or_na(rating, 'div', 'stars _s')
            rating_text = rating.get_text(strip=True)
            if '(' in rating_text:
                reviews_count = rating_text.split('(')[-1].strip(')')

        # Only build an absolute link when the anchor really carries an href;
        # otherwise report "N/A" instead of gluing it onto the site root
        href = product['href'] if product.has_attr('href') else None
        product_url = urljoin("https://www.jumia.co.ke", href) if href else "N/A"

        # Store product info in a dictionary
        products.append({
            "product_name": text_or_na(product, 'h3', 'name'),
            "price": text_or_na(product, 'div', 'prc'),
            "old_price": text_or_na(product, 'div', 'old'),
            "discount": text_or_na(product, 'div', 'bdg _dsct _sm'),
            "rating": f"{stars} ({reviews_count} reviews)",
            "item_id": product.get('data-gtm-id', "N/A"),
            "item_brand": product.get('data-gtm-brand', "N/A"),
            "product_url": product_url
        })

    return products

# Function to get the last page number from the website using the 'Last Page' link
def get_last_page_number(url, headers):
    """
    Return the number of the final page of a paginated catalog listing.

    args:
        url: URL of the listing page to request
        headers: dict of HTTP headers sent with the request

    returns:
        The integer found after '?page=' in the href of the link labelled
        'Last Page'; 1 when that link is absent or the number cannot be
        extracted.
    """
    html = requests.get(url, headers=headers).text
    anchor = BeautifulSoup(html, 'html.parser').find('a', attrs={'aria-label': 'Last Page'})

    # Guard clause: without a usable 'Last Page' link assume a single page
    if not (anchor and 'href' in anchor.attrs):
        print("Last page link not found.")
        return 1

    href = anchor['href']
    try:
        after_marker = href.split('?page=')[1]
        return int(after_marker.split('#')[0])
    except Exception as err:
        print(f"Error extracting last page number: {err}")
        return 1

# Function to save data to PostgreSQL
def save_to_postgresql(products, db_params, table_name):
    """
    Insert scraped products into a PostgreSQL table in a single transaction.

    Errors are printed rather than raised; on failure the transaction is
    rolled back so no partial batch is stored.

    args:
        products: list of dicts with the keys product_name, price, old_price,
            discount, rating, item_id, item_brand and product_url
        db_params: keyword arguments forwarded to psycopg2.connect
        table_name: target table; may be schema-qualified ("schema.table").
            It is quoted as an SQL identifier, so it is matched
            case-sensitively.
    """
    columns = (
        "product_name", "price", "old_price", "discount",
        "rating", "item_id", "item_brand", "product_url",
    )

    # Bound before the try so the handlers below can tell whether the
    # connection was ever opened (connect itself may be what fails)
    conn = None
    try:
        # Connect to your PostgreSQL database
        conn = psycopg2.connect(**db_params)

        # Compose the table name as a quoted identifier instead of formatting
        # it into the SQL text; row values stay bound through %s placeholders
        insert_query = sql.SQL(
            "INSERT INTO {table} "
            "(product_name, price, old_price, discount, rating, item_id, item_brand, product_url) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        ).format(table=sql.Identifier(*table_name.split('.')))

        with conn.cursor() as cursor:
            for product in products:
                cursor.execute(insert_query, tuple(product[column] for column in columns))

        # Commit the transaction
        conn.commit()

        print(f"Successfully inserted {len(products)} products into the {table_name} table.")

    except Exception as e:
        print(f"Error occurred: {e}")
        if conn is not None:
            conn.rollback()

    finally:
        if conn is not None:
            conn.close()

# Function to scrape and save data for a specific URL and table
def scrape_and_save(url, headers, db_params, table_name):
    """
    Scrape every page of one catalog listing and report how many products were found.

    args:
        url: URL of the first page of the listing; it may already carry a
            '#fragment' and/or a query string
        headers: dict of HTTP headers sent with each request
        db_params: PostgreSQL connection parameters (unused while the save
            step below is disabled)
        table_name: name of the table the products are meant for
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    print(f"Scraping URL: {url}")

    # Get the last page number for the current URL
    last_page = get_last_page_number(url, headers)

    # The page number has to go into the query string, which sits BEFORE the
    # '#fragment'. Appending '?page=N' to a URL that already ends in
    # '#catalog-listing' would hide it inside the fragment, which is never
    # sent to the server, so page 1 would be fetched over and over.
    parts = urlsplit(url)
    other_params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "page"
    ]
    fragment = parts.fragment or "catalog-listing"

    # Initialize an empty list to store all products
    products_list = []

    # Iterate through all pages from 1 to the last page
    for page_num in range(1, last_page + 1):
        query = urlencode(other_params + [("page", page_num)])
        page_url = urlunsplit((parts.scheme, parts.netloc, parts.path, query, fragment))
        print(f"Scraping page {page_num} from {url}...")

        # Scrape the products from the current page
        products = scrape_product_details(page_url, headers)
        products_list.extend(products)  # Add the scraped products to the main list

        # Sleep for a random time between requests to avoid overwhelming the
        # server; nothing follows the last page, so no wait is needed there
        if page_num < last_page:
            time.sleep(random.uniform(1, 3))

    if products_list:
        # NOTE(review): saving is switched off; re-enable the call below to
        # store the products in PostgreSQL.
        #save_to_postgresql(products_list, db_params, table_name)
        print(f"Scraped {len(products_list)} products from {url}; saving to the {table_name} table is currently disabled.")
    else:
        print(f"No products found on {url}.")

# Define your headers and the URLs to scrape
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Define url1 and url2
url1 = "https://www.jumia.co.ke/televisions/#catalog-listing"
url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing"

# Database connection parameters.
# Values are read from environment variables (optionally supplied through a
# local .env file) so that credentials are not stored in the notebook.
# The variable names DB_HOST, DB_NAME, DB_USER and DB_PASSWORD are this
# notebook's own convention; the non-secret settings keep their previous
# values as defaults, while the password has no default and must be provided.
load_dotenv()
db_params = {
    "host": os.getenv("DB_HOST", "localhost"),          # Database host
    "database": os.getenv("DB_NAME", "e-analytics_db"),  # Database name
    "user": os.getenv("DB_USER", "postgres"),           # Database user
    "password": os.getenv("DB_PASSWORD")                # Database password
}

# Scrape and save data for both URLs
scrape_and_save(url1, headers, db_params, "jumia_televisions")
scrape_and_save(url2, headers, db_params, "jumia_cookers")


Scraping URL: https://www.jumia.co.ke/televisions/#catalog-listing
Scraping page 1 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 2 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 3 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 4 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 5 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 6 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 7 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 8 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 9 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 10 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 11 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 12 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scrapi