# Webscraping

In [10]:
# Import necessary libraries
import requests # make HTTP requests to fetch web pages content
from bs4 import BeautifulSoup # parse HTML and XML docs for easier data extraction
import csv # write scraped data into CSV file
import time # introduce delays between requests to avoid server overload
import re #regular expression
import random # vary time delays to simulate human-like behaviour
import psycopg2 #py package to interact with PostgreSQL
from psycopg2 import sql
import logging
import json

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jumia_scraping.log'),
        logging.StreamHandler()  # Also show logs in console
    ]
)

In [11]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jumia_scraping.log'),
        logging.StreamHandler()  # Also show logs in console
    ]
)

**Webscraping Page: Jumia**

_Products to scrape:_  
1. Tvs 
    * Smart
    * Digital
2. Cookers

In [12]:
# request url
url1 = 'https://www.jumia.co.ke/televisions/#catalog-listing' # TVs url
url2 = 'https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing' # Cookers url

# User-Agent headers for automating requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


Check url status

In [13]:
# Sending GET requests to each URL and check response
def check_response(urls, headers):
    # Output the status code of each response
    for url in urls:
        try:
            response =requests.get(url, headers=headers)
            print(f"url: {response.url} - status: { response.status_code}")

            if response.status_code != 200:
                print(f"Failed to retrieve page. Status code: {response.status_code}")
                return 1  # Return 1 if the request fails
        except requests.exceptions.RequestException as e:
            # Handle GET request exceptions (timeout, connection/http errors etc.)
            print(f"An error occured: {e}")
            return 1  # Return 1 if an error occurs during the request
        
    return 0  # Return 0 if no error

# List of urls
urls = [url1, url2]
responses = check_response(urls, headers=headers) # Check all url GET requests responses

# Check result
if responses == 0:
    print("All pages retrieved successfully.")
else:
    print("Some pages failed to load.")


url: https://www.jumia.co.ke/televisions/#catalog-listing - status: 200
url: https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing - status: 200
All pages retrieved successfully.


Considering that the webpages are paginated, there is need to navigatethrough, and retrieve data from each page.
The last page in the page numbers is identified, then all pages are iterated.

The pagination URL parameters...

In [14]:
def scrape_product_details(url, headers):
    products = []
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the catalog listing div
        catalog_listing = soup.find('div', {'id': 'catalog-listing'})
                
        if catalog_listing:
                
            products = []
            for product in catalog_listing:
                try:
                    # Find the core link which contains most product data
                    core_link = product.find('a', {'class': 'core'})
                    if not core_link:
                        continue

                    # Extract data from data-ga4 attributes which contain the most reliable info
                    item_id = core_link.get('data-ga4-item_id', "N/A")
                    name = core_link.get('data-ga4-item_name', "N/A")
                    brand = core_link.get('data-ga4-item_brand', "N/A")
                    price = core_link.get('data-ga4-price', "N/A")
                    discount = core_link.get('data-ga4-discount', "N/A")
                    category = core_link.get('data-ga4-item_category', "N/A")
                    subcategory = core_link.get('data-ga4-item_category2', "N/A")
                
                    # Get product URL
                    product_url = core_link.get('href')
                    if product_url:
                        product_url = f"https://www.jumia.co.ke{product_url}"
                    else:
                        product_url = "N/A"

                    # Extract displayed price and old price
                    price_elem = product.find('div', {'class': 'prc'})
                    displayed_price = price_elem.text.strip() if price_elem else "N/A"
                    
                    old_price_elem = product.find('div', {'class': 'old'})
                    old_price = old_price_elem.text.strip() if old_price_elem else "N/A"

                    # Extract rating information
                    stars_elem = product.find('div', {'class': 'stars _s'})
                    reviews_count = "0"
                    rating = "N/A"
                    
                    if stars_elem:
                        rating_text = stars_elem.text.strip()
                        if 'out of 5' in rating_text:
                            rating = rating_text.split('out of 5')[0].strip()
                        
                        # Find review count
                        rev_elem = product.find('div', {'class': 'rev'})
                        if rev_elem:
                            reviews_text = rev_elem.text
                            reviews_count = reviews_text.strip('()') if '(' in reviews_text else "0"

                    # Check if it's from official store
                    is_official = bool(product.find('div', {'class': 'bdg _mall _xs'}))

                    # Check if express shipping is available
                    has_express = bool(product.find('svg', {'class': 'ic xprss'}))

                    if item_id != "N/A":  # Only add products with valid IDs
                        product_info = {
                            "name": name,
                            "price": displayed_price,
                            "price": price,  # Numerical price value
                            "old_price": old_price,
                            "discount": discount,
                            "rating": rating,
                            "reviews": reviews_count,
                            "product_id": item_id,
                            "brand": brand,
                            "url": product_url,
                            #"size": product_size
                            "category": category,
                            "subcategory": subcategory,
                            "official_store": is_official,
                            "express_shipping": has_express
                        }
                    products.append(product_info)
                except Exception as e:
                    logging.warning(f"Error extracting product details: {str(e)}")
                    continue
                
        logging.info(f"Successfully extracted {len(products)} valid products")
        
        # Optional delay between requests
        time.sleep(random.uniform(1, 3))  # Random delay between 1 and 3 seconds

        return products
        
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed for {url}: {str(e)}")
        return []
    except Exception as e:
        logging.error(f"Unexpected error scraping {url}: {str(e)}")
        return []

In [15]:
# Define function to get the last page number
def get_last_page_number(urls, headers):
    """
    get the last page number in a catalog listing for pagination iteration

    args:
        urls: webpage URL or URLs
        headers: User-Agent headers
    """
    # Parse webpage content with BeautifulSoup
    #for url in urls:
    response =requests.get(urls, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find link for last page using its aria-label attribute
    last_page_link = soup.find('a', attrs={'aria-label': 'Last Page'})

    if last_page_link and 'href' in last_page_link.attrs:
        last_page_url =last_page_link['href']
        try:
            page_number = last_page_url.split('?page=')[1].split('#')[0]
            return int(page_number)
        except Exception as e:
            print(f"Error extracting last page number: {e}")
            return 1  # Default to 1 if error occurs


In [16]:
def save_to_postgresql(products, db_params, table_name):
    #conn = None
    #cursor = None
    try:
        # Connect to PostgreSQL database
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        
        logging.info(f"Attempting to insert {len(products)} products into {table_name}")
        
        # Create table if it doesn't exist
        create_table_query = f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                id SERIAL PRIMARY KEY,
                product_name TEXT,
                product_id TEXT UNIQUE,
                price TEXT,
                old_price TEXT,
                discount TEXT,
                brand TEXT,
                rating TEXT,
                reviews TEXT,
                category TEXT,
                subcategory TEXT,
                official_store BOOLEAN,
                express_shipping BOOLEAN,              
                url TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        cursor.execute(create_table_query)
        conn.commit()
        print(f"Table {table_name} created or verified")
        
        print(f"Connected successfully. Inserting {len(products)} products...")
        
        # Insert data into the specified table
        insert_query = f"""
        INSERT INTO {table_name} (product_name, price, old_price, discount, rating, item_id, item_brand, product_url)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """
        
        # Prepare data for insertion
        insert_data = []
        for product in products:
            # Format data to match schema exactly
            row = (
                product["name"],
                product["price"],
                product["old_price"],
                product["discount"],
                #f"{product['rating']} ({product['reviews_count']} reviews)",
                product["rating"],
                product["reviews"],
                product["product_id"],
                product["brand"],
                #product["size"],
                #product["type"],
                #product["source"]
                product["category"],
                product["subcategory"],
                product["url"]
            )
            insert_data.append(row)
        
        
    except Exception as e:
        logging.error(f"Database error while processing {table_name}: {str(e)}")
        if conn:
            conn.rollback()
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        
               
        # Verify final count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        final_count = cursor.fetchone()[0]
        print(f"\nFinal count in {table_name}: {final_count}")
        
        # Show sample of inserted data
        print("\nVerifying inserted data...")
        cursor.execute(f"SELECT * FROM {table_name} LIMIT 1")
        sample = cursor.fetchone()
        if sample:
            print("Sample record:", sample)
        else:
            print("No records found in database!")

            

In [17]:
def scrape_and_save(urls, headers, db_params, table_name):
    print(f"\nStarting scrape for {table_name}")
    
    try:
        # Get last page number
        last_page = get_last_page_number(urls, headers)
        print(f"Found {last_page} pages to scrape")
        
        # Initialize products list
        products = []
        
        # Scrape each page
        for page_num in range(1, last_page + 1):
            page_url = f"{urls}?page={page_num}#catalog-listing"
            print(f"Scraping page {page_num}/{last_page}")
            
            # Scrape products from current page
            page_products = scrape_product_details(page_url, headers)
            print(f"Found {len(page_products)} products on page {page_num}")
            products.extend(page_products)
            
            # Random delay
            time.sleep(random.uniform(1, 3))
        
        print(f"Total products scraped: {len(products)}")
        
        # Save products to database
        if products:
            print(f"\nFirst product data (sample):")
            first_product = products[0]
            for key, value in first_product.items():
                print(f"{key}: {value}")
                
            print(f"\nAttempting to save {len(products)} products to {table_name}")
            save_to_postgresql(products, db_params, table_name)
        else:
            print("No products found to save!")
            
    except Exception as e:
        print(f"Error in scrape_and_save: {str(e)}")
        raise
        
        # URLs
        url1 = "https://www.jumia.co.ke/televisions/"
        url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/"
        
        # Database connection parameters
        db_params = {
            "host": "localhost",     # Database host
            "database": "e-analytics_db",   # Database name
            "user": "postgres",     # Database user
            "password": "password"  # Database password
        }
        # Scrape and save data for both URLs
        scrape_and_save(url1, headers, db_params, "jumia_televisions")
        scrape_and_save(url2, headers, db_params, "jumia_cookers")
        
    except Exception as e:
        print(f"Main execution error: {str(e)}")

In [None]:
# Function to fetch and scrape product details from a single page
def scrape_product_details(url, headers):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    catalog_divs = soup.find_all('div', attrs={'data-catalog': 'true'})

    products = []
    for catalog_div in catalog_divs:
        product = catalog_div.find('a', class_='core')
        
        if product:
            name = product.find('h3', class_='name').get_text(strip=True) if product.find('h3', class_='name') else "N/A"
            price = product.find('div', class_='prc').get_text(strip=True) if product.find('div', class_='prc') else "N/A"
            old_price = product.find('div', class_='old').get_text(strip=True) if product.find('div', class_='old') else "N/A"
            discount = product.find('div', class_='bdg _dsct _sm').get_text(strip=True) if product.find('div', class_='bdg _dsct _sm') else "N/A"
            
            rating = product.find('div', class_='rev')
            if rating:
                stars = rating.find('div', class_='stars _s').get_text(strip=True) if rating.find('div', class_='stars _s') else "N/A"
                reviews_count = rating.get_text(strip=True).split('(')[-1].strip(')') if '(' in rating.get_text() else "N/A"
            else:
                stars = "N/A"
                reviews_count = "N/A"

            item_id = product.get('data-gtm-id', "N/A")
            item_brand = product.get('data-gtm-brand', "N/A")
            product_url = product['href'] if product.has_attr('href') else "N/A"

            # Store product info in a dictionary
            products.append({
                "product_name": name,
                "price": price,
                "old_price": old_price,
                "discount": discount,
                "rating": f"{stars} ({reviews_count} reviews)",
                "item_id": item_id,
                "item_brand": item_brand,
                "product_url": f"https://www.jumia.co.ke{product_url}"
            })
        
        # Optional delay between requests
        time.sleep(random.uniform(1, 3))  # Random delay between 1 and 3 seconds

    return products

# Function to get the last page number
def get_last_page_number(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the last page link
        last_page_link = soup.find('a', {'aria-label': 'Last Page'})
        
        if last_page_link:
            last_page_url = last_page_link.get('href', '')
            
            # Extract the page number from the URL
            page_number_str = last_page_url.split('=')[-1]
            page_number_str = page_number_str.split('#')[0]  # Remove any fragment identifier (#catalog-listing)
            
            page_number = int(page_number_str) if page_number_str.isdigit() else 1
            return page_number
        else:
            logging.warning(f"Last page link not found for {url}. Defaulting to page 1.")
            return 1
    except Exception as e:
        logging.error(f"Error fetching last page number for {url}: {str(e)}")
        return 1

# Function to save data to PostgreSQL
def save_to_postgresql(products, db_params, table_name):
    try:
        # Connect to your PostgreSQL database
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()

        # Insert data into the specified table
        insert_query = f"""
        INSERT INTO {table_name} (product_name, price, old_price, discount, rating, item_id, item_brand, product_url)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        for product in products:
            cursor.execute(insert_query, (
                product["product_name"],
                product["price"],
                product["old_price"],
                product["discount"],
                product["rating"],
                product["item_id"],
                product["item_brand"],
                product["product_url"]
            ))

        # Commit the transaction
        conn.commit()

        print(f"Successfully inserted {len(products)} products into the {table_name} table.")

    except Exception as e:
        print(f"Error occurred: {e}")
        conn.rollback()

    finally:
        cursor.close()
        conn.close()

# Function to scrape and save data for a specific URL and table
def scrape_and_save(url, headers, db_params, table_name):
    print(f"Scraping URL: {url}")
    
    # Get the last page number for the current URL
    last_page = get_last_page_number(url, headers)

    # Initialize an empty list to store all products
    products_list = []

    # Iterate through all pages from 1 to the last page
    for page_num in range(1, last_page + 1):
        page_url = f"{url}?page={page_num}#catalog-listing"
        print(f"Scraping page {page_num} from {url}...")

        # Scrape the products from the current page
        products = scrape_product_details(page_url, headers)
        products_list.extend(products)  # Add the scraped products to the main list

        # Sleep for a random time between requests to avoid overwhelming the server
        time.sleep(random.uniform(1, 3))

    # Save the products to PostgreSQL after scraping all pages for this URL
    if products_list:
        #save_to_postgresql(products_list, db_params, table_name)
        print(f"Saved {len(products_list)} products from {url} to {table_name} table.")
    else:
        print(f"No products found on {url}.")

# Define your headers and the URLs to scrape
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Define url1 and url2
url1 = "https://www.jumia.co.ke/televisions/#catalog-listing"
#url2 = "https://www.jumia.co.ke/cookers/#catalog-listing" 
url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing"
# Database connection parameters
db_params = {
    "host": "localhost",     # Database host
    "database": "e-analytics_db",   # Database name
    "user": "postgres",     # Database user
    "password": "password"  # Database password
}

# Scrape and save data for both URLs
scrape_and_save(url1, headers, db_params, "jumia_televisions")
scrape_and_save(url2, headers, db_params, "jumia_cookers")


Scraping URL: https://www.jumia.co.ke/televisions/#catalog-listing
Scraping page 1 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 2 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 3 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 4 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 5 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 6 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 7 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 8 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 9 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 10 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 11 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scraping page 12 from https://www.jumia.co.ke/televisions/#catalog-listing...
Scrapi