# Webscraping

In [52]:
# Import necessary libraries
import requests # make HTTP requests to fetch web pages content
from bs4 import BeautifulSoup # parse HTML and XML docs for easier data extraction
import csv # write scraped data into CSV file
import time # introduce delays between requests to avoid server overload
import re #regular expression
import random # vary time delays to simulate human-like behaviour
import psycopg2 #py package to interact with PostgreSQL
from psycopg2 import sql
import logging
import json

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jumia_scraping.log'),
        logging.StreamHandler()  # Also show logs in console
    ]
)

In [53]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jumia_scraping.log'),
        logging.StreamHandler()  # Also show logs in console
    ]
)

**Webscraping Page: Jumia**

_Products to scrape:_  
1. Tvs 
    * Smart
    * Digital
2. Cookers

In [54]:
# request url
url1 = 'https://www.jumia.co.ke/televisions/#catalog-listing' # TVs url
url2 = 'https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing' # Cookers url

# User-Agent headers for automating requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


Check url status

In [55]:
# Sending GET requests to each URL and check response
def check_response(urls, headers):
    # Output the status code of each response
    for url in urls:
        try:
            response =requests.get(url, headers=headers)
            print(f"url: {response.url} - status: { response.status_code}")

            if response.status_code != 200:
                print(f"Failed to retrieve page. Status code: {response.status_code}")
                return 1  # Return 1 if the request fails
        except requests.exceptions.RequestException as e:
            # Handle GET request exceptions (timeout, connection/http errors etc.)
            print(f"An error occured: {e}")
            return 1  # Return 1 if an error occurs during the request
        
    return 0  # Return 0 if no error

# List of urls
urls = [url1, url2]
responses = check_response(urls, headers=headers) # Check all url GET requests responses

# Check result
if responses == 0:
    print("All pages retrieved successfully.")
else:
    print("Some pages failed to load.")


url: https://www.jumia.co.ke/televisions/#catalog-listing - status: 200
url: https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing - status: 200
All pages retrieved successfully.


Considering that the webpages are paginated, there is need to navigatethrough, and retrieve data from each page.
The last page in the page numbers is identified, then all pages are iterated.

The pagination URL parameters...

In [56]:
# Define function to get the last page number
def get_last_page_number(urls, headers):
    """
    get the last page number in a catalog listing for pagination iteration

    args:
        urls: webpage URL or URLs
        headers: User-Agent headers
    """
    # Parse webpage content with BeautifulSoup
    #for url in urls:
    response =requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find link for last page using its aria-label attribute
    last_page_link = soup.find('a', attrs={'aria-label': 'Last Page'})

    if last_page_link and 'href' in last_page_link.attrs:
        last_page_url =last_page_link['href']
        try:
            page_number = last_page_url.split('?page=')[1].split('#')[0]
            return int(page_number)
        except Exception as e:
            print(f"Error extracting last page number: {e}")
            return 1  # Default to 1 if error occurs


In [57]:
def scrape_product_details(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all product articles
        product_articles = soup.find_all('article', {'class': 'prd _fb col c-prd'})
        
        logging.info(f"Found {len(product_articles)} products on {url}")
        
        products = []
        for article in product_articles:
            try:
                # Find the core link which contains most product data
                core_link = article.find('a', {'class': 'core'})
                if not core_link:
                    continue

                # Extract data from data-ga4 attributes which contain the most reliable info
                item_id = core_link.get('data-ga4-item_id', "N/A")
                name = core_link.get('data-ga4-item_name', "N/A")
                brand = core_link.get('data-ga4-item_brand', "N/A")
                price = core_link.get('data-ga4-price', "N/A")
                discount = core_link.get('data-ga4-discount', "N/A")
                category = core_link.get('data-ga4-item_category', "N/A")
                subcategory = core_link.get('data-ga4-item_category2', "N/A")
                
                # Get product URL
                product_url = core_link.get('href')
                if product_url:
                    product_url = f"https://www.jumia.co.ke{product_url}"
                else:
                    product_url = "N/A"

                # Extract displayed price and old price
                price_elem = article.find('div', {'class': 'prc'})
                displayed_price = price_elem.text.strip() if price_elem else "N/A"
                
                old_price_elem = article.find('div', {'class': 'old'})
                old_price = old_price_elem.text.strip() if old_price_elem else "N/A"

                # Extract rating information
                stars_elem = article.find('div', {'class': 'stars _s'})
                reviews_count = "0"
                rating = "N/A"
                
                if stars_elem:
                    rating_text = stars_elem.text.strip()
                    if 'out of 5' in rating_text:
                        rating = rating_text.split('out of 5')[0].strip()
                    
                    # Find review count
                    rev_elem = article.find('div', {'class': 'rev'})
                    if rev_elem:
                        reviews_text = rev_elem.text
                        reviews_count = reviews_text.strip('()') if '(' in reviews_text else "0"

                # Check if it's from official store
                is_official = bool(article.find('div', {'class': 'bdg _mall _xs'}))

                # Check if express shipping is available
                has_express = bool(article.find('svg', {'class': 'ic xprss'}))

                if item_id != "N/A":  # Only add products with valid IDs
                    products.append({
                        "product_name": name,
                        "price": displayed_price,
                        "price_value": price,  # Numerical price value
                        "old_price": old_price,
                        "discount_percentage": discount,
                        "rating": rating,
                        "reviews_count": reviews_count,
                        "item_id": item_id,
                        "item_brand": brand,
                        "product_url": product_url,
                        "category": category,
                        "subcategory": subcategory,
                        "is_official_store": is_official,
                        "has_express_shipping": has_express
                    })
                    
            except Exception as e:
                logging.warning(f"Error extracting product details: {str(e)}")
                continue
                
        logging.info(f"Successfully extracted {len(products)} valid products")
        return products
        
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed for {url}: {str(e)}")
        return []
    except Exception as e:
        logging.error(f"Unexpected error scraping {url}: {str(e)}")
        return []

In [58]:
# Function to get the last page number from the website using the 'Last Page' link
def get_last_page_number(url, headers):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the 'Last Page' link using its aria-label attribute
    last_page_link = soup.find('a', attrs={'aria-label': 'Last Page'})
    
    if last_page_link and 'href' in last_page_link.attrs:
        last_page_url = last_page_link['href']
        try:
            page_number = last_page_url.split('?page=')[1].split('#')[0]
            return int(page_number)
        except Exception as e:
            print(f"Error extracting last page number: {e}")
            return 1  # Default to 1 if error occurs
    else:
        print("Last page link not found.")
        return 1  # Default to 1 page if no last page is found


In [59]:
def save_to_postgresql(products, db_params, table_name):
    try:
        # Connect to PostgreSQL database
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        
        logging.info(f"Attempting to insert {len(products)} products into {table_name}")
        
        # Create temporary table matching exact schema
        temp_table = f"temp_{table_name}_{int(time.time())}"
        cursor.execute(f"""
            CREATE TEMP TABLE {temp_table} (
                product_name text,
                price text,
                old_price text,
                discount text,
                rating text,
                item_id text,
                item_brand text,
                product_url text
            )
        """)
        
        # Prepare data for insertion
        insert_data = []
        for product in products:
            # Format data to match schema exactly
            row = (
                product["product_name"],
                product["price"],
                product["old_price"],
                product["discount_percentage"],
                f"{product['rating']} ({product['reviews_count']} reviews)",
                product["item_id"],
                product["item_brand"],
                product["product_url"]
            )
            insert_data.append(row)
        
        # Batch insert into temp table
        cursor.executemany(f"""
            INSERT INTO {temp_table} 
            (product_name, price, old_price, discount, rating, item_id, item_brand, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """, insert_data)
        
        # Insert only new records from temp table to target table
        insert_query = f"""
            INSERT INTO {table_name}
            SELECT t.*
            FROM {temp_table} t
            LEFT JOIN {table_name} m ON m.item_id = t.item_id
            WHERE m.item_id IS NULL
        """
        cursor.execute(insert_query)
        
        # Get counts for logging
        cursor.execute(f"SELECT COUNT(*) FROM {temp_table}")
        temp_count = cursor.fetchone()[0]
        
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        final_count = cursor.fetchone()[0]
        
        # Get count of newly inserted records
        cursor.execute(f"""
            SELECT COUNT(*) FROM {table_name} m
            WHERE EXISTS (
                SELECT 1 FROM {temp_table} t
                WHERE t.item_id = m.item_id
            )
        """)
        new_records = cursor.fetchone()[0]
        
        conn.commit()
        logging.info(f"Processed {temp_count} products for {table_name}")
        logging.info(f"Inserted {new_records} new records into {table_name}")
        logging.info(f"Table {table_name} now has {final_count} total records")
        
    except Exception as e:
        logging.error(f"Database error while processing {table_name}: {str(e)}")
        if conn:
            conn.rollback()
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

In [60]:
def scrape_and_save(url, headers, db_params, table_name):
    print(f"\nStarting scrape for {table_name}")
    
    try:
        # Get last page number
        last_page = get_last_page_number(url, headers)
        print(f"Found {last_page} pages to scrape")
        
        # Initialize products list
        all_products = []
        
        # Scrape each page
        for page_num in range(1, last_page + 1):
            page_url = f"{url}?page={page_num}#catalog-listing"
            print(f"Scraping page {page_num}/{last_page}")
            
            # Scrape products from current page
            page_products = scrape_product_details(page_url, headers)
            print(f"Found {len(page_products)} products on page {page_num}")
            all_products.extend(page_products)
            
            # Random delay
            time.sleep(random.uniform(1, 3))
        
        print(f"Total products scraped: {len(all_products)}")
        
        # Verify first product data before saving
        if all_products:
            print("\nSample of first product data:")
            print(json.dumps(all_products[0], indent=2))
        
        # Save products to database
        if all_products:
            print(f"\nAttempting to save {len(all_products)} products to {table_name}")
            save_to_postgresql(all_products, db_params, table_name)
        else:
            print("No products found to save!")
            
    except Exception as e:
        print(f"Error in scrape_and_save: {str(e)}")
        raise

def save_to_postgresql(products, db_params, table_name):
    conn = None
    cursor = None
    try:
        # Connect to PostgreSQL database
        print("Connecting to database...")
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        
        print(f"Connected successfully. Inserting {len(products)} products...")
        
        # Simple insert query
        insert_query = f"""
            INSERT INTO {table_name} 
            (product_name, price, old_price, discount, rating, item_id, item_brand, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """
        
        # Insert each product and commit after each successful insert
        for i, product in enumerate(products, 1):
            try:
                row = (
                    product["product_name"],
                    product["price"],
                    product["old_price"],
                    product["discount_percentage"],
                    f"{product['rating']} ({product['reviews_count']} reviews)",
                    product["item_id"],
                    product["item_brand"],
                    product["product_url"]
                )
                
                cursor.execute(insert_query, row)
                conn.commit()  # Commit after each insert
                
                if i % 100 == 0:
                    print(f"Inserted {i} products...")
                
            except Exception as e:
                print(f"Error inserting product {i}: {str(e)}")
                print(f"Problem product data: {json.dumps(product, indent=2)}")
                continue
        
        # Verify final count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        final_count = cursor.fetchone()[0]
        print(f"\nFinal count in {table_name}: {final_count}")
        
        # Show sample of inserted data
        print("\nVerifying inserted data...")
        cursor.execute(f"SELECT * FROM {table_name} LIMIT 1")
        sample = cursor.fetchone()
        if sample:
            print("Sample record:", sample)
        else:
            print("No records found in database!")
        
    except Exception as e:
        print(f"Database error: {str(e)}")
        if conn:
            conn.rollback()
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        print("Database connection closed.")

# Main execution
if __name__ == "__main__":
    try:
        # Your headers and db_params here
        
        # Test database connection first
        print("Testing database connection...")
        conn = psycopg2.connect(**db_params)
        conn.close()
        print("Database connection successful!")
        
        # URLs
        url1 = "https://www.jumia.co.ke/televisions/"
        url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/"
        
        # Scrape and save data for both URLs
        scrape_and_save(url1, headers, db_params, "jumia_televisions")
        scrape_and_save(url2, headers, db_params, "jumia_cookers")
        
    except Exception as e:
        print(f"Main execution error: {str(e)}")

Testing database connection...
Database connection successful!

Starting scrape for jumia_televisions
Found 50 pages to scrape
Scraping page 1/50


2025-01-10 20:25:48,041 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=1#catalog-listing
2025-01-10 20:25:48,047 - INFO - Successfully extracted 40 valid products


Found 40 products on page 1
Scraping page 2/50


2025-01-10 20:25:50,412 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=2#catalog-listing
2025-01-10 20:25:50,422 - INFO - Successfully extracted 40 valid products


Found 40 products on page 2
Scraping page 3/50


2025-01-10 20:25:52,180 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=3#catalog-listing
2025-01-10 20:25:52,190 - INFO - Successfully extracted 40 valid products


Found 40 products on page 3
Scraping page 4/50


2025-01-10 20:25:54,364 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=4#catalog-listing
2025-01-10 20:25:54,384 - INFO - Successfully extracted 40 valid products


Found 40 products on page 4
Scraping page 5/50


2025-01-10 20:25:56,885 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=5#catalog-listing
2025-01-10 20:25:56,902 - INFO - Successfully extracted 40 valid products


Found 40 products on page 5
Scraping page 6/50


2025-01-10 20:26:04,116 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=6#catalog-listing
2025-01-10 20:26:04,136 - INFO - Successfully extracted 40 valid products


Found 40 products on page 6
Scraping page 7/50


2025-01-10 20:26:07,585 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=7#catalog-listing
2025-01-10 20:26:07,604 - INFO - Successfully extracted 40 valid products


Found 40 products on page 7
Scraping page 8/50


2025-01-10 20:26:11,254 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=8#catalog-listing
2025-01-10 20:26:11,273 - INFO - Successfully extracted 40 valid products


Found 40 products on page 8
Scraping page 9/50


2025-01-10 20:26:14,338 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=9#catalog-listing
2025-01-10 20:26:14,356 - INFO - Successfully extracted 40 valid products


Found 40 products on page 9
Scraping page 10/50


2025-01-10 20:26:48,184 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=10#catalog-listing
2025-01-10 20:26:48,191 - INFO - Successfully extracted 40 valid products


Found 40 products on page 10
Scraping page 11/50


2025-01-10 20:26:50,172 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=11#catalog-listing
2025-01-10 20:26:50,180 - INFO - Successfully extracted 40 valid products


Found 40 products on page 11
Scraping page 12/50


2025-01-10 20:26:51,953 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=12#catalog-listing
2025-01-10 20:26:51,953 - INFO - Successfully extracted 40 valid products


Found 40 products on page 12
Scraping page 13/50


2025-01-10 20:26:55,168 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=13#catalog-listing
2025-01-10 20:26:55,178 - INFO - Successfully extracted 40 valid products


Found 40 products on page 13
Scraping page 14/50


2025-01-10 20:26:57,106 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=14#catalog-listing
2025-01-10 20:26:57,113 - INFO - Successfully extracted 40 valid products


Found 40 products on page 14
Scraping page 15/50


2025-01-10 20:26:58,799 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=15#catalog-listing
2025-01-10 20:26:58,806 - INFO - Successfully extracted 40 valid products


Found 40 products on page 15
Scraping page 16/50


2025-01-10 20:27:01,461 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=16#catalog-listing
2025-01-10 20:27:01,470 - INFO - Successfully extracted 40 valid products


Found 40 products on page 16
Scraping page 17/50


2025-01-10 20:27:04,541 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=17#catalog-listing
2025-01-10 20:27:04,564 - INFO - Successfully extracted 40 valid products


Found 40 products on page 17
Scraping page 18/50


2025-01-10 20:27:09,073 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=18#catalog-listing
2025-01-10 20:27:09,081 - INFO - Successfully extracted 40 valid products


Found 40 products on page 18
Scraping page 19/50


2025-01-10 20:27:12,217 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=19#catalog-listing
2025-01-10 20:27:12,224 - INFO - Successfully extracted 40 valid products


Found 40 products on page 19
Scraping page 20/50


2025-01-10 20:27:14,230 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=20#catalog-listing
2025-01-10 20:27:14,233 - INFO - Successfully extracted 40 valid products


Found 40 products on page 20
Scraping page 21/50


2025-01-10 20:27:17,667 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=21#catalog-listing
2025-01-10 20:27:17,673 - INFO - Successfully extracted 40 valid products


Found 40 products on page 21
Scraping page 22/50


2025-01-10 20:27:19,669 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=22#catalog-listing
2025-01-10 20:27:19,683 - INFO - Successfully extracted 40 valid products


Found 40 products on page 22
Scraping page 23/50


2025-01-10 20:27:23,403 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=23#catalog-listing
2025-01-10 20:27:23,413 - INFO - Successfully extracted 40 valid products


Found 40 products on page 23
Scraping page 24/50


2025-01-10 20:27:25,988 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=24#catalog-listing
2025-01-10 20:27:26,000 - INFO - Successfully extracted 40 valid products


Found 40 products on page 24
Scraping page 25/50


2025-01-10 20:27:28,909 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=25#catalog-listing
2025-01-10 20:27:28,918 - INFO - Successfully extracted 40 valid products


Found 40 products on page 25
Scraping page 26/50


2025-01-10 20:27:31,119 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=26#catalog-listing
2025-01-10 20:27:31,123 - INFO - Successfully extracted 40 valid products


Found 40 products on page 26
Scraping page 27/50


2025-01-10 20:27:33,237 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=27#catalog-listing
2025-01-10 20:27:33,243 - INFO - Successfully extracted 40 valid products


Found 40 products on page 27
Scraping page 28/50


2025-01-10 20:28:00,135 - INFO - Found 0 products on https://www.jumia.co.ke/televisions/?page=28#catalog-listing
2025-01-10 20:28:00,135 - INFO - Successfully extracted 0 valid products


Found 0 products on page 28
Scraping page 29/50


2025-01-10 20:28:02,369 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=29#catalog-listing
2025-01-10 20:28:02,383 - INFO - Successfully extracted 40 valid products


Found 40 products on page 29
Scraping page 30/50


2025-01-10 20:28:05,383 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=30#catalog-listing
2025-01-10 20:28:05,392 - INFO - Successfully extracted 40 valid products


Found 40 products on page 30
Scraping page 31/50


2025-01-10 20:28:07,386 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=31#catalog-listing
2025-01-10 20:28:07,393 - INFO - Successfully extracted 40 valid products


Found 40 products on page 31
Scraping page 32/50


2025-01-10 20:28:10,903 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=32#catalog-listing
2025-01-10 20:28:10,912 - INFO - Successfully extracted 40 valid products


Found 40 products on page 32
Scraping page 33/50


2025-01-10 20:28:13,314 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=33#catalog-listing
2025-01-10 20:28:13,320 - INFO - Successfully extracted 40 valid products


Found 40 products on page 33
Scraping page 34/50


2025-01-10 20:28:16,811 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=34#catalog-listing
2025-01-10 20:28:16,820 - INFO - Successfully extracted 40 valid products


Found 40 products on page 34
Scraping page 35/50


2025-01-10 20:28:19,064 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=35#catalog-listing
2025-01-10 20:28:19,071 - INFO - Successfully extracted 40 valid products


Found 40 products on page 35
Scraping page 36/50


2025-01-10 20:28:21,187 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=36#catalog-listing
2025-01-10 20:28:21,195 - INFO - Successfully extracted 40 valid products


Found 40 products on page 36
Scraping page 37/50


2025-01-10 20:28:23,221 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=37#catalog-listing
2025-01-10 20:28:23,227 - INFO - Successfully extracted 40 valid products


Found 40 products on page 37
Scraping page 38/50


2025-01-10 20:28:25,259 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=38#catalog-listing
2025-01-10 20:28:25,267 - INFO - Successfully extracted 40 valid products


Found 40 products on page 38
Scraping page 39/50


2025-01-10 20:28:28,492 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=39#catalog-listing
2025-01-10 20:28:28,498 - INFO - Successfully extracted 40 valid products


Found 40 products on page 39
Scraping page 40/50


2025-01-10 20:28:32,084 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=40#catalog-listing
2025-01-10 20:28:32,093 - INFO - Successfully extracted 40 valid products


Found 40 products on page 40
Scraping page 41/50


2025-01-10 20:28:34,908 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=41#catalog-listing
2025-01-10 20:28:34,914 - INFO - Successfully extracted 40 valid products


Found 40 products on page 41
Scraping page 42/50


2025-01-10 20:28:37,988 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=42#catalog-listing
2025-01-10 20:28:37,997 - INFO - Successfully extracted 40 valid products


Found 40 products on page 42
Scraping page 43/50


2025-01-10 20:28:39,869 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=43#catalog-listing
2025-01-10 20:28:39,873 - INFO - Successfully extracted 40 valid products


Found 40 products on page 43
Scraping page 44/50


2025-01-10 20:28:43,298 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=44#catalog-listing
2025-01-10 20:28:43,306 - INFO - Successfully extracted 40 valid products


Found 40 products on page 44
Scraping page 45/50


2025-01-10 20:28:46,431 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=45#catalog-listing
2025-01-10 20:28:46,439 - INFO - Successfully extracted 40 valid products


Found 40 products on page 45
Scraping page 46/50


2025-01-10 20:28:49,965 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=46#catalog-listing
2025-01-10 20:28:49,972 - INFO - Successfully extracted 40 valid products


Found 40 products on page 46
Scraping page 47/50


2025-01-10 20:28:52,274 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=47#catalog-listing
2025-01-10 20:28:52,281 - INFO - Successfully extracted 40 valid products


Found 40 products on page 47
Scraping page 48/50


2025-01-10 20:28:55,984 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=48#catalog-listing
2025-01-10 20:28:55,994 - INFO - Successfully extracted 40 valid products


Found 40 products on page 48
Scraping page 49/50


2025-01-10 20:28:58,076 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=49#catalog-listing
2025-01-10 20:28:58,088 - INFO - Successfully extracted 40 valid products


Found 40 products on page 49
Scraping page 50/50


2025-01-10 20:29:00,279 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=50#catalog-listing
2025-01-10 20:29:00,284 - INFO - Successfully extracted 40 valid products


Found 40 products on page 50
Total products scraped: 1960

Sample of first product data:
{
  "product_name": "HTC4388FS - 43\" Smart  Android Frameless TV - Black (1YR WRTY)",
  "price": "KSh 19,459",
  "price_value": "145.87",
  "old_price": "KSh 28,599",
  "discount_percentage": "68.52",
  "rating": "3.9",
  "reviews_count": "3.9 out of 5(2436",
  "item_id": "VI505EA0KTU37NAFAMZ",
  "item_brand": "Vitron",
  "product_url": "https://www.jumia.co.ke/vitron-htc4388fs-43-smart-android-frameless-tv-black-1yr-wrty-93348943.html",
  "category": "Electronics",
  "subcategory": "Television & Video",
  "is_official_store": true,
  "has_express_shipping": true
}

Attempting to save 1960 products to jumia_televisions
Connecting to database...
Connected successfully. Inserting 1960 products...
Error inserting product 1: duplicate key value violates unique constraint "jumia_televisions_item_id_key"
DETAIL:  Key (item_id)=(VI505EA0KTU37NAFAMZ) already exists.

Problem product data: {
  "product_nam

In [61]:
import requests
from bs4 import BeautifulSoup
import psycopg2
import time
import random
import logging
import json  # Add this import

def scrape_and_save(url, headers, db_params, table_name):
    print(f"\nStarting scrape for {table_name}")
    
    try:
        # Get last page number
        last_page = get_last_page_number(url, headers)
        print(f"Found {last_page} pages to scrape")
        
        # Initialize products list
        all_products = []
        
        # Scrape each page
        for page_num in range(1, last_page + 1):
            page_url = f"{url}?page={page_num}#catalog-listing"
            print(f"Scraping page {page_num}/{last_page}")
            
            # Scrape products from current page
            page_products = scrape_product_details(page_url, headers)
            print(f"Found {len(page_products)} products on page {page_num}")
            all_products.extend(page_products)
            
            # Random delay
            time.sleep(random.uniform(1, 3))
        
        print(f"Total products scraped: {len(all_products)}")
        
        # Save products to database
        if all_products:
            print(f"\nFirst product data (sample):")
            first_product = all_products[0]
            for key, value in first_product.items():
                print(f"{key}: {value}")
                
            print(f"\nAttempting to save {len(all_products)} products to {table_name}")
            save_to_postgresql(all_products, db_params, table_name)
        else:
            print("No products found to save!")
            
    except Exception as e:
        print(f"Error in scrape_and_save: {str(e)}")
        raise


In [62]:
def save_to_postgresql(products, db_params, table_name):
    conn = None
    cursor = None
    try:
        # Connect to PostgreSQL database
        print("Connecting to database...")
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        
        # Create table if it doesn't exist
        create_table_query = f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                id SERIAL PRIMARY KEY,
                product_name TEXT,
                price TEXT,
                old_price TEXT,
                discount TEXT,
                rating TEXT,
                reviews_count TEXT,
                category TEXT,
                subcategory TEXT,
                is_official_store BOOLEAN,
                has_express_shipping BOOLEAN,
                item_id TEXT UNIQUE,
                item_brand TEXT,
                product_url TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        cursor.execute(create_table_query)
        conn.commit()
        print(f"Table {table_name} created or verified")
        
        print(f"Connected successfully. Inserting {len(products)} products...")
        
        # Simple insert query
        insert_query = f"""
            INSERT INTO {table_name} 
            (product_name, price, old_price, discount, rating, reviews_count, 
             category, subcategory, is_official_store, has_express_shipping,
             item_id, item_brand, product_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (item_id) DO NOTHING
        """
        
        # Insert each product and commit after each successful insert
        for i, product in enumerate(products, 1):
            try:
                row = (
                    product["product_name"],
                    product["price"],
                    product["old_price"],
                    product["discount_percentage"],
                    product["rating"],
                    product["reviews_count"],
                    product.get("category", "N/A"),  # New fields with default values
                    product.get("subcategory", "N/A"),
                    product.get("is_official_store", False),
                    product.get("has_express_shipping", False),
                    product["item_id"],
                    product["item_brand"],
                    product["product_url"]
                )
                
                cursor.execute(insert_query, row)
                conn.commit()  # Commit after each insert
                
                if i % 100 == 0:
                    print(f"Inserted {i} products...")
                
            except Exception as e:
                print(f"Error inserting product {i}: {str(e)}")
                print(f"Problem product data: {json.dumps(product, indent=2)}")
                continue
        
        # Verify final count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        final_count = cursor.fetchone()[0]
        print(f"\nFinal count in {table_name}: {final_count}")
        
        # Show sample of inserted data
        print("\nVerifying inserted data...")
        cursor.execute(f"SELECT * FROM {table_name} LIMIT 1")
        sample = cursor.fetchone()
        if sample:
            print("Sample record:", sample)
        else:
            print("No records found in database!")
        
    except Exception as e:
        print(f"Database error: {str(e)}")
        if conn:
            conn.rollback()
        raise
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
        print("Database connection closed.")


In [63]:
# Main execution
if __name__ == "__main__":
    try:
        # Test database connection first
        print("Testing database connection...")
        conn = psycopg2.connect(**db_params)
        conn.close()
        print("Database connection successful!")
        
        # URLs
        url1 = "https://www.jumia.co.ke/televisions/"
        url2 = "https://www.jumia.co.ke/home-cooking-appliances-cookers/"
        
        # Scrape and save data for both URLs
        scrape_and_save(url1, headers, db_params, "jumia_televisions")
        scrape_and_save(url2, headers, db_params, "jumia_cookers")
        
    except Exception as e:
        print(f"Main execution error: {str(e)}")

Testing database connection...
Database connection successful!

Starting scrape for jumia_televisions
Found 50 pages to scrape
Scraping page 1/50


2025-01-10 20:29:06,165 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=1#catalog-listing
2025-01-10 20:29:06,175 - INFO - Successfully extracted 40 valid products


Found 40 products on page 1
Scraping page 2/50


2025-01-10 20:29:08,190 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=2#catalog-listing
2025-01-10 20:29:08,198 - INFO - Successfully extracted 40 valid products


Found 40 products on page 2
Scraping page 3/50


2025-01-10 20:29:11,448 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=3#catalog-listing
2025-01-10 20:29:11,459 - INFO - Successfully extracted 40 valid products


Found 40 products on page 3
Scraping page 4/50


2025-01-10 20:29:14,651 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=4#catalog-listing
2025-01-10 20:29:14,660 - INFO - Successfully extracted 40 valid products


Found 40 products on page 4
Scraping page 5/50


2025-01-10 20:29:18,434 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=5#catalog-listing
2025-01-10 20:29:18,444 - INFO - Successfully extracted 40 valid products


Found 40 products on page 5
Scraping page 6/50


2025-01-10 20:29:21,716 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=6#catalog-listing
2025-01-10 20:29:21,723 - INFO - Successfully extracted 40 valid products


Found 40 products on page 6
Scraping page 7/50


2025-01-10 20:29:25,637 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=7#catalog-listing
2025-01-10 20:29:25,648 - INFO - Successfully extracted 40 valid products


Found 40 products on page 7
Scraping page 8/50


2025-01-10 20:29:27,999 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=8#catalog-listing
2025-01-10 20:29:28,004 - INFO - Successfully extracted 40 valid products


Found 40 products on page 8
Scraping page 9/50


2025-01-10 20:29:29,750 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=9#catalog-listing
2025-01-10 20:29:29,754 - INFO - Successfully extracted 40 valid products


Found 40 products on page 9
Scraping page 10/50


2025-01-10 20:29:32,350 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=10#catalog-listing
2025-01-10 20:29:32,359 - INFO - Successfully extracted 40 valid products


Found 40 products on page 10
Scraping page 11/50


2025-01-10 20:29:34,559 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=11#catalog-listing
2025-01-10 20:29:34,568 - INFO - Successfully extracted 40 valid products


Found 40 products on page 11
Scraping page 12/50


2025-01-10 20:29:36,542 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=12#catalog-listing
2025-01-10 20:29:36,549 - INFO - Successfully extracted 40 valid products


Found 40 products on page 12
Scraping page 13/50


2025-01-10 20:29:39,257 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=13#catalog-listing
2025-01-10 20:29:39,264 - INFO - Successfully extracted 40 valid products


Found 40 products on page 13
Scraping page 14/50


2025-01-10 20:29:42,074 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=14#catalog-listing
2025-01-10 20:29:42,084 - INFO - Successfully extracted 40 valid products


Found 40 products on page 14
Scraping page 15/50


2025-01-10 20:29:45,559 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=15#catalog-listing
2025-01-10 20:29:45,567 - INFO - Successfully extracted 40 valid products


Found 40 products on page 15
Scraping page 16/50


2025-01-10 20:29:48,060 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=16#catalog-listing
2025-01-10 20:29:48,068 - INFO - Successfully extracted 40 valid products


Found 40 products on page 16
Scraping page 17/50


2025-01-10 20:29:51,355 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=17#catalog-listing
2025-01-10 20:29:51,366 - INFO - Successfully extracted 40 valid products


Found 40 products on page 17
Scraping page 18/50


2025-01-10 20:29:53,034 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=18#catalog-listing
2025-01-10 20:29:53,043 - INFO - Successfully extracted 40 valid products


Found 40 products on page 18
Scraping page 19/50


2025-01-10 20:29:55,094 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=19#catalog-listing
2025-01-10 20:29:55,104 - INFO - Successfully extracted 40 valid products


Found 40 products on page 19
Scraping page 20/50


2025-01-10 20:29:58,545 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=20#catalog-listing
2025-01-10 20:29:58,556 - INFO - Successfully extracted 40 valid products


Found 40 products on page 20
Scraping page 21/50


2025-01-10 20:30:01,105 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=21#catalog-listing
2025-01-10 20:30:01,113 - INFO - Successfully extracted 40 valid products


Found 40 products on page 21
Scraping page 22/50


2025-01-10 20:30:02,824 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=22#catalog-listing
2025-01-10 20:30:02,837 - INFO - Successfully extracted 40 valid products


Found 40 products on page 22
Scraping page 23/50


2025-01-10 20:30:23,068 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=23#catalog-listing
2025-01-10 20:30:23,081 - INFO - Successfully extracted 40 valid products


Found 40 products on page 23
Scraping page 24/50


2025-01-10 20:30:26,431 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=24#catalog-listing
2025-01-10 20:30:26,445 - INFO - Successfully extracted 40 valid products


Found 40 products on page 24
Scraping page 25/50


2025-01-10 20:30:29,024 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=25#catalog-listing
2025-01-10 20:30:29,046 - INFO - Successfully extracted 40 valid products


Found 40 products on page 25
Scraping page 26/50


2025-01-10 20:30:32,604 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=26#catalog-listing
2025-01-10 20:30:32,619 - INFO - Successfully extracted 40 valid products


Found 40 products on page 26
Scraping page 27/50


2025-01-10 20:30:35,844 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=27#catalog-listing
2025-01-10 20:30:35,864 - INFO - Successfully extracted 40 valid products


Found 40 products on page 27
Scraping page 28/50


2025-01-10 20:30:38,799 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=28#catalog-listing
2025-01-10 20:30:38,824 - INFO - Successfully extracted 40 valid products


Found 40 products on page 28
Scraping page 29/50


2025-01-10 20:30:41,115 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=29#catalog-listing
2025-01-10 20:30:41,131 - INFO - Successfully extracted 40 valid products


Found 40 products on page 29
Scraping page 30/50


2025-01-10 20:30:44,299 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=30#catalog-listing
2025-01-10 20:30:44,318 - INFO - Successfully extracted 40 valid products


Found 40 products on page 30
Scraping page 31/50


2025-01-10 20:30:46,753 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=31#catalog-listing
2025-01-10 20:30:46,770 - INFO - Successfully extracted 40 valid products


Found 40 products on page 31
Scraping page 32/50


2025-01-10 20:31:12,771 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=32#catalog-listing
2025-01-10 20:31:12,794 - INFO - Successfully extracted 40 valid products


Found 40 products on page 32
Scraping page 33/50


2025-01-10 20:31:15,694 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=33#catalog-listing
2025-01-10 20:31:15,710 - INFO - Successfully extracted 40 valid products


Found 40 products on page 33
Scraping page 34/50


2025-01-10 20:31:18,644 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=34#catalog-listing
2025-01-10 20:31:18,659 - INFO - Successfully extracted 40 valid products


Found 40 products on page 34
Scraping page 35/50


2025-01-10 20:31:20,820 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=35#catalog-listing
2025-01-10 20:31:20,838 - INFO - Successfully extracted 40 valid products


Found 40 products on page 35
Scraping page 36/50


2025-01-10 20:31:23,534 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=36#catalog-listing
2025-01-10 20:31:23,561 - INFO - Successfully extracted 40 valid products


Found 40 products on page 36
Scraping page 37/50


2025-01-10 20:31:25,905 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=37#catalog-listing
2025-01-10 20:31:25,925 - INFO - Successfully extracted 40 valid products


Found 40 products on page 37
Scraping page 38/50


2025-01-10 20:31:28,920 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=38#catalog-listing
2025-01-10 20:31:28,935 - INFO - Successfully extracted 40 valid products


Found 40 products on page 38
Scraping page 39/50


2025-01-10 20:31:31,549 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=39#catalog-listing
2025-01-10 20:31:31,568 - INFO - Successfully extracted 40 valid products


Found 40 products on page 39
Scraping page 40/50


2025-01-10 20:31:35,204 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=40#catalog-listing
2025-01-10 20:31:35,220 - INFO - Successfully extracted 40 valid products


Found 40 products on page 40
Scraping page 41/50


2025-01-10 20:31:38,893 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=41#catalog-listing
2025-01-10 20:31:38,925 - INFO - Successfully extracted 40 valid products


Found 40 products on page 41
Scraping page 42/50


2025-01-10 20:31:41,849 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=42#catalog-listing
2025-01-10 20:31:41,860 - INFO - Successfully extracted 40 valid products


Found 40 products on page 42
Scraping page 43/50


2025-01-10 20:31:44,835 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=43#catalog-listing
2025-01-10 20:31:44,849 - INFO - Successfully extracted 40 valid products


Found 40 products on page 43
Scraping page 44/50


2025-01-10 20:31:48,217 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=44#catalog-listing
2025-01-10 20:31:48,236 - INFO - Successfully extracted 40 valid products


Found 40 products on page 44
Scraping page 45/50


2025-01-10 20:31:51,287 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=45#catalog-listing
2025-01-10 20:31:51,309 - INFO - Successfully extracted 40 valid products


Found 40 products on page 45
Scraping page 46/50


2025-01-10 20:31:53,955 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=46#catalog-listing
2025-01-10 20:31:53,970 - INFO - Successfully extracted 40 valid products


Found 40 products on page 46
Scraping page 47/50


2025-01-10 20:31:56,891 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=47#catalog-listing
2025-01-10 20:31:56,906 - INFO - Successfully extracted 40 valid products


Found 40 products on page 47
Scraping page 48/50


2025-01-10 20:32:00,436 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=48#catalog-listing
2025-01-10 20:32:00,455 - INFO - Successfully extracted 40 valid products


Found 40 products on page 48
Scraping page 49/50


2025-01-10 20:32:04,135 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=49#catalog-listing
2025-01-10 20:32:04,155 - INFO - Successfully extracted 40 valid products


Found 40 products on page 49
Scraping page 50/50


2025-01-10 20:32:06,240 - INFO - Found 40 products on https://www.jumia.co.ke/televisions/?page=50#catalog-listing
2025-01-10 20:32:06,256 - INFO - Successfully extracted 40 valid products


Found 40 products on page 50
Total products scraped: 2000

First product data (sample):
product_name: HTC4388FS - 43" Smart  Android Frameless TV - Black (1YR WRTY)
price: KSh 19,459
price_value: 145.87
old_price: KSh 28,599
discount_percentage: 68.52
rating: 3.9
reviews_count: 3.9 out of 5(2436
item_id: VI505EA0KTU37NAFAMZ
item_brand: Vitron
product_url: https://www.jumia.co.ke/vitron-htc4388fs-43-smart-android-frameless-tv-black-1yr-wrty-93348943.html
category: Electronics
subcategory: Television & Video
is_official_store: True
has_express_shipping: True

Attempting to save 2000 products to jumia_televisions
Connecting to database...
Table jumia_televisions created or verified
Connected successfully. Inserting 2000 products...
Error inserting product 1: column "reviews_count" of relation "jumia_televisions" does not exist
LINE 3: ...product_name, price, old_price, discount, rating, reviews_co...
                                                             ^

Problem product data: {
 