# Webscraping

In [69]:
# Import necessary libraries
import requests # make HTTP requests to fetch web pages content
from bs4 import BeautifulSoup # parse HTML and XML docs for easier data extraction
import csv # write scraped data into CSV file
import time # introduce delays between requests to avoid server overload
import re #regular expression
import random # vary time delays to simulate human-like behaviour

In [70]:
# User-Agent headers for automating requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

**Webscraping Page: Jumia**

_Products to scrape:_  
1. Tvs 
    * Smart
    * Digital
2. Cookers

In [71]:
# request url
url1 = 'https://www.jumia.co.ke/televisions/#catalog-listing' # TVs url
url2 = 'https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing' # Cookers url


Check url status

In [72]:
# Sending GET requests to each URL and check response
def check_response(urls, headers):
    # Output the status code of each response
    for url in urls:
        try:
            response =requests.get(url, headers=headers)
            print(f"url: {response.url} - status: { response.status_code}")

            if response.status_code != 200:
                print(f"Failed to retrieve page. Status code: {response.status_code}")
                return 1  # Return 1 if the request fails
        except requests.exceptions.RequestException as e:
            # Handle GET request exceptions (timeout, connection/http errors etc.)
            print(f"An error occured: {e}")
            return 1  # Return 1 if an error occurs during the request
        
    return 0  # Return 0 if no error

# List of urls
urls = [url1, url2]
responses = check_response(urls, headers=headers) # Check all url GET requests responses

# Check result
if responses == 0:
    print("All pages retrieved successfully.")
else:
    print("Some pages failed to load.")


url: https://www.jumia.co.ke/televisions/#catalog-listing - status: 200
url: https://www.jumia.co.ke/home-cooking-appliances-cookers/#catalog-listing - status: 200
All pages retrieved successfully.


Considering that the webpages are paginated, there is need to navigatethrough, and retrieve data from each page.
The last page in the page numbers is identified, then all pages are iterated.

The pagination URL parameters...

In [73]:
# Define function to get the last page number
def get_last_page_number(urls):
    
    # Parse webpage content with BeautifulSoup
    response =requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find link for last page using its aria-label attribute
    last_page_link = soup.find('a', attrs={'aria-label': 'Last Page'})

    if last_page_link and 'href' in last_page_link.attrs:
        last_page_url =last_page_link['href']
        try:
            page_number = last_page_url.split('?page=')[1].split('#')[0]
            return int(page_number)
        except Exception as e:
            print(f"Error extracting last page number: {e}")
            return 1  # Default to 1 if error occurs


In [74]:
# Function to scrape product details from a given URL
def scrape_product_details(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    products = soup.find_all('a', class_='core')

    product_details = []

    for product in products:
        # Extract product link (relative URL)
        link = product['href'] if 'href' in product.attrs else None
        # Complete the URL if the link is relative
        link = f"https://www.jumia.co.ke{link}" if link and link.startswith('/') else link
    
        # Extract product name from 'data-gtm-name' attribute
        name = product.get('data-gtm-name', "N/A")
        
        # Extract product price (actual price) from the 'prc' div
        price = product.find('div', class_='prc').get_text(strip=True) if product.find('div', class_='prc') else "N/A"
        
        # Extract old price (if available)
        old_price = (
            product.find('div', class_='old') or
            product.find('span', class_='-tal -gy5 -lthr -fs16 -pvxs -ubpt')
        )
        old_price = old_price.get_text(strip=True) if old_price else "N/A"


        # Extract discount (if available)
        discount = (
            product.find('div', class_='bdg _dsct _sm') or
            product.find('span', attrs={'data-disc': True})
        )
        discount = discount.get_text(strip=True) if discount else "N/A"
                
        # Extract item ID (from data-gtm-id attribute)
        item_id = product.get('data-gtm-id', "N/A")
        
        # Extract item brand (from data-gtm-brand attribute)
        item_brand = product.get('data-gtm-brand', "N/A")
        
        # Extract the stars rating (from the 'stars _m _al' or 'stars _s' class)
        stars_rating = product.find('div', class_='stars _m _al') or product.find('div', class_='stars _s')
        if stars_rating:
            rating = stars_rating.get_text(strip=True).split(" out of ")[0]  # Extract the rating value (e.g., "3.9")
        else:
            rating = "N/A"
        
        # Extract reviews count (from the 'rev' class or 'verified ratings' link)
        #reviews = product.find('div', class_='rev')
        #if reviews:
        #   reviews_count = reviews.get_text(strip=True).split('(')[-1].split(')')[0]  # Extract the review count (e.g., "798")
        #else:
        #   reviews_link = product.find('a', class_='-plxs _more')
        #   reviews_count = reviews_link.get_text(strip=True).split('(')[-1].split(')')[0] if reviews_link else "N/A"

        # Extract reviews count (from the 'rev' class)
        reviews = product.find('div', class_='rev')
        if reviews:
        # Use regular expression to find the number inside parentheses
            reviews_count = re.search(r'\((\d+)\)', reviews.get_text(strip=True))
            reviews_count = reviews_count.group(1) if reviews_count else "N/A"
        else:
            reviews_link = product.find('a', class_='-plxs _more')
            reviews_count = re.search(r'\((\d+)\)', reviews_link.get_text(strip=True))
            reviews_count = reviews_count.group(1) if reviews_count else "N/A"

        # Store all the extracted product details
        product_details.append({
            'name': name,
            'discounted_price': price,
            'previous_price': old_price,
            'discount_%': discount,
            'id': item_id,
            'brand': item_brand,
            'rating': rating,
            'reviews_count': reviews,
            'link': link,
        })
    
    return product_details


In [75]:
# Iterate over all URLs to scrape data and save to a CSV file
for url in urls:
    # Get the last page number for the current URL
    last_page = get_last_page_number(url)

    # Initialize an empty list to store all products
    products_list = []

    # Iterate through all pages from 1 to the last page
    for page_num in range(1, last_page + 1):
        page_url = f"{url}?page={page_num}#catalog-listing"
        print(f"Scraping page {page_num} from {url}...")
        
        # Scrape the products from the current page
        products = scrape_product_details(page_url)
        products_list.extend(products)  # Add the scraped products to the main list

        # Sleep for a random time between requests to avoid overwhelming the server
        time.sleep(random.uniform(1, 3))

    # Determine the output CSV file name based on the URL
    if url == url1:
        csv_filename = 'data/scrapped/jumia_scraped_televisions.csv' 
    else:
        csv_filename = 'data/scrapped/jumia_scraped_cookers.csv'

    # Save the scraped product details to a CSV file
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=[ 
            'name', 'discounted_price', 'previous_price', 'discount_%', 'id', 'brand', 'rating', 'reviews_count', 'link'
        ])
        writer.writeheader()  # Write the header row
        
        for product in products_list:
            writer.writerow(product)

    print(f"Scraped {len(products_list)} products from {url} and saved them to '{csv_filename}'.")

Scraping page 1 from https://www.jumia.co.ke/televisions/#catalog-listing...


AttributeError: 'NoneType' object has no attribute 'get_text'