In [2]:
#Importing the major libraries needed for web scraping
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup as bso

In [2]:

class JumiaProduct(object):
    """
    This class parses the products section pages of jumia.com.ng,
    then it extracts some data such as the product names, prices, ratings, number of sales,
    links.
    
    You instantiate this class, by passing in:
             a valid section link as a parameter (a string) 
             the start page number (an integer)
             the end page number (an integer)
    
    The link should be the first page of section without having page number in the string:
    
    Correct: https://www.jumia.com.ng/mobile-phones/
             https://www.jumia.com.ng/beauty-corner/
             
    Wrong:   https://www.jumia.com.ng/mobile-phones/?page=1
             https://www.jumia.com.ng/beauty-corner/?page=1
    
    
    Example: JumiaProduct("https://www.jumia.com.ng/mobile-phones/", 2, 4)
    
    If link is wrong, it returns an InvalidLink error.
    
    It has the following methods:
    
       get_pages()
       get_products()
       get_links()
       get_prices()
       get_names()
       get_ratings()
       get_sales()  
    
    """
    
    def __init__(self, category_link, start_page, end_page):
        
        if requests.get(category_link).status_code != 200:
            raise ConnectionError("This link is invalid")
        
        if (".html" or "?") in section_link:
            raise Exception("This link doesn't lead to a Jumia category")
        
        if (type(start_page) or type(end_page)) != int:
            raise TypeError("start_page and end_page should be integers")
            
        
        self.category_link = category_link
        self.start_page = start_page
        self.end_page = end_page
        self.pages = []
        self.products = []
        self.names = []
        self.links = []
        self.prices = []
        self.ratings = []
        self.rated_sales = []
        
        
    def get_pages(self):
        """
        
        This method is used to get the html pages of products from
        the section link provided.
        
        It returns a list of the html pages.
        
        """
        
        for page_number in range(self.start_page, self.end_page + 1):
                address_extension = "?page={}".format(page_number)
                full_address = self.category_link + address_extension
                html_request = requests.get(full_address)
                html_content = html_request.content
                page = bso(html_content, "html.parser").find("section", {"class":"products -mabaya"})
                self.pages.append(page)   
                
    def get_products(self): 
        """
        
        This method is used to get the products from
        the section link provided.
        
        It returns a list of the products.
        
        """
        
        for page in self.pages:
            class_value = re.compile(r"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)")
            self.products += each_page.find_all("div", {"class": class_value})  
          
    def get_links(self):
        """
        
        It takes in a list of html pages already parsed by the get_pages method
        and further parses them to get the link
        to every product.
        
        It returns a list of the product links
        
        """
        
        for product in self.products:
            self.links.append(product.find("a")["href"])
    
    def get_names():
        """
        
        It takes in a list of html pages already parsed by the get_pages method
        and further and parses them to get the name
        of every product.
        
        It returns a list of the product names
        
        """
        
        for product in self.products:
            self.names.append(product.find("span", {"class": "name"}).text)
        
    def get_prices():
        """
        
        It takes in a list of html pages already parsed by the get_pages method
        and further and parses them to get the price
        of every product.
        
        It returns a list of the product prices
        
        """
        
        for product in self.products:
            self.prices.append(product.find("span", {"data-price": re.compile(r"\d+")}).text)
        
    def get_ratings():
        """
        
        It takes in a list of html pages already parsed by the get_pages method
        and further and parses them to get the rating
        of every product.
        
        It returns a list of the product ratings
        
        """
        
        for product in self.products:
            if each.find("div", {"class": "stars"}) == None:
                self.ratings.append("No Rating")
            else:
                rating_tag = each.find("div", {"class": "stars"})["style"]
                rating_re = re.compile(r"[0-9]+")
                stars = format(int(rating_re.findall(rating_tag)[0])/100 * 5, '.2f')
                self.ratings.append(stars)
    
    def get_rated_sales():
        """
        
        It takes in a list of html pages already parsed by the get_pages method
        and further and parses them to get the number of sales
        of every product.
        
        It returns a list of the product sales number
        
        """

        for product in self.products:
            if product.find("div", {"class": "total-ratings"}) == None:
                self.rated_sales.append("No Rated Sales")
            else:
                rated_sales_re = re.compile(r"[0-9]+")
                dirty_sales = each.find("div", {"class": "total-ratings"}).text
                print(rated_sales_re.findall(dirty_sales)[0])

In [16]:
ade = requests.get("https://www.jumia.com.ng/beauty-corner/")

In [30]:
ade2 = ade.content
ade_html = bso(ade2, "html.parser").find("section", {"class":"products -mabaya"})
ade3 = ade_html.find_all("div", {"class": re.compile(r"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)")})
for each in ade3:
    if each.find("div", {"class": "total-ratings"}) == None:
                print("No Rated Sales")
    else:
        rated_sales_re = re.compile(r"[0-9]+")
        dirty_sales = each.find("div", {"class": "total-ratings"}).text
        print(rated_sales_re.findall(dirty_sales)[0])
#.find("span", {"class": "price "}).get_text()
#.find("span", {"data-price": re.compile(r"\d+")}).text

379
No Rated Sales
No Rated Sales
122
29
171
3
146
226
182
45
523
93
214
65
805
55
51
No Rated Sales
132
78
8
2
8
798
6
22
35
3
No Rated Sales
2
No Rated Sales
28
16
61
No Rated Sales
69
25
13
477


In [1]:
import requests
print(requests.__version__)



2.18.1


In [77]:
dir(float)

['__abs__',
 '__add__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getformat__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__int__',
 '__le__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__round__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__setformat__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__trunc__',
 'as_integer_ratio',
 'conjugate',
 'fromhex',
 'hex',
 'imag',
 'is_integer',
 'real']

In [13]:
asd = "(234)"

In [14]:
asf = asd.replace(["(",")"], "")

TypeError: Can't convert 'list' object to str implicitly

In [12]:
asf

'234)'