In [1]:
from pathlib import Path
import os
import pandas as pd
import re
import requests
import time
import datetime
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
#BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# BASE_DIR = Path.cwd()
# DATA_DIR = os.path.join(BASE_DIR, 'data')
# os.makedirs(DATA_DIR, exist_ok=True)
# product_category_links_outout = os.path.join(DATA_DIR, 'category-products.csv')
# product_outout = os.path.join(DATA_DIR, 'products.csv')

# Output locations: everything is written under ./data, relative to the
# current working directory of the kernel.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
# mkdir(exist_ok=True) is already a no-op for an existing directory, so no
# separate exists() check is needed. Unlike the check, it also raises when
# 'data' exists but is not a directory, instead of failing later on write.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV of the product links collected from the category pages.
product_category_links_outout = DATA_DIR / 'category-products.csv'
# CSV of the scraped product rows (title, price, ...).
product_outout = DATA_DIR / 'products.csv'

In [3]:
# Start one Chrome WebDriver session with the '--headless' argument. The
# scraping functions in this notebook use it through the module-level name
# `driver`.
# NOTE(review): no driver.quit() is visible in this notebook — confirm the
# browser process is closed somewhere once scraping is done.
options =  Options()
options.add_argument('--headless')

driver = webdriver.Chrome(options=options)

In [4]:
# Category listing pages to crawl. Each entry is a dict with a short 'name'
# and the listing page 'url'.
categories = [
    {'name': 'smartphones', 'url': 'https://www.konga.com/category/smartphones-7539'},
    {'name': 'phones-tablets', 'url': 'https://www.konga.com/category/phones-tablets-5294'},
    {'name': 'home-kitchen', 'url': 'https://www.konga.com/category/home-kitchen-602'}
]

In [5]:
# Matches a Konga product URL and captures the trailing id. The slug group is
# greedy, so for URLs like '.../product/some-slug-5387899' the `product_id`
# group is the text after the last '-'. Dots are escaped so they match only a
# literal '.', not any character.
my_regex_pattern = r"https://www\.konga\.com/product/(?P<slug>[\w-]+)-(?P<product_id>[\w-]+)"

# Compiled once instead of on every call.
_PRODUCT_URL_RE = re.compile(my_regex_pattern)


def extract_product_id_from_url(url):
    """Return the product id embedded in a Konga product URL.

    Args:
        url: Candidate URL string.

    Returns:
        The `product_id` group of the match, or None when `url` is not a
        string or does not start with a product URL.
    """
    if not isinstance(url, str):
        return None
    match = _PRODUCT_URL_RE.match(url)
    if match is None:
        return None
    return match['product_id']

In [6]:
def clean_page_links(page_links=None, category=None):
    """Keep only product links and tag each with its product id and category.

    Args:
        page_links: Iterable of URLs; None is treated as empty.
        category: Value stored unchanged under the "category" key of every
            returned record.

    Returns:
        A list of {"url", "product_id", "category"} dicts, one per URL for
        which extract_product_id_from_url() returned an id, in input order.
    """
    if page_links is None:
        page_links = ()
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id is None:
            # No product id could be extracted from this link; drop it.
            continue
        final_page_links.append({"url": url, "product_id": product_id, "category": category})
    return final_page_links

In [26]:
def scrape_category_product_links(categories=[]):
    """Collect product link records from each category listing page.

    For every category dict this waits two seconds, loads its 'url' with the
    shared Selenium `driver`, parses the <body> innerHTML, prefixes the links
    that start with '/' with the site origin and passes them through
    clean_page_links().

    Returns a flat list with the records of all categories, in input order.
    """
    collected = []
    for category in categories:
        time.sleep(2)  # pause before each category page load
        driver.get(category.get('url'))
        body_html = driver.find_element(by=By.CSS_SELECTOR, value='body').get_attribute('innerHTML')
        parsed = HTML(html=body_html)
        absolute_links = []
        for href in parsed.links:
            if href.startswith('/'):
                absolute_links.append(f'https://www.konga.com{href}')
        collected.extend(clean_page_links(page_links=absolute_links, category=category))
    return collected

In [28]:
# Runs a live scrape of every category page and prints the complete list of
# link records returned by scrape_category_product_links().
print(scrape_category_product_links(categories=categories))

[{'url': 'https://www.konga.com/product/nokia-c1-2nd-edition-5-45-16gb-rom-1gb-ram-dual-sim-3g-android-11-2500mah-blue-5387899', 'product_id': '5387899', 'category': {'name': 'smartphones', 'url': 'https://www.konga.com/category/smartphones-7539'}}, {'url': 'https://www.konga.com/product/samsung-galaxy-a12-dual-sim-64gb-rom-4gb-ram-4g-lte-6-5-48mp-5000maah-fingerprint-black-5076081', 'product_id': '5076081', 'category': {'name': 'smartphones', 'url': 'https://www.konga.com/category/smartphones-7539'}}, {'url': 'https://www.konga.com/product/nokia-nokia-g21-6-5-64gb-rom-4gb-ram-dual-sim-4g-lte-5050mah-nordic-blue-5661386', 'product_id': '5661386', 'category': {'name': 'smartphones', 'url': 'https://www.konga.com/category/smartphones-7539'}}, {'url': 'https://www.konga.com/product/infinix-smart-6-x657-black-32gb-rom-2gb-ram-5682011', 'product_id': '5682011', 'category': {'name': 'smartphones', 'url': 'https://www.konga.com/category/smartphones-7539'}}, {'url': 'https://www.konga.com/prod

In [29]:
def extract_categories_and_save(categories=[]):
    """Scrape the product links of `categories` and write them to CSV.

    Each record returned by scrape_category_product_links() becomes one
    DataFrame row; the frame is saved without its index to
    `product_category_links_outout`.
    """
    link_records = scrape_category_product_links(categories)
    pd.DataFrame(link_records).to_csv(product_category_links_outout, index=False)

In [31]:
# Scrape all configured categories and write the link records to
# `product_category_links_outout`.
extract_categories_and_save(categories=categories)

In [33]:
def scrape_product_page(url, title_lookup = "._24849_2Ymhg", price_lookup = "._678e4_e6nqh", wait_seconds = 1.2):
    """Load a product page and return its (title, price) text.

    Args:
        url: Product page URL, opened with the shared Selenium `driver`.
        title_lookup: CSS selector of the title element.
        price_lookup: CSS selector of the price element.
        wait_seconds: Pause between navigation and reading the DOM.

    Returns:
        A (title, price) tuple. Each value is the element's text, or None
        when its selector matched nothing, so one missing element no longer
        discards the other value.
    """
    # NOTE(review): the default selectors look like build-generated class
    # names and will presumably break when the site is redeployed — confirm.
    driver.get(url)
    # NOTE(review): presumably gives client-side rendering time to finish;
    # an explicit wait on the title element would be more reliable.
    time.sleep(wait_seconds)
    body_el = driver.find_element(by=By.CSS_SELECTOR, value='body')
    html_obj = HTML(html=body_el.get_attribute('innerHTML'))
    # find(..., first=True) returns None when nothing matches.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [34]:
def perform_scrape(cleaned_items=None):
    """Scrape title and price for each product link record.

    Args:
        cleaned_items: Iterable of dicts with 'url' and 'product_id' keys
            (the shape produced by clean_page_links); None means empty.

    Returns:
        One {'url', 'product_id', 'title', 'price'} dict per input item, in
        input order. 'title' and 'price' are None when the scrape failed.
    """
    if cleaned_items is None:
        cleaned_items = ()
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = None, None
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Best effort: a failed page must not abort the whole run, but
            # the failure is reported instead of being silently dropped.
            # Catching Exception (not a bare except) lets a keyboard/kernel
            # interrupt stop a long scrape.
            print(f'scrape failed for {link}: {exc!r}')
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price,
        })
    return data_extracted

In [35]:
#extracted_data = perform_scrape(cleaned_items = final_page_links)

In [36]:
# print(extracted_data)

In [37]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one product row; written for DataFrame.apply(..., axis=1).

    Args:
        row: Mapping-like row with a 'url' key and, optionally, 'scraped'.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same `row` object. Rows whose 'scraped' value is 1 or '1' are
        returned untouched; otherwise 'title', 'price', 'scraped' (set to 1)
        and 'timestamp' (current POSIX time) are written onto the row.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # First pass: the row has no 'scraped' column yet.
        scraped = 0
    if scraped == 1 or scraped == '1':
        return row
    title, price = None, None
    try:
        title, price = scrape_product_page(link)
    except Exception as exc:
        # Best effort: keep processing the remaining rows, but report the
        # failure. Catching Exception (not a bare except) lets a
        # keyboard/kernel interrupt stop the run.
        print(f'scrape failed for {link}: {exc!r}')
    row['title'] = title
    row['price'] = price
    # NOTE(review): the row is marked as scraped even when the scrape failed,
    # so failed rows are skipped on later passes — confirm this is intended.
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    return row

In [40]:
# Load the category link CSV (the file extract_categories_and_save() writes)
# and preview the first rows.
df = pd.read_csv(product_category_links_outout)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.konga.com/product/tecno-camon-18-p...,5480394,"{'name': 'smartphones', 'url': 'https://www.ko..."
1,https://www.konga.com/product/tecno-phantom-x-...,5314645,"{'name': 'smartphones', 'url': 'https://www.ko..."
2,https://www.konga.com/product/infinix-32-hd-sm...,5102735,"{'name': 'smartphones', 'url': 'https://www.ko..."
3,https://www.konga.com/product/infinix-smart-hd...,5066095,"{'name': 'smartphones', 'url': 'https://www.ko..."
4,https://www.konga.com/product/asus-x415ja-bv19...,5497503,"{'name': 'smartphones', 'url': 'https://www.ko..."


In [41]:
#df_sub = df.copy #df.head(n=40)

In [43]:
# Scrape every row via row_scrape_event (which loads the product page).
# `df` is rebound to the result, so re-running this cell sees rows already
# marked scraped == 1 and returns them unchanged.
df = df.apply(row_scrape_event, axis=1)

In [44]:
# Save the scraped rows, without the index, to `product_outout`
# (overwrites any existing file at that path).
df.to_csv(product_outout, index=False)

In [45]:
# Read back the rows currently stored in `product_outout`.
products_df = pd.read_csv(product_outout)


In [47]:
# Merge the freshly scraped rows into what is already saved. `products_df`
# may already contain the rows of `df` (the file it was read from is the one
# `df` is written to), so a plain concat would duplicate them on every run.
# Keep one row per (url, category); keep='last' prefers the row from `df`.
final_df = (
    pd.concat([products_df, df])
    .drop_duplicates(subset=['url', 'category'], keep='last')
    .reset_index(drop=True)
)
final_df.to_csv(product_outout, index=False)

In [48]:
# Preview the last rows of the merged frame.
final_df.tail()

Unnamed: 0,url,product_id,category,title,price,scraped,timestamp
10,https://www.konga.com/product/tecno-camon-18-p...,5480394,"{'name': 'home-kitchen', 'url': 'https://www.k...","Tecno Camon 18 Premier -CH9- 6'7"" - 256gb - 8g...","₦180,200",1,1649075000.0
11,https://www.konga.com/product/tecno-phantom-x-...,5314645,"{'name': 'home-kitchen', 'url': 'https://www.k...",Tecno Phantom X (ac8) 256+8gb + Branded Bag - ...,"₦240,000",1,1649075000.0
12,https://www.konga.com/product/infinix-32-hd-sm...,5102735,"{'name': 'home-kitchen', 'url': 'https://www.k...",Infinix 32' HD Smart Android Television,"₦87,600",1,1649075000.0
13,https://www.konga.com/product/infinix-smart-hd...,5066095,"{'name': 'home-kitchen', 'url': 'https://www.k...",Infinix Smart Hd-x612- 2021 - Blue 32gb /2gb +...,"₦46,300",1,1649075000.0
14,https://www.konga.com/product/asus-x415ja-bv19...,5497503,"{'name': 'home-kitchen', 'url': 'https://www.k...","Asus X415ja-bv192t - 14"" Hd -Intel® Core™ I3-1...","₦220,000",1,1649075000.0
