In [1]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Function to extract Product Name
def get_ProductName(soup):
    """Return the product name found in a parsed product page.

    Looks up the first element whose class attribute is "-fs20 -pts -pbxs"
    and returns its text with surrounding whitespace removed.

    Args:
        soup: Parsed page object exposing a ``find`` method.

    Returns:
        The stripped name, or "" when the lookup raises AttributeError
        (for example when no matching element is found).
    """
    try:
        # find() yields None when nothing matches; the attribute access on
        # None then raises AttributeError, which maps to the empty string.
        return soup.find(class_="-fs20 -pts -pbxs").text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the current price text found in a parsed product page.

    Args:
        soup: Parsed page object exposing a ``find`` method.

    Returns:
        The stripped ``.string`` of the first element whose class attribute
        is "-b -ubpt -tal -fs24 -prxs", or "" when any step of the lookup
        raises AttributeError (no match, or ``.string`` is None).
    """
    try:
        price_tag = soup.find(class_="-b -ubpt -tal -fs24 -prxs")
        raw_price = price_tag.string
        price = raw_price.strip()
    except AttributeError:
        # Missing element or missing string both end up here.
        price = ""

    return price

# Function to extract old price
def get_Oldprice(soup):
    """Return the pre-discount price text found in a parsed product page.

    Args:
        soup: Parsed page object exposing a ``find`` method.

    Returns:
        The stripped ``.string`` of the first element whose class attribute
        is "-tal -gy5 -lthr -fs16 -pvxs -ubpt", or "" when the lookup raises
        AttributeError (no match, or ``.string`` is None).
    """
    old_price = ""  # default kept when the lookup fails
    try:
        old_price = soup.find(class_="-tal -gy5 -lthr -fs16 -pvxs -ubpt").string.strip()
    except AttributeError:
        pass

    return old_price

# Function to extract Product Rating
def get_rating(soup):
    """Return the rating text found in a parsed product page.

    Args:
        soup: Parsed page object exposing a ``find`` method.

    Returns:
        The stripped text of the first ``div`` whose class attribute is
        "stars _m _al", or "" when the lookup raises AttributeError
        (for example when no such div is found).
    """
    try:
        return soup.find("div", class_="stars _m _al").text.strip()
    except AttributeError:
        return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found in a parsed product page.

    Args:
        soup: Parsed page object exposing a ``find`` method.

    Returns:
        The stripped text of the first ``a`` element whose class attribute
        is "-plxs _more". Returns "" when no truthy element is found or when
        the lookup raises AttributeError.
    """
    try:
        ratings_link = soup.find("a", class_="-plxs _more")
        # Truthiness check (not an identity check) mirrors the lookup contract.
        return ratings_link.text.strip() if ratings_link else ""
    except AttributeError:
        return ""


if __name__ == '__main__':
    # Scrape one catalogue listing page, then visit every product it links to
    # and collect name, price, old price, rating and review count into `d`.
    # `d` is read by the following cell to build the DataFrame.

    # Headers sent with every request (browser-like User-Agent).
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}

    # Catalogue listing page to scrape.
    URL="https://www.jumia.co.ke/category-url-en-wide-screen-tv/?rating=1-5&page=4#catalog-listing"

    # Base used to resolve the (normally site-relative) product hrefs.
    BASE_URL = "https://jumia.co.ke"

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the cell indefinitely.
    REQUEST_TIMEOUT = 30

    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently producing an empty table.
    webpage.raise_for_status()

    soup = BeautifulSoup(webpage.content, "html.parser")

    links = soup.find_all("a", attrs={'class': 'core'})

    # Anchors without an href would yield None (and break URL building), so
    # they are dropped here.
    links_list = [link.get('href') for link in links if link.get('href')]

    d = {"Product Name":[], "Product Price":[], "Price Before Discount":[], "rating":[], "review count":[]}

    # Loop for extracting product details from each link
    for link in links_list:
        # urljoin handles relative paths and already-absolute hrefs alike.
        product_url = urljoin(BASE_URL, link)
        try:
            new_webpage = requests.get(product_url, headers=HEADERS, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable product must not discard everything scraped so
            # far; report it and move on. All five columns are skipped
            # together so the lists in `d` stay the same length.
            print(f"Skipping {product_url}: {exc}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Each extractor returns "" when its element is missing, so every
        # column receives exactly one value per visited product.
        d['Product Name'].append(get_ProductName(new_soup))
        d['Product Price'].append(get_price(new_soup))
        d['Price Before Discount'].append(get_Oldprice(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['review count'].append(get_review_count(new_soup))

In [3]:
# Build the product table from the scraped columns collected in `d`.
jumia_df = pd.DataFrame.from_dict(d)

# Turn empty names into NaN so dropna can remove them. The result is assigned
# back to the column: calling replace(..., inplace=True) on a column selection
# is a chained in-place update, which warns in recent pandas and leaves the
# frame unchanged under Copy-on-Write.
jumia_df['Product Name'] = jumia_df['Product Name'].replace('', np.nan)

# Rows without a product name carry no usable information; drop them.
jumia_df = jumia_df.dropna(subset=['Product Name'])

# Persist the cleaned table (header row, no index column).
jumia_df.to_csv("jumia_data40.csv", header=True, index=False)

In [4]:
# Bare last expression: renders jumia_df as this cell's output.
jumia_df

Unnamed: 0,Product Name,Product Price,Price Before Discount,rating,review count
0,Samsung 55CU8000 CRYSTAL UHD 4K SMART TV,"KSh 83,999","KSh 100,000",5 out of 5,(1 verified rating)
1,"Vitron 4K UHD Android 50"" Inch-TV,BLUETOOTH-EN...","KSh 38,990","KSh 60,000",4.3 out of 5,(7 verified ratings)


In [2]:
import pandas as pd
import os

# Merge every CSV file found in `csv_directory` into one DataFrame
# (`combined_df`) and write it to `combined_csv_path`.

# Define the directory containing the CSV files
# NOTE(review): machine-specific absolute path; consider a configurable
# data directory so the notebook runs elsewhere.
csv_directory = r'C:\Users\user\Desktop\Project Data'  # Use raw string to handle backslashes

# Destination of the merged table (relative to the working directory).
combined_csv_path = 'combined_data.csv'
_combined_csv_abspath = os.path.normcase(os.path.abspath(combined_csv_path))

# Create an empty list to hold dataframes
dataframes = []

# os.listdir returns names in arbitrary order; sorting makes the row order of
# the merged table reproducible between runs.
for filename in sorted(os.listdir(csv_directory)):
    # Case-insensitive match so files named *.CSV are included too.
    if not filename.lower().endswith('.csv'):
        continue

    file_path = os.path.join(csv_directory, filename)

    # When the notebook runs from inside csv_directory, a previous run's
    # output would otherwise be merged into itself, duplicating every row.
    if os.path.normcase(os.path.abspath(file_path)) == _combined_csv_abspath:
        continue

    # Read the CSV file into a dataframe and collect it
    df = pd.read_csv(file_path)
    dataframes.append(df)

# pd.concat raises an opaque ValueError on an empty list; raise the same
# exception type with a message that names the directory instead.
if not dataframes:
    raise ValueError(f"No CSV files found in {csv_directory!r}")

# Concatenate all dataframes in the list
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv(combined_csv_path, index=False)



In [3]:
# Bare last expression: renders combined_df as this cell's output.
combined_df

Unnamed: 0,Product Name,Product Price,Price Before Discount,rating,review count
0,Hisense 43A6K 43 inch 4K UHD Smart TV (2YRs WRTY),"KSh 37,199","KSh 43,000",5 out of 5,(1 verified rating)
1,"TCL 43S5400A,43"" Inch Smart ANDROID TV BT Ica...","KSh 34,490","KSh 42,999",4.4 out of 5,(8 verified ratings)
2,"Vitron 32"" Inch Smart Android Tv,Frameless, Ne...","KSh 14,999","KSh 21,300",4.2 out of 5,(18 verified ratings)
3,"Gld 43"" Smart Frameless Full HD LED Television...","KSh 19,497","KSh 30,000",4 out of 5,(1 verified rating)
4,Hifinit 43'' Inch Android Smart FHD NETFLIX YO...,"KSh 21,990","KSh 37,200",4.3 out of 5,(4 verified ratings)
...,...,...,...,...,...
1526,CTC Smart Android Television-32'' With Inbuilt...,"KSh 13,999","KSh 21,000",1 out of 5,(1 verified rating)
1527,"Vitron 32 Inch Frameless Smart Android TV,Inbu...","KSh 14,999","KSh 19,999",0 out of 5,(No ratings available)
1528,"Gld 32"" CLEAR Frameless Smart Android TV NETF...","KSh 13,284","KSh 15,899",4 out of 5,(3 verified ratings)
1529,PowMr 20A MPPT Solar Charger Controller 60Voc ...,"KSh 4,270","KSh 6,500",0 out of 5,(No ratings available)
