In [1]:
#Importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np

### Scraping Fridges data

In this process, we are scraping fridge product data from the Kilimall site. The data is then cleaned by handling missing values, removing duplicates, and standardizing the format for consistency. This ensures that the dataset is accurate, structured, and ready for analysis or further processing.


In [2]:
# Initialize a list to store all fridges data
fridges = []

# Loop through all 16 pages
for x in range(1, 17):  # Pages 1 to 16
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=FRIDGE&page={x}&source=search|enterSearch|FRIDGE')
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract fridge details from divs with the class "info-box".
        fridges_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for fridge_info in fridges_info:
            # Safely extract data, handle cases where tags are missing
            fridge_name = fridge_info.find('p', class_='product-title')
            fridge_price = fridge_info.find('div', class_='product-price')
            fridge_reviews = fridge_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            fridges.append({
                "Name": fridge_name.text.strip() if fridge_name else "N/A",
                "Price": fridge_price.text.strip() if fridge_price else "N/A",
                "Reviews": fridge_reviews.text.strip() if fridge_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")

# Print results
for fridge in fridges:
    print(fridge)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
{'Name': 'Volsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator', 'Price': 'KSh 25,299', 'Reviews': '(155)'}
{'Name': 'Hisense 94  Liters fridge single door Energy Saving  REFO94DR Refrigerator', 'Price': 'KSh 19,299', 'Reviews': '(483)'}
{'Name': 'Volsmart 108L Double Doors VL-BCD108 Fridge Freezer Energy Saving Refrigerator with Lock and Keys', 'Price': 'KSh 20,299', 'Reviews': '(94)'}
{'Name': '„ÄêNew Year Sale„ÄëVolsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator 138L fridge double door Direct Cool fridge Refrigerator fridges and freezers fridge138ltrs', 'Price': 'KSh 25,555', 'Reviews': '(28)'}
{'Name': 'ROCH Single Door Min

In [None]:
#Save results as a DataFrame
df_fridges = pd.DataFrame(fridges)
df_fridges.sample(20)

Unnamed: 0,Name,Price,Reviews
21,„Äêspecial off„ÄëRamtons - RF/214 1 Door Fridge En...,"KSh 18,999",(0)
223,Ramtons RF/216 - 2 Door Direct Cool Fridge - 2...,"KSh 43,999",(0)
316,"Ramtons 2 Door Direct Cool Fridge, 128 Litres...","KSh 38,900",(0)
152,Volsmart 108L Double Door Fridge With A Larger...,"KSh 24,000",(0)
325,Premier 90 Litres Single Door Mini-Fridge,"KSh 19,899",(0)
210,LG LinearCooling fridge GV-B212PLGB 217L Top F...,"KSh 87,999",(0)
320,Hisense Fridge 176 Liters With Dispenser REF17...,"KSh 39,999",(0)
429,"RAMTONS 204 LITERS 2 DOOR DIRECT COOL FRIDGE, ...","KSh 48,900",(0)
52,Ramtons RF/130- 213L 2 Door Direct Cool Fridge...,"KSh 48,700",(0)
546,Nunix (BC-92) 92L Single Door Fridge Energy Ef...,"KSh 24,650",(0)


##### Data cleaning

In [4]:
# Function to clean the product name
def clean_name(name):
    
    # Remove words in parentheses or curly brackets if they contain "offer", "offers", "sale", or "sales"
    name = re.sub(r'\(([^)]*?(OFFER|OFFERS|SALE|SALES)[^)]*?)\)', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\{([^}]*?(OFFER|OFFERS|SALE|SALES)[^}]*?)\}', '', name, flags=re.IGNORECASE)

    # Remove variations of "offer" and "sale" (including "offers", "sales")
    name = re.sub(r'\b(\w+)\s+(OFFER|OFFERS|SALE|SALES)\b', '', name, flags=re.IGNORECASE)
    
    # Remove unnecessary marketing phrases
    name = re.sub(r'\b(BLACK FRIDAY|BLACK FRIDAY OFFERS|BEST DEALS|LIMITED|LIMITED TIME|TECH WEEK|OFFER|BEST WHOLESALE PRICE|SPECIAL OFFERS)\b', '', name, flags=re.IGNORECASE)
    
    # Remove all remaining parentheses, curly braces, brackets, and clean extra spaces
    name = re.sub(r'[\(\)\{\}\[\]]', '', name)  # Remove parentheses, braces, and brackets
    name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
    
    # Remove special characters like '!', '+' if they appear as the first word
    name = re.sub(r'^[!+\[\]]+', '', name).strip()  # Strip unwanted characters at the start
    
    # Remove emojis using a regex for unicode emoji ranges
    name = re.sub(r'[^\w\s,.-]', '', name)  # Remove non-alphanumeric characters (including emojis)

    # Final trim to remove leading/trailing spaces
    name = name.strip()
    
    return name

# Apply the cleaning function to the 'Name' column in the DataFrame

df_fridges['Name'] = df_fridges['Name'].apply(clean_name)

In [5]:
#Remove commas and any text from Price column
df_fridges["Price"] = df_fridges['Price'].str.replace(r'[^\d]', '', regex=True)
# Rename the Price column
df_fridges = df_fridges.rename(columns={'Price': 'Price(kshs)'})



#Remove brackets from Reviews column
df_fridges['Reviews'] = df_fridges['Reviews'].str.extract(r'(\d+)')
df_fridges.sample(20)

Unnamed: 0,Name,Price(kshs),Reviews
476,Royal Double Door 138L RF-150D Fridge,42995,0
3,New Volsmart 138L Fridge Freezer VL-BCD138 Ene...,25555,28
196,Haier 357L Double Door No-Frost Fridge HRF-385BS,95695,0
301,TODAY Hisense RS-12DR4SA Single Door Direct Co...,25999,0
102,"MIKA Fridge, 247L, 2 Door Top Mount Freezer, N...",69999,0
472,MIKA 90 LITRES SINGLE DOOR FRIDGE.,29999,0
475,150 litres Roch 190 si single door refrigerato...,39150,0
210,LG LinearCooling fridge GV-B212PLGB 217L Top F...,87999,0
320,Hisense Fridge 176 Liters With Dispenser REF17...,39999,0
24,Roch RFR-150-DT-I 118L Double Door Refrigerato...,28749,31


##### Remove duplicates

In [18]:
#Checking how many duplicates
duplicate_count = int(df_fridges.duplicated( keep=False).sum())
print(f"There are {duplicate_count} duplicates")

#Find all duplicates
duplicates = df_fridges[df_fridges.duplicated( keep=False)]
duplicates


There are 41 duplicates


Unnamed: 0,Name,Price(kshs),Reviews
20,RAMTONS 138 LITERS 2 DOOR DIRECT COOL 3 STAR F...,36990,1
64,"Mika Fridge, Double Door- 138 Litres",40000,0
84,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, D...",36999,0
87,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, D...",36999,0
95,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, N...",66500,0
96,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, N...",66500,0
97,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, N...",66500,0
111,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, D...",38999,0
116,AILYONS Double Door RefrigeratorFridge Top Mou...,35000,0
117,AILYONS Double Door RefrigeratorFridge Top Mou...,35000,0


In [22]:
# Remove duplicates, keeping the first occurrence
df_fridges = df_fridges.drop_duplicates()



##### Feature Extraction

In [26]:
# Extract the number of doors
def extract_doors(description):
    # Define the regex pattern to match numbers/keywords before "Door" or "Doors"
    pattern = r'\b(1|one|2|two|3|three|4|four|Single|Double)\b(?:\s*Doors)?'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    # Map matches to corresponding numeric values
    door_mapping = {
        "1": 1,
        "one": 1,
        "single": 1,
        "2": 2,
        "two": 2,
        "double": 2,
        "4": 4,
        "four": 4}
    if match:
        door_type = match.group(1).lower()  # Convert the match to lowercase
        return door_mapping.get(door_type, "Unknown")  # Map to the number of doors
    return "Unknown"  # If no match is found



# Extract capacity in litres
def extract_capacity(description):
    # Define the regex pattern
    pattern = r'(\d+(\.\d+)?)\s*(L|litres|ltrs|lt)'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    if match:
        return float(match.group(1))  # Return the number as float
    return None  # Return None if no match is found




#Extract brand names
brands = ['Volsmart','Hisense','Roch','Nunnix','Smartpro','Nunix','Ecomax','Ramtons','Mika','Von','Haier','Exzel','GLD','Vitron','Smartpro','Bruhm','Premier','Samsung', 'Ailyons', 'LG', 'Solstar', 'Royal','Beko','Syinix','ICECOOL','Rebune','Legacy','FK','Smart pro']
# Function to extract the brand name
def extract_brand(product_name):
    for brand in brands:
        if brand.lower() in product_name.lower():  # Case insensitive match
            return brand
    return 'Unknown'  # Return 'Unknown' if no brand is found





# Apply the extraction functions to the DataFrame
df_fridges["Doors"] = df_fridges["Name"].apply(extract_doors)
df_fridges['Capacity(ltrs)'] = df_fridges['Name'].apply(extract_capacity)
df_fridges['Brand'] = df_fridges['Name'].apply(extract_brand)


# Display the updated DataFrame
df_fridges 

In [31]:
# Standardize brand names (replace "Smart Pro" with "Smartpro")
df_fridges['Brand'] = df_fridges['Brand'].replace({'Smart pro': 'Smartpro','Nunnix': 'Nunix'})
df_fridges['Brand'].value_counts()

In [55]:
# Save to CSV
df_fridges.to_csv('fridges_clean.csv', index=True)

### Scraping Laptops data

In [48]:
# Initialize a list to store all laptops' data
laptops = []

# Loop through all 90 pages
for x in range(1, 91):  # Pages 1 to 90
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=laptop&page={x}&source=search|enterSearch|laptop')
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract laptop details from divs with the class "info-box".
        laptops_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for laptop_info in laptops_info:
            # Safely extract data, handle cases where tags are missing
            laptop_name = laptop_info.find('p', class_='product-title')
            laptop_price = laptop_info.find('div', class_='product-price')
            laptop_reviews = laptop_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            laptops.append({
                "Name": laptop_name.text.strip() if laptop_name else "N/A",
                "Price": laptop_price.text.strip() if laptop_price else "N/A",
                "Reviews": laptop_reviews.text.strip() if laptop_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")

# Print results
for laptop in laptops:
    print(laptop)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [49]:
#Save results as a DataFrame
df_laptops = pd.DataFrame(laptops)
df_laptops.sample(20)

Unnamed: 0,Name,Price,Reviews
2767,Lenovo Thinkpad X280 Intel Core I5-Refurbished-8GB RAM 256GB SSD 8th gen.,"KSh 26,000",(0)
1271,Refurbished Laptop HP Elitebook 840 G6 Laptop Intel Core i5 8th Gen @2.40GHz 8GB Ram 256GB SSD - 14'' Refurbished computer Notebook - Windows 10 Silver 14 inch,"KSh 31,199",(0)
64,"(NEW YEAR SALE!) Refurbished Hp EliteBook Folio 9470 Core I5 8GB RAM| 500GB HDD,14 ""Inch Display ,5th Generation ,Refurbished Laptop, Silver, Windows 10 Pro MS Office 2019 Silver 6 Months Warranty","KSh 18,999",(0)
858,Refurbished: HP ProBook 430 G5 - Intel Core i5 8th Gen | 8GB RAM | 256GB SSD | 13.3'' Anti-Glare Display | Lightweight Business Laptop,"KSh 25,999",(0)
1517,HP 840 G6 i7 8th Gen 16/512GB Touch Screen,"KSh 42,000",(0)
2737,refublished hp elitebook 840 g3 coi5 6th generation,"KSh 23,000",(0)
1040,"(Free Mouse)Refurbished Laptop Lenovo Thinkpad T470 Core i5 6th 8GB+256GB+14"" Windows 10 14"" FHD Display Bluetooth Webcam WiFi Intel Graphics 6 Months Warranty laptops","KSh 21,999",(0)
2275,"HP Refurbished 8440p Core I5 8GB RAM 500GB HDD 14"", Windows 11 PRO Activated, Office & Basic software Installed, Free Bag + Charger + Wireless Mouse + Mouse Pad","KSh 18,700",(0)
1365,"(HOT LIMITED OFFER)Hp Elitebook 830 G5 (8th Gen) - Intel Core i5 - 8GB RAM 256GB SSD ROM - 13.3"" Refurbished Laptop - Windows 10 6 Months Warranty Notebook","KSh 26,999",(0)
1383,"(OFFER OFFER) HP Probook 430 G3 Laptop - Intel Core i5 - 8GB Ram / 256GB SSD Rom ,6th gen ,Windows 10 Notebook Computer Black 13.3 inch","KSh 26,100",(0)


#### Data Cleaning and feature extraction

In [71]:
# Extract the screen size (integer or float)
def extract_screen_size(description):
    # Regex to capture floats/integers before "inch", "inches", or `"`
    match = re.search(r"(\d+\.\d+|\d+)(?=\s*(?:''|\"|inch|inches?))", description, re.IGNORECASE)
    # If a match is found, return the captured number
    # Regex to capture floats/integers directly before `"`
    match_quote = re.search(r'(\d+\.?\d*)\s*(?=")', description)
    if match:
        return float(match.group(1))  # Return match before "inch" or "inches"
    elif match_quote:
        return match_quote.group(1)  # Return match with `"`
    else:
        return np.nan  # Return NaN if no match is found
    



# Extract RAM
def extract_ram(name):
    match = re.search(r'(\d+GB)\s*RAM', name, re.IGNORECASE)  
    return match.group(1) if match else 'Unknown'



# Extract ROM (HDD/SSD)
def extract_rom(name):
     # Search for a word before "HDD" or "SSD" along with "HDD" or "SSD"
    match = re.search(r'(\b\w+\b)\s+(HDD|SSD)', name, re.IGNORECASE)
    return match.group(0) if match else 'Unknown'


# Apply the extraction/cleaning functions to the DataFrame
df_laptops['Name'] = df_laptops['Name'].apply(clean_name)
df_laptops['Screen_size'] = df_laptops['Name'].apply(extract_screen_size)
df_laptops['RAM']=df_laptops['Name'].apply(extract_ram)
df_laptops['ROM'] =df_laptops['Name'].apply(extract_rom)

In [50]:
df_laptops['Name'] = df_laptops['Name'].apply(clean_name)

In [69]:
# using pd.set_option() to widen the output display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df_laptops

Unnamed: 0,Name,Price,Reviews,Screen_size,RAM,ROM
0,"LENOVO YOGA 11E 4GB RAM 128GB SSD 2 IN 1 TOUCHSCREEN X360 LAPTOP REFURBISHED LAPTOP , BLACK , 12 INCH , INSTALLED WINDOWS 10 PRO AND FREE MOUSE","KSh 11,499",(149),12.0,4GB,128GB SSD
1,HP ProBook X360 11 G2 EE Core i5 7th Gen 8GB RAM 256GB SSD 11.6 Inches HD Touchscreen 1.2GHz up to 3.2GHz Dual Core Processor Windows 10 Pro 2 in 1 Convertible Slim Refurbished Laptop - 6 MNTHS WRNTY,"KSh 21,500",(0),11.6,8GB,256GB SSD
2,HP ProBook X360 11 G5 EE 9th Generation 4GB RAM 192GB SSD 11.6 Inches Touchscreen 1.1GHz up to 2.7GHz Dual Core Processor Windows 10 Pro 2 - in - 1 Convertible Refurbished Laptop,"KSh 17,000",(0),11.6,4GB,192GB SSD
3,"Lenovo ThinkPad L380 Yoga x360 Core I5 8th generation Quad core, 8GB RAM 256GB SSD 13.3 Inches FHD Touchscreen with a stylus Pen 2 in 1 Refurbished Laptop","KSh 29,999",(1),13.3,8GB,256GB SSD
4,Refurbished Macbook Air 2015 Silver 13 inch apple laptop,"KSh 32,500",(1),13.0,Unknown,Uknown
5,"Apple MacBook Pro 13.3 Core I5 2.5GHz 8GB RAM, 256GB SSD Early 2011 Laptop","KSh 25,000",(0),,8GB,256GB SSD
6,"REFURBISHED HP ELITEBOOK 8460P CORE INTEL I5 8GB RAM 500GB HDD COMPUTER LAPTOP ,WINDOWS 10 PRO FREE MOUSE","KSh 13,499",(69),,8GB,500GB HDD
7,Brand NEW Lenovo Ideapad 1 Celeron N4020 8GB RAM 256GB SSD 14 Inch HD display School Business Laptop Computer Notebook Windows 10 New Laptops Lenovo Laptop Computers,"KSh 31,999",(41),14.0,8GB,256GB SSD
8,Core i716gb512gb13.3 Touch Refurbished Hp 1040 G6 Laptop Core i7 Touchscreen X360 16GB RAM 512GB SSD Laptops 13.3 inch Computer Notebook,"KSh 42,999",(0),13.3,16GB,512GB SSD
9,"Refurbished NEC PC-VK25GVGU 360 Yoga FlexibilityTouchscreen Brilliance, Featherweight Performance, Core i5 7th Gen, 8GB RAM, 256GB SSD, Windows 10 Laptop","KSh 14,999",(10),,8GB,256GB SSD


In [None]:
#Remove commas and any text from Price column
df_laptops["Price"] = df_laptops['Price'].str.replace(r'[^\d]', '', regex=True)
# Rename the Price column
df_laptops = df_laptops.rename(columns={'Price': 'Price(kshs)'})


#Remove brackets from Reviews column
df_laptops['Reviews'] = df_laptops['Reviews'].str.extract(r'(\d+)')
df_laptops.sample(20)

##### Checking duplicates

In [None]:
#Checking how many duplicates
duplicate_count = int(df_fridges.duplicated( keep=False).sum())
print(f"There are {duplicate_count} duplicates")

#Find all duplicates
duplicates = df_fridges[df_fridges.duplicated( keep=False)]
duplicates


In [None]:
# using pd.set_option() to widen the output display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df

In [61]:
df_laptops.isnull().sum()

Name              0
Price             0
Reviews           0
Screen_size    1962
RAM               0
dtype: int64

In [64]:
df_laptops["RAM"].value_counts()

RAM
8GB        1444
Unknown     740
4GB         390
16GB        189
8gb          86
4gb          21
16gb         12
8Gb           8
12GB          7
58GB          5
32GB          3
2GB           3
6GB           2
16Gb          2
516GB         1
64GB          1
78GB          1
716GB         1
18GB          1
4Gb           1
Name: count, dtype: int64

In [65]:
# Function to harmonize RAM values
def harmonize_ram(ram):
    match = re.match(r'(\d+)\s*GB', ram, flags=re.IGNORECASE)  # Match numbers followed by GB
    return match.group(1) + 'GB' if match else ram  # Return harmonized "XGB" or original value

# Apply the harmonization function
df_laptops['RAM'] = df_laptops['RAM'].apply(harmonize_ram)

In [72]:
df_laptops["ROM"].value_counts()

ROM
256GB SSD                 931
Unknown                   606
500GB HDD                 511
128GB SSD                 169
512GB SSD                 156
GB SSD                     88
256 SSD                    50
256gb ssd                  42
1TB HDD                    23
GB HDD                     23
320GB HDD                  19
128 SSD                    19
128gb ssd                  18
500GB SSD                  14
512 SSD                    14
256gb SSD                  12
i58GB512GB SSD             11
1TB SSD                    11
500gb HDD                  11
500gb hdd                  10
500gb Hdd                   9
NVME SSD                    8
256 ssd                     7
750GB HDD                   7
256GB ssd                   6
500 HDD                     5
i58GB256GB SSD              5
256GB HDD                   5
i58gb256gb ssd              5
256 HDD                     5
8GB256GB SSD                5
Gen SSD                     4
i58GB128GB SSD              4
192GB 

In [None]:
# Save to CSV
df.to_csv('laptops_cleann.csv', index=True)

In [None]:
I need to filter using variations of offer and sale then remove the word
row 1..remove  (logic that removes every word in the brackets normal or curly brackets)2, 5,6,29,31, 33, 39, 40,45
Also logic that accomodates different letters whether small or capital-different variation of offers eg offers 
Add black friday, black friday offers, best deals,limited
Speciual case 36
