In [39]:
import pandas as pd
import re

def kilimall_microwaves_clean(csv_path):
    df = pd.read_csv(csv_path)
    df_cleaned = df.dropna()
    df.drop(columns = 'Unnamed: 0', inplace = True)
    df = df.drop_duplicates()
    df["price"] = df['microwaves_price'].str.strip('KSh,').str.replace(",","")
    df['number_of_reviews'] = df["microwaves_reviews"].str.extract(r'(\d+)').astype(int)

    unwanted_words = ['clearance', 'CLEARANCE', 'SALE', 'OFFER', 'Best', 'CHOOSE', 'Offers', 'QUALITY', 'THE', 'FUTURE',
                  'EMBRACE', 'LATEST', 'TREND', 'MEGASALE', 'Buy', 'NOW', 'AND', 'ENJOY', 'UPTO', 'NEW', 'IMPROVED', 
                  'STAY', 'LOCKED','ASSURANCE ', 'WITH', 'STOCK', 'kilimall', 'special', 'MAKE', 'YOUR', 'HOUSE', 'FEEL', 'LIKE', 
                  'Super','LUXURY FOR LESS' ,'CLERAANCE','deal','BUY','OFF', 'HOME With','quality', 'RESTOCKED', 'Share', 'this', 'product', 'Best', 'ARRIVALS', 'HURRY', 
                  'AND', 'PICK', 'YOURS', 'LIMITED', 'AN', 'NO', 'OTHER', 'PRICE', 'REDUCED', 'NOWBLACK', 'Angry', 
                  'mama', 'kitchen','EXPERIENCE', 'New', 'Arrival', 'Classy', 'sale', 'offer', 'best', 'discount', 
                  'cheap', 'deal', 'SALE','Promotions','OFFER','offer','TRUSTED','SOURCE','UPGRADE','THESE','TOP','DURABLE','LISTING','ON','Cooking','End','Original','OF','ALL','affordable']


    pattern = r'\b(?:' + '|'.join(map(re.escape, unwanted_words)) + r')\b'
    df['microwaves_name'] = df["microwaves_name"].str.replace(pattern, '', regex=True).str.strip()
     # Functions to clean the data
    def remove_brackets(column):
        return column.str.replace(r'\[.*?\]', '', regex=True)

    def clean_column(column):
        return column.str.replace(pattern, '', flags=re.IGNORECASE).str.replace(r'\s+', ' ', regex=True).str.strip()

    def remove_symbols(column):
        return column.str.replace(r'[!"+]', '', regex=True)
    
    df['microwaves_name'] = remove_brackets(df['microwaves_name'])
    df['microwaves_name'] = clean_column(df['microwaves_name'])
    df['microwaves_name'] = remove_symbols(df['microwaves_name'])

    def microwaves_clean_name(name):
        name = re.sub(r'[\(\)\{\}\[\]\"!]+', '', name)  # Remove parentheses, brackets, quotes, and exclamation marks
        name = re.sub(r'^\d+\s*', '', name)  # Remove numbers at the start of the string
        return name.strip()  # Remove leading and trailing whitespace
    df["microwaves_name"] = df["microwaves_name"].apply(microwaves_clean_name)
    # Extract brand name and clean it
    def clean_text(text):
        # Remove punctuation, emojis, and parentheses
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s*\([^)]*\)\s*', '', text)  # Remove parentheses and their content
        text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove emojis and non-ASCII characters
        return text.strip()
    df['microwaves_name'] = df ['microwaves_name'].apply(clean_text)
    def extract_brand_name(text):
        words = clean_text(text).split()[:5]  # Get first five words
        return ' '.join(words)
    def extract_capacity(text):
        if not isinstance(text, str):
            return None
        match = re.search(r'\b(\d+\.?\d*)\s*(L|Ltrs|Liters|Litres)\b', text, re.IGNORECASE)
        return match.group(1) if match else None
    def remove_parentheses(text):
        if not isinstance(text, str):
            return text
        return re.sub(r'\s*\([^)]*\)\s*', '', text)
    def extract_brand(microwaves_name):
        # Split the name into words and take the first two or three
        words = microwaves_name.split()
        return ' '.join(words[:3])  
    df["brand"] = df["microwaves_name"].apply(extract_brand)
    df['microwaves_name'] = df['microwaves_name'].apply(extract_brand_name) 
    df['capacity'] = df['microwaves_name'].apply(extract_capacity) 
    df["description"] = df["microwaves_name"]
    df["source"] = ["kilimall"] *len(df)
    df["id"] = [i for i in range(1,len(df)+1)]
    df['urls'] = df['microwaves_url']
    df = df.drop(columns = ["microwaves_name","microwaves_reviews","microwaves_price","microwaves_url"])
    columns = ["id","description",'brand','price','number_of_reviews','capacity','source','urls']
    df = df[columns]
    df.to_csv("../data/clean/kilimall_clean_microwaves.csv", index = False)
    return df




csv_path = (r"C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\scraped\kilimall_microwaves_scraped.csv")
df = kilimall_microwaves_clean(csv_path)
df





Unnamed: 0,id,description,brand,price,number_of_reviews,capacity,source,urls
0,1,Hisense Microwave Oven H20MOWS11 700W,Hisense Microwave Oven,8899,372,,kilimall,https://www.kilimall.co.ke/listing/2158891-his...
1,2,SmartPro 20Ltrs Digital Microwave Oven,SmartPro 20Ltrs Digital,8549,17,20,kilimall,https://www.kilimall.co.ke/listing/1000360221-...
2,3,Roch 20Ltrs Digital Microwave Oven,Roch 20Ltrs Digital,8499,46,20,kilimall,https://www.kilimall.co.ke/listing/2748812-roc...
3,4,Hisense 20L Digital Microwave Oven,Hisense 20L Digital,8790,0,20,kilimall,https://www.kilimall.co.ke/listing/1001005503-...
4,5,SmartPro 20Ltrs Digital Microwave Oven,SmartPro 20Ltrs Digital,8199,2,20,kilimall,https://www.kilimall.co.ke/listing/1000360181-...
...,...,...,...,...,...,...,...,...
1507,1505,Eurochef EGT553G1EE 5550 3 Gas,Eurochef EGT553G1EE 5550,25999,0,,kilimall,https://www.kilimall.co.ke/listing/1000221056-...
1508,1506,Eurochef Full gas Free Standing,Eurochef Full gas,19499,0,,kilimall,https://www.kilimall.co.ke/listing/1000288798-...
1509,1507,Nunix 60x60 31 Electric Free,Nunix 60x60 31,25999,0,,kilimall,https://www.kilimall.co.ke/listing/1000248307-...
1510,1508,Eurochef Full Gas Free Standing,Eurochef Full Gas,20599,0,,kilimall,https://www.kilimall.co.ke/listing/1000221052-...


In [2]:
df = pd.read_csv(r"C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\scraped\kilimall_microwaves_scraped.csv")

In [3]:
print(df)

                                        microwaves_name microwaves_reviews  \
0     Hisense Microwave Oven H20MOWS11 700W 20L Digi...              (370)   
1     SmartPro 20Ltrs Digital Microwave Oven High Qu...               (17)   
2     Roch 20Ltrs Digital Microwave Oven High Qualit...               (46)   
3     CLEARANCE SALE!! Hisense 20L Digital Microwave...                (0)   
4     SmartPro 20Ltrs Digital Microwave Oven High Qu...                (2)   
...                                                 ...                ...   
1495  CLEARANCE OFFER Eurochef EGT55-3G1E-E 55*50 3 ...                (0)   
1496  CLEARANCE OFFER Eurochef Full gas Free Standin...                (0)   
1497  CLEARANCE OFFER Nunix 60x60 3+1 Electric Free ...                (0)   
1498  CLEARANCE OFFER Eurochef Full Gas Free Standin...                (0)   
1499  3+1 Eurochef Standing CookerEurochef 3 Gas 1 e...                (0)   

     microwaves_price   microwaves_links  
0           KSh 8,89

In [4]:
df_cleaned = df.dropna()

In [5]:
df_cleaned = df.drop_duplicates()
print(df_cleaned.head())

                                     microwaves_name microwaves_reviews  \
0  Hisense Microwave Oven H20MOWS11 700W 20L Digi...              (370)   
1  SmartPro 20Ltrs Digital Microwave Oven High Qu...               (17)   
2  Roch 20Ltrs Digital Microwave Oven High Qualit...               (46)   
3  CLEARANCE SALE!! Hisense 20L Digital Microwave...                (0)   
4  SmartPro 20Ltrs Digital Microwave Oven High Qu...                (2)   

  microwaves_price   microwaves_links  
0        KSh 8,899  No link available  
1        KSh 8,549  No link available  
2        KSh 8,499  No link available  
3        KSh 8,790  No link available  
4        KSh 8,199  No link available  


In [6]:
df.shape

(1500, 4)

In [7]:
df['microwaves_price']= df['microwaves_price'].str.strip('KSh,')
df

Unnamed: 0,microwaves_name,microwaves_reviews,microwaves_price,microwaves_links
0,Hisense Microwave Oven H20MOWS11 700W 20L Digi...,(370),8899,No link available
1,SmartPro 20Ltrs Digital Microwave Oven High Qu...,(17),8549,No link available
2,Roch 20Ltrs Digital Microwave Oven High Qualit...,(46),8499,No link available
3,CLEARANCE SALE!! Hisense 20L Digital Microwave...,(0),8790,No link available
4,SmartPro 20Ltrs Digital Microwave Oven High Qu...,(2),8199,No link available
...,...,...,...,...
1495,CLEARANCE OFFER Eurochef EGT55-3G1E-E 55*50 3 ...,(0),25999,No link available
1496,CLEARANCE OFFER Eurochef Full gas Free Standin...,(0),19499,No link available
1497,CLEARANCE OFFER Nunix 60x60 3+1 Electric Free ...,(0),25999,No link available
1498,CLEARANCE OFFER Eurochef Full Gas Free Standin...,(0),20599,No link available


In [8]:
df['microwaves_reviews'] = df['microwaves_reviews'].str.extract(r'(\d+)').astype(int)
df

Unnamed: 0,microwaves_name,microwaves_reviews,microwaves_price,microwaves_links
0,Hisense Microwave Oven H20MOWS11 700W 20L Digi...,370,8899,No link available
1,SmartPro 20Ltrs Digital Microwave Oven High Qu...,17,8549,No link available
2,Roch 20Ltrs Digital Microwave Oven High Qualit...,46,8499,No link available
3,CLEARANCE SALE!! Hisense 20L Digital Microwave...,0,8790,No link available
4,SmartPro 20Ltrs Digital Microwave Oven High Qu...,2,8199,No link available
...,...,...,...,...
1495,CLEARANCE OFFER Eurochef EGT55-3G1E-E 55*50 3 ...,0,25999,No link available
1496,CLEARANCE OFFER Eurochef Full gas Free Standin...,0,19499,No link available
1497,CLEARANCE OFFER Nunix 60x60 3+1 Electric Free ...,0,25999,No link available
1498,CLEARANCE OFFER Eurochef Full Gas Free Standin...,0,20599,No link available


In [9]:


# List of unwanted words (make sure all variations are included)
unwanted_words = ['clearance', 'CLEARANCE', 'SALE', 'OFFER', 'Best', 'CHOOSE', 'Offers', 'QUALITY', 'THE', 'FUTURE',
                  'EMBRACE', 'LATEST', 'TREND', 'MEGASALE', 'Buy', 'NOW', 'AND', 'ENJOY', 'UPTO', 'NEW', 'IMPROVED', 
                  'STAY', 'LOCKED', 'WITH', 'STOCK', 'kilimall', 'special', 'MAKE', 'YOUR', 'HOUSE', 'FEEL', 'LIKE', 
                  'Super', 'deal', 'quality', 'RESTOCKED', 'Share', 'this', 'product', 'Best', 'ARRIVALS', 'HURRY', 
                  'AND', 'PICK', 'YOURS', 'LIMITED', 'AN', 'NO', 'OTHER', 'PRICE', 'REDUCED', 'NOWBLACK', 'Angry', 
                  'mama', 'kitchen','EXPERIENCE', 'New', 'Arrival', 'Classy', 'sale', 'offer', 'best', 'discount', 
                  'cheap', 'deal', 'SALE','Promotions','OFFER','offer','TRUSTED','SOURCE','UPGRADE','THESE','TOP','DURABLE','LISTING','ON','Cooking','End','Original','OF','ALL','affordable']

# Convert the unwanted_words list to lowercase for case-insensitive comparison
unwanted_words = set(word.lower() for word in unwanted_words)

# Function to remove unwanted words from a text
def remove_unwanted_words(text):
    if not isinstance(text, str):
        return text  # Return as is if not a string
    words = text.split()  # Split the text into individual words
    cleaned_words = [word for word in words if word.lower() not in unwanted_words]  # Filter out unwanted words
    return ' '.join(cleaned_words)  # Join the cleaned words back into a string

file_path =(r"C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\scraped\kilimall_microwaves_scraped.csv") # Replace with your actual file path
df = pd.read_csv(file_path)

# Apply the function to the relevant column (e.g., 'microwaves_name')
df['microwaves_name'] = df['microwaves_name'].apply(remove_unwanted_words)

# Save the updated DataFrame to a new CSV file
output_file_path = 'kilimall_clean_microwaves.csv'
df.to_csv(output_file_path, index=False)


In [10]:


# Load the CSV file into a DataFrame
file_path = 'kilimall_clean_microwaves.csv'  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Function to clean text
def clean_text(text):
    # Remove punctuation, emojis, and parentheses
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s*\([^)]*\)\s*', '', text)  # Remove parentheses and their content
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove emojis and non-ASCII characters
    return text.strip()
  
# Extract brand name and clean it
def extract_brand_name(text):
    words = clean_text(text).split()[:5]  # Get first five words
    return ' '.join(words)
def extract_capacity(text):
    if not isinstance(text, str):
        return None
    match = re.search(r'\b(\d+\.?\d*)\s*(L|Ltrs|Liters|Litres)\b', text, re.IGNORECASE)
    return match.group(1) if match else None
def remove_parentheses(text):
    if not isinstance(text, str):
        return text
    return re.sub(r'\s*\([^)]*\)\s*', '', text)

# Apply the function to the 'microwave_reviews' column


# Apply the brand name extraction
df['microwaves_name'] = df['microwaves_name'].apply(extract_brand_name)  # Replace 'ColumnName' with the relevant column name
df['Capacity'] = df['microwaves_name'].apply(extract_capacity) 
# Lines to remove
lines_to_remove = [*range(12, 19), 83, 84, 127, 128, 131, *range(155, 160), *range(608, 624), 
                   *range(652, 658), *range(675, 683), *range(686, 692), 461, 463, *range(187, 197), 
                   *range(257, 267), 275, 342, 343, *range(769, 806), *range(832, 839)]

# Drop specified rows (subtract 1 to account for zero-based index in Python)
df.drop(index=[i-1 for i in lines_to_remove], inplace=True, errors='ignore')
# Rename the columns
df.columns = ['brand', 'reviews', 'price', 'microwaves_url','capacity']   
# Save the cleaned DataFrame to a new CSV file
output_file_path = 'kilimall_clean_microwaves.csv'
df.to_csv(output_file_path, index=False)

print("Data processing complete. Cleaned file saved as:", output_file_path)


Data processing complete. Cleaned file saved as: kilimall_clean_microwaves.csv


In [11]:
df

Unnamed: 0,brand,reviews,price,microwaves_url,capacity
0,Hisense Microwave Oven H20MOWS11 700W,(370),"KSh 8,899",No link available,
1,SmartPro 20Ltrs Digital Microwave Oven,(17),"KSh 8,549",No link available,20
2,Roch 20Ltrs Digital Microwave Oven,(46),"KSh 8,499",No link available,20
3,SALE Hisense 20L Digital Microwave,(0),"KSh 8,790",No link available,20
4,SmartPro 20Ltrs Digital Microwave Oven,(2),"KSh 8,199",No link available,20
...,...,...,...,...,...
1495,Eurochef EGT553G1EE 5550 3 Gas,(0),"KSh 25,999",No link available,
1496,Eurochef Full gas Free Standing,(0),"KSh 19,499",No link available,
1497,Nunix 60x60 31 Electric Free,(0),"KSh 25,999",No link available,
1498,Eurochef Full Gas Free Standing,(0),"KSh 20,599",No link available,


In [12]:
df['price']= df['price'].str.strip('KSh,')
df

Unnamed: 0,brand,reviews,price,microwaves_url,capacity
0,Hisense Microwave Oven H20MOWS11 700W,(370),8899,No link available,
1,SmartPro 20Ltrs Digital Microwave Oven,(17),8549,No link available,20
2,Roch 20Ltrs Digital Microwave Oven,(46),8499,No link available,20
3,SALE Hisense 20L Digital Microwave,(0),8790,No link available,20
4,SmartPro 20Ltrs Digital Microwave Oven,(2),8199,No link available,20
...,...,...,...,...,...
1495,Eurochef EGT553G1EE 5550 3 Gas,(0),25999,No link available,
1496,Eurochef Full gas Free Standing,(0),19499,No link available,
1497,Nunix 60x60 31 Electric Free,(0),25999,No link available,
1498,Eurochef Full Gas Free Standing,(0),20599,No link available,


In [13]:
def retain_text_in_parentheses(text):
    if not isinstance(text, str):
        return text  # Return as is if not a string
    return re.sub(r'[()]', '', text)  # Remove only parentheses

# Apply the function to the desired column
df['reviews'] = df['reviews'].apply(retain_text_in_parentheses)
df

Unnamed: 0,brand,reviews,price,microwaves_url,capacity
0,Hisense Microwave Oven H20MOWS11 700W,370,8899,No link available,
1,SmartPro 20Ltrs Digital Microwave Oven,17,8549,No link available,20
2,Roch 20Ltrs Digital Microwave Oven,46,8499,No link available,20
3,SALE Hisense 20L Digital Microwave,0,8790,No link available,20
4,SmartPro 20Ltrs Digital Microwave Oven,2,8199,No link available,20
...,...,...,...,...,...
1495,Eurochef EGT553G1EE 5550 3 Gas,0,25999,No link available,
1496,Eurochef Full gas Free Standing,0,19499,No link available,
1497,Nunix 60x60 31 Electric Free,0,25999,No link available,
1498,Eurochef Full Gas Free Standing,0,20599,No link available,


In [14]:
df['price'] = df['price'].replace({',': ''}, regex=True)

df['price'] = pd.to_numeric(df['price'])

In [15]:
df

Unnamed: 0,brand,reviews,price,microwaves_url,capacity
0,Hisense Microwave Oven H20MOWS11 700W,370,8899,No link available,
1,SmartPro 20Ltrs Digital Microwave Oven,17,8549,No link available,20
2,Roch 20Ltrs Digital Microwave Oven,46,8499,No link available,20
3,SALE Hisense 20L Digital Microwave,0,8790,No link available,20
4,SmartPro 20Ltrs Digital Microwave Oven,2,8199,No link available,20
...,...,...,...,...,...
1495,Eurochef EGT553G1EE 5550 3 Gas,0,25999,No link available,
1496,Eurochef Full gas Free Standing,0,19499,No link available,
1497,Nunix 60x60 31 Electric Free,0,25999,No link available,
1498,Eurochef Full Gas Free Standing,0,20599,No link available,


In [16]:
df.to_csv(r"C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\clean\kilimall_clean_microwaves.csv", sep=',', index=False, encoding='utf-8')


In [17]:
file_path = 'kilimall_clean_microwaves.csv'  
df = pd.read_csv(file_path)
words_to_remove = [
    'SALE','SPECIAL','QUALITY ','ASSURANCE','OFFERS' ,'OFFERS','OFER','LTR','liters', 'Analog','litre','Litres','CLEARANCE','official','store','Crazy','Deal','AT','SALE','L','DICOUNTED','ltrs','ARRIVALS','Signal','NEW','ARRIVAL','LIMITED','OF','LISTING','Promotions','OFFER','Offer','TRUSTED','SOURCE','UPGRADE','THESE','TOP','DURABLE','LISTING','ON','Cooking','End','Original','OF','ALL','affordable'
]

# Create a regex pattern to match any of the words in the list
pattern = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'

# Function to clean the brand column
def clean_brand(text):
    # Remove unwanted words
    cleaned = re.sub(pattern, '', text, flags=re.IGNORECASE)
    # Remove numeric characters
    cleaned = re.sub(r'\d+', '', cleaned)
    # Remove extra spaces and strip
    return ' '.join(cleaned.split()).strip()

# Apply the cleaning function to the brand column
df['brand'] = df['brand'].apply(clean_brand)
df.to_csv(r'C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\clean\kilimall_clean_microwaves.csv', sep=',', index=False, encoding='utf-8')


In [18]:
df['source'] = 'kilimall'
df.to_csv(r"C:\Users\Vivian.Obino1\Desktop\e-commerce analysis\data\clean\kilimall_clean_microwaves.csv", index=False)