## 1. Importing libraries

In [11]:
import pandas as pd
import numpy as np
import re

## 2. Exploratory Data Analysis

I have two categories of cookers dataset:
- Cooktops
- Standing Cooker

### Cooktops

In [4]:
# load the csv data
cooktops_df = pd.read_csv('kilimall_cooktops_no_rating.csv')

# preview the df
cooktops_df.head()


Unnamed: 0,cooktop_name,cooktop_reviews,cooktop_price
0,【HOT！!】AILYONS/LYONS GS014-4 Glass Top Infrare...,(659),"KSh 2,199"
1,IPCONE 7102 Double Gas Cooker Auto lgnition D...,(209),"KSh 1,399"
2,【HOT！】AILYONS/LYONS GS017 Gas Cooker Double Bu...,(655),"KSh 1,428"
3,IPCONE 2 Gas Cooker 02 Glass Top Double Burne...,(64),"KSh 1,999"
4,AILYONS GS014-1/GS005A-1 Glass Top Infrared Do...,(1131),"KSh 2,299"


In [9]:
# check df shape
cooktops_df.shape

(4869, 4)

The 'cooktop_name' column contains emojis, the code below performs cleaning usin the Regular Expresion python module (RE).

- The regex pattern [^\w\s,.-] matches anything that is not a word character (\w), whitespace (\s), comma (,), period (.), or hyphen (-). This helps remove emojis and special characters while keeping alphanumeric characters and common punctuation.

In [5]:

# Function to remove emojis and special characters
def remove_emojis(text):
    return re.sub(r'[^\w\s,.-]', '', text)

# Apply the function to 'cooktop_name'
cooktops_df['clean_cooktop_name'] = cooktops_df['cooktop_name'].apply(remove_emojis)

# Save the 'clean_cooktop_name' column to a CSV file
cooktops_df[['clean_cooktop_name']].to_csv('no_emojis_cooktop_name_column.csv', index=False)

cooktops_df.head()


Unnamed: 0,cooktop_name,cooktop_reviews,cooktop_price,clean_cooktop_name
0,【HOT！!】AILYONS/LYONS GS014-4 Glass Top Infrare...,(659),"KSh 2,199",HOTAILYONSLYONS GS014-4 Glass Top Infrared Dou...
1,IPCONE 7102 Double Gas Cooker Auto lgnition D...,(209),"KSh 1,399",IPCONE 7102 Double Gas Cooker Auto lgnition D...
2,【HOT！】AILYONS/LYONS GS017 Gas Cooker Double Bu...,(655),"KSh 1,428",HOTAILYONSLYONS GS017 Gas Cooker Double Burner...
3,IPCONE 2 Gas Cooker 02 Glass Top Double Burne...,(64),"KSh 1,999",IPCONE 2 Gas Cooker 02 Glass Top Double Burne...
4,AILYONS GS014-1/GS005A-1 Glass Top Infrared Do...,(1131),"KSh 2,299",AILYONS GS014-1GS005A-1 Glass Top Infrared Dou...


In [12]:
# statistical description of the df
cooktops_df.describe()

Unnamed: 0,cooktop_name,cooktop_reviews,cooktop_price,clean_cooktop_name
count,4869,4869,4869,4869
unique,3254,64,1320,3246
top,Electric Cooker / Single Spiral Coil Hotplate,(0),"KSh 2,999",Electric Cooker Single Spiral Coil Hotplate
freq,49,4408,118,49


- In the above statistics, most cooktops on the website has 0 reviews, 4408 out of the total 4896 products.
- In pricing, the most frequent price was Ksh 2,999 that was repeated 118 times across the whole product list.

Check for duplicates

In [15]:
# check for duplicates across all columns
cooktop_duplicate_rows = cooktops_df[cooktops_df.duplicated()]
#cooktop_duplicate_rows

print(f'Number of duplicate rows: {cooktop_duplicate_rows.shape[0]}')


Number of duplicate rows: 492


In [16]:
# handling duplicates
cooktop_no_dupes_df = cooktops_df.drop_duplicates(subset='clean_cooktop_name', keep='first')

- The above code retains the the first occurrence of each row, indexed by 'clean_cooktop_name' column of the df.

In [17]:
# recheck duplicates
print(cooktop_no_dupes_df['clean_cooktop_name'].duplicated().sum())


0


- The duplicates have been successfully handled, with the returned sum of duplicates as 0.

In [18]:
#check the shape to confirm number of rows after removing duplicates
cooktop_no_dupes_df.shape

(3246, 4)

- There are 3246 rows down from 4869 rows before handling duplicates.

Create a list of available brands and types to help with creating additional columns in the df

In [19]:
# list of available brands
cooktop_brands = ['generic', 'nunix', 'ailyons', 'eurochef', 'rashnik', 'sokany', 'ramtons', 'eurochef', 'mara', 'premier', 'sweet home', 'edison', 'sayona', 'roch', 'silvercrest', 'hisense', 'ipcone', 'kitchen37', 'toseeu', 'amaze', 'microsoft lumia', 'fashion king', 'mika', 'rebune', 'annov', 'euroken', 'hotpoint', 'jamesport', 'jtc', 'jikokoa', 'lenovo', 'sterling', 'u7', 'vitron', 'fenghua', '& other fairies', 'ahitar', 'bosch', 'gt sonic', 'rebune', 'thl', 'vention', 'weiqin', 'kilimall', 'armco', 'aucma', 'alldocube', 'amazon', 'androidly']
cooktop_types = ['gas', 'electric', 'electric and gas', 'not specified']

print('Available number of cooktop brands from kilimall websites: ', len(cooktop_brands))
print(f'There are {len(cooktop_type)} different types of cooktops ')


Available number of cooktop brands from kilimall websites:  49
There are 4 different types of cooktops 


In [20]:
# extract brand and type from "clean_cooktop_name" using above lists

# 1. functions to match brands
def match_brand(cooktop_name):
    for brand in cooktop_brands:
        if brand.lower() in cooktop_name.lower():
            return brand
    return 'Unknown'


# 2. function to match cooktop type
def match_type(cooktop_name):
    for cooktop_type in cooktop_types:
        if cooktop_type.lower() in cooktop_name.lower():
            return cooktop_type
    return 'Unknown'

In [22]:
# Apply the functions to the DataFrame

cooktop_no_dupes_df.loc[:, 'brand'] = cooktop_no_dupes_df['clean_cooktop_name'].apply(match_brand)
cooktop_no_dupes_df.loc[:, 'cooktop_type'] = cooktop_no_dupes_df['clean_cooktop_name'].apply(match_type)
cooktop_no_dupes_df.head()

Unnamed: 0,cooktop_name,cooktop_reviews,cooktop_price,clean_cooktop_name,brand,cooktop_type
0,【HOT！!】AILYONS/LYONS GS014-4 Glass Top Infrare...,(659),"KSh 2,199",HOTAILYONSLYONS GS014-4 Glass Top Infrared Dou...,ailyons,gas
1,IPCONE 7102 Double Gas Cooker Auto lgnition D...,(209),"KSh 1,399",IPCONE 7102 Double Gas Cooker Auto lgnition D...,ipcone,gas
2,【HOT！】AILYONS/LYONS GS017 Gas Cooker Double Bu...,(655),"KSh 1,428",HOTAILYONSLYONS GS017 Gas Cooker Double Burner...,ailyons,gas
3,IPCONE 2 Gas Cooker 02 Glass Top Double Burne...,(64),"KSh 1,999",IPCONE 2 Gas Cooker 02 Glass Top Double Burne...,ipcone,gas
4,AILYONS GS014-1/GS005A-1 Glass Top Infrared Do...,(1131),"KSh 2,299",AILYONS GS014-1GS005A-1 Glass Top Infrared Dou...,ailyons,gas


- The 'SettingWithCopyWarning' occurs in pandas when you try to modify a DataFrame that is a "view" of another DataFrame, rather than a copy. This can lead to unexpected results because changes made might not apply to the original DataFrame as intended. The warning is corrected by using '.loc' or creating a copy of the 'view' dataframe

In [23]:
# shape
cooktop_no_dupes_df.shape

(3246, 6)

In [26]:
# exploring 'cooktop_no_dupes'

cooktop_unique_brands = cooktop_no_dupes_df["brand"].unique()
print(cooktop_unique_brands)

#number of unique counts
cooktop_unique_brands_counts = cooktop_no_dupes_df["brand"].value_counts()
print("Frequency of available brands:\n", cooktop_unique_brands_counts)




['ailyons' 'ipcone' 'nunix' 'eurochef' 'rashnik' 'jikokoa' 'Unknown'
 'generic' 'sokany' 'ramtons' 'rebune' 'amaze' 'premier' 'sayona' 'roch'
 'annov' 'mika' 'silvercrest' 'hotpoint' 'hisense' 'sterling']
Frequency of available brands:
 brand
Unknown        1114
nunix           740
eurochef        324
ailyons         220
rashnik         199
sokany          136
jikokoa         105
generic          84
ipcone           76
amaze            76
ramtons          64
roch             43
premier          24
rebune           18
sayona            7
mika              6
silvercrest       4
annov             2
hisense           2
hotpoint          1
sterling          1
Name: count, dtype: int64


### Standing Cooker

In [27]:
# load the csv data
standingcooker_df = pd.read_csv('kilimall_standing_cooker_no_rating.csv')

# preview the df
standingcooker_df.head()


Unnamed: 0,standing_cooker_name,standing_cooker_reviews,standing_cooker_price
0,Volsmart 4 Burners VGS-580 Free Standing Gas C...,(241),"KSh 18,299"
1,(Special Offer)Nunix KZ-560-3G1E Free Stand...,(223),"KSh 23,899"
2,SPECIAL OFFER Eurochef EGT-55-3G1E-E 3 Gas Bur...,(39),"KSh 22,100"
3,Nunix KZ-560-GO Free Standing 4 Gas Burners Ov...,(157),"KSh 19,799"
4,Nunix 3 Gas Burners 1 Electric Standing Gas C...,(18),"KSh 9,299"


In [28]:
# check df shape
standingcooker_df.shape

(1151, 3)

In [29]:
# Function to remove emojis and special characters
def remove_emojis(text):
    return re.sub(r'[^\w\s,.-]', '', text)

# Apply the function to 'standing_cooker_name'
standingcooker_df['clean_standingcooker_name'] = standingcooker_df['standing_cooker_name'].apply(remove_emojis)

# Save the 'clean_standingcooker_name' column to a CSV file
standingcooker_df[['clean_standingcooker_name']].to_csv('no_emojis_standingcooker_name_column.csv', index=False)

standingcooker_df.head()

Unnamed: 0,standing_cooker_name,standing_cooker_reviews,standing_cooker_price,clean_standingcooker_name
0,Volsmart 4 Burners VGS-580 Free Standing Gas C...,(241),"KSh 18,299",Volsmart 4 Burners VGS-580 Free Standing Gas C...
1,(Special Offer)Nunix KZ-560-3G1E Free Stand...,(223),"KSh 23,899",Special OfferNunix KZ-560-3G1E Free Standin...
2,SPECIAL OFFER Eurochef EGT-55-3G1E-E 3 Gas Bur...,(39),"KSh 22,100",SPECIAL OFFER Eurochef EGT-55-3G1E-E 3 Gas Bur...
3,Nunix KZ-560-GO Free Standing 4 Gas Burners Ov...,(157),"KSh 19,799",Nunix KZ-560-GO Free Standing 4 Gas Burners Ov...
4,Nunix 3 Gas Burners 1 Electric Standing Gas C...,(18),"KSh 9,299",Nunix 3 Gas Burners 1 Electric Standing Gas C...


In [30]:
# statistical description of the df
standingcooker_df.describe()

Unnamed: 0,standing_cooker_name,standing_cooker_reviews,standing_cooker_price,clean_standingcooker_name
count,1151,1151,1151,1151
unique,955,17,629,953
top,Nunix Free Standing 4 Gas Burner Cooker With Oven,(0),"KSh 32,999",Nunix Free Standing 4 Gas Burner Cooker With Oven
freq,21,1097,31,21


- In the above statistics, most cooktops on the website has 0 reviews, 1097 out of the total 1151 products.
- In pricing, the most frequent price was Ksh 32,999 that was repeated 31 times across the whole product list.

Check for duplicates

In [31]:
standingcooker_df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1146    False
1147    False
1148    False
1149    False
1150    False
Length: 1151, dtype: bool

In [32]:
# check for duplicates across all columns
standingcooker_duplicate_rows = standingcooker_df[standingcooker_df.duplicated()]


print(f'Number of duplicate rows: {standingcooker_duplicate_rows.shape[0]}')

Number of duplicate rows: 55


In [33]:
# handling duplicates
standingcooker_no_dupes_df = standingcooker_df.drop_duplicates(subset='clean_standingcooker_name', keep='first')

# recheck duplicates
print(standingcooker_no_dupes_df['clean_standingcooker_name'].duplicated().sum())

0


- The duplicates have been successfully handled, with the returned sum of duplicates as 0.

In [34]:
#check the shape to confirm number of rows after removing duplicates
standingcooker_no_dupes_df.shape

(953, 4)

- There are 953 rows down from 1151 rows before handling duplicates.

Create a list of available brands and types to help with creating additional columns in the df

In [40]:
# list of available brands
standingcooker_brands = ['generic', 'nunix', 'mika', 'hotpoint', 'eurochef', 'ramtons', 'premier', 'volsmart', 'sayona', 'haier', 'hisense', 'roch', 'bruhm', 'euroken', 'ailyons', 'amaze', 'icecool', 'exzel', 'lg', 'rebune']
standingcooker_types = ['3 gas+1 electric', '4 gas', '2 gas+2 electric']
oven_capacities = ['40-60 l', 'without oven', '30-40 l', '10-20 l']

print('Available number of standing cooker brands from kilimall website: ', len(standingcooker_brands))
print(f'There are {len(standingcooker_types)} different types of cooktops ')
print(f'There are {len(oven_capacities)} different oven capacities')

Available number of standing cooker brands from kilimall website:  20
There are 3 different types of cooktops 
There are 4 different oven capacities


In [41]:
# extract brand, type and oven capacity from "clean_standingcooker_name" using above lists

# 1. function to match brands
def match_standingcooker_brand(standing_cooker_name):
    for brand in standingcooker_brands:
        if brand.lower() in standing_cooker_name.lower():
            return brand
    return 'Unknown'


# 2. function to match standing cooker type
def match_standingcooker_type(standing_cooker_name):
    for standingcooker_type in standingcooker_types:
        if standingcooker_type.lower() in standing_cooker_name.lower():
            return standingcooker_type
    return 'Unknown'

# 3. function to match oven capacity
def match_capacity(standing_cooker_name):
    for capacity in oven_capacities:
        if capacity.lower() in standing_cooker_name.lower():
            return capacity
    return 'Unknown'

In [43]:
# Apply the functions to the DataFrame

standingcooker_no_dupes_df = standingcooker_no_dupes_df.copy()

# Now safely modify the DataFrame
standingcooker_no_dupes_df['brand'] = standingcooker_no_dupes_df['clean_standingcooker_name'].apply(match_standingcooker_brand)
standingcooker_no_dupes_df['standing_cooker_type'] = standingcooker_no_dupes_df['clean_standingcooker_name'].apply(match_standingcooker_type)
standingcooker_no_dupes_df['oven_capacity'] = standingcooker_no_dupes_df['clean_standingcooker_name'].apply(match_capacity)


In [45]:
standingcooker_no_dupes_df.head()

Unnamed: 0,standing_cooker_name,standing_cooker_reviews,standing_cooker_price,clean_standingcooker_name,brand,standing_cooker_type,oven_capacity
0,Volsmart 4 Burners VGS-580 Free Standing Gas C...,(241),"KSh 18,299",Volsmart 4 Burners VGS-580 Free Standing Gas C...,volsmart,4 gas,Unknown
1,(Special Offer)Nunix KZ-560-3G1E Free Stand...,(223),"KSh 23,899",Special OfferNunix KZ-560-3G1E Free Standin...,nunix,Unknown,Unknown
2,SPECIAL OFFER Eurochef EGT-55-3G1E-E 3 Gas Bur...,(39),"KSh 22,100",SPECIAL OFFER Eurochef EGT-55-3G1E-E 3 Gas Bur...,eurochef,Unknown,Unknown
3,Nunix KZ-560-GO Free Standing 4 Gas Burners Ov...,(157),"KSh 19,799",Nunix KZ-560-GO Free Standing 4 Gas Burners Ov...,nunix,4 gas,Unknown
4,Nunix 3 Gas Burners 1 Electric Standing Gas C...,(18),"KSh 9,299",Nunix 3 Gas Burners 1 Electric Standing Gas C...,nunix,Unknown,Unknown


In [46]:
standingcooker_no_dupes_df.describe()

Unnamed: 0,standing_cooker_name,standing_cooker_reviews,standing_cooker_price,clean_standingcooker_name,brand,standing_cooker_type,oven_capacity
count,953,953,953,953,953,953,953
unique,953,17,554,953,19,2,1
top,"VON VAC6SV31UY - 60*55, 3 Gas+1 Electric Stand...",(0),"KSh 32,999","VON VAC6SV31UY - 6055, 3 Gas1 Electric Standin...",Unknown,Unknown,Unknown
freq,1,903,27,1,273,811,953


In [47]:

standingcooker_unique_brands = standingcooker_no_dupes_df["brand"].unique()
print(standingcooker_unique_brands)

#number of unique counts
standingcooker_unique_brands_counts = standingcooker_no_dupes_df["brand"].value_counts()
print("Frequency of available brands:\n", standingcooker_unique_brands_counts)

['volsmart' 'nunix' 'eurochef' 'sayona' 'mika' 'premier' 'Unknown'
 'ramtons' 'hisense' 'bruhm' 'haier' 'hotpoint' 'roch' 'euroken' 'amaze'
 'rebune' 'exzel' 'ailyons' 'generic']
Frequency of available brands:
 brand
Unknown     273
nunix       199
mika        122
eurochef     76
volsmart     58
ramtons      58
premier      35
sayona       25
roch         19
haier        18
amaze        17
generic      13
hisense      11
euroken       9
bruhm         6
hotpoint      5
exzel         4
ailyons       4
rebune        1
Name: count, dtype: int64
