## Data Cleaning

### Importing libraries

In [47]:
import pandas as pd
import numpy as np 
import re

### Data extraction

In [48]:
# Load the csv data
tv_data = pd.read_csv(r"..\data\scraped\kilimall_tvs.csv")

# preview the data
tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,product_link
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1423),"KSh 10,999",https://www.kilimall.co.ke/listing/2517665-vit...
1,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1200),"KSh 18,999",https://www.kilimall.co.ke/listing/2325142-vit...
2,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(1005),"KSh 13,978",https://www.kilimall.co.ke/listing/2519402-vit...
3,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(248),"KSh 12,998",https://www.kilimall.co.ke/listing/2055284-vit...
4,GLD 32 Inch Frameless Smart Android TV（G3200R1...,(366),"KSh 11,999",https://www.kilimall.co.ke/listing/2384332-gld...


In [49]:
# the product name includes emojis, that need to be edited out before cleaning

# Function to remove emojis and special characters
def remove_emojis(text):
    return re.sub(r"[^\w\s,.-]", "", text)

# Apply the function to the "product_name" column
tv_data["clean_product_name"] = tv_data["product_name"].apply(remove_emojis)

# Save the "clean_product_name" column to a CSV file
#tv_data[["clean_product_name"]].to_csv("no_emojis_tv_product_names.csv", index=False)

tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,product_link,clean_product_name
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1423),"KSh 10,999",https://www.kilimall.co.ke/listing/2517665-vit...,Vitron 32 Inch Frameless Smart TV HD Netflix T...
1,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1200),"KSh 18,999",https://www.kilimall.co.ke/listing/2325142-vit...,Vitron 43 inch Smart TV HTC 4388FS Android Tel...
2,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(1005),"KSh 13,978",https://www.kilimall.co.ke/listing/2519402-vit...,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM..."
3,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(248),"KSh 12,998",https://www.kilimall.co.ke/listing/2055284-vit...,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...
4,GLD 32 Inch Frameless Smart Android TV（G3200R1...,(366),"KSh 11,999",https://www.kilimall.co.ke/listing/2384332-gld...,GLD 32 Inch Frameless Smart Android TVG3200R1-...


The regex pattern [^\w\s,.-] matches anything that is not a word character (\w), whitespace (\s), comma (,), period (.), or hyphen (-). This helps remove emojis and special characters while keeping alphanumeric characters and common punctuation.

In [50]:
# shape
tv_data.shape

(4095, 5)

### Extracting Brand & Size

In [51]:
# definine lists of available brands and size from kilimall website
tv_brands = ["vitron", "hisense","tcl", "generic", "vision","gld", "amtec", "samsung", "ctc", "skyworth", "syinix", "lg", "synix", "ailyons", "sony", "artel", "eoco", "& other fairies", "glaze", "infinix", "haier", "euroken", "iconix", "golden tech", "Microsoft lumia", "hotpoint", "fenghua", "konka", "power", "premier", "royal", "armco", "bic", "ccit", "ctroniq", "hifinit", "htc", "mara", "sonar", "trinity", "vitafoam", "x-tigi", "xiaomi", "wyinix", "sardin", "solar max", "globalstar", "tornado", "mg", "alyons", "solarmax", "ailynos", "von", "star x", "weyon", "itel", "ica", "skyview", "starmax", "aiylons", "skymax"]
print("Available number of TV brands: ", len(tv_brands))

tv_sizes = ["32", "43", "50", "55", "65", "75", "24", "40", "19", "26","22", "85", "70", "17", "105", "90"]
print("Available number of TV sizes (inch): ", len(tv_sizes))

tv_types = ["smart", "digital", "semi-smart"]
print("Available number of TV type: ", len(tv_types))

Available number of TV brands:  61
Available number of TV sizes (inch):  16
Available number of TV type:  3


There are 31 different brands and 16 varied sizes across 3 television types

In [52]:
# extract brand, size and type from "clean_product_name" using above lists

# 1. functions to match brands
def match_brand(product_name):
    for brand in tv_brands:
        if brand.lower() in product_name.lower():
            return brand
    return "unknown"

# 2. function to extract size by checking against available sizes list

def extract_size(product_name):
    for size in tv_sizes:
        if str(size) in product_name:  # Check if the size number exists in the title as a string
            return size
    return None

# 3. function to match TV type
def match_tv_type(product_name):
    for tv_type in tv_types:
        if tv_type.lower() in product_name.lower():
            return tv_type
    return "unknown"

In [53]:
# Apply the functions to the DataFrame

tv_data["brand"] = tv_data["clean_product_name"].apply(match_brand)
tv_data["size"] = tv_data["clean_product_name"].apply(extract_size)
tv_data["tv_type"] = tv_data["clean_product_name"].apply(match_tv_type)
tv_data.head()

# convert the df to csv
#tv_data_2.to_csv("tv_data_2.csv", index=False)

Unnamed: 0,product_name,product_reviews,product_price,product_link,clean_product_name,brand,size,tv_type
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1423),"KSh 10,999",https://www.kilimall.co.ke/listing/2517665-vit...,Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart
1,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1200),"KSh 18,999",https://www.kilimall.co.ke/listing/2325142-vit...,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart
2,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(1005),"KSh 13,978",https://www.kilimall.co.ke/listing/2519402-vit...,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart
3,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(248),"KSh 12,998",https://www.kilimall.co.ke/listing/2055284-vit...,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital
4,GLD 32 Inch Frameless Smart Android TV（G3200R1...,(366),"KSh 11,999",https://www.kilimall.co.ke/listing/2384332-gld...,GLD 32 Inch Frameless Smart Android TVG3200R1-...,vision,32,smart


In [54]:
# shape
tv_data.shape

(4095, 8)

In [55]:
# exploring clean dataframe
unique_brands = tv_data["brand"].unique()
print(unique_brands)

#number of unique counts
unique_brands_count = tv_data["brand"].value_counts()
unique_sizes_count = tv_data["size"].value_counts()
unique_tv_types_counts = tv_data["tv_type"].value_counts()

print("Frequency of available brands:\n", unique_brands_count)

['vitron' 'vision' 'artel' 'eoco' 'solarmax' 'ctc' 'hisense' 'tcl'
 'skyworth' 'unknown' 'globalstar' 'gld' 'syinix' 'synix' 'ailyons'
 'euroken' 'royal' 'amtec' 'wyinix' 'tornado' 'samsung' 'glaze' 'konka'
 'mg' 'aiylons' 'premier' 'htc' 'sony' 'lg' 'star x' 'sonar' 'haier'
 'solar max' 'von' 'ailynos' 'power' 'weyon' 'skyview' 'golden tech'
 'starmax' 'iconix' 'alyons' 'infinix' 'ica' 'xiaomi' 'sardin']
Frequency of available brands:
 brand
vitron         1216
vision          945
tcl             461
hisense         448
samsung         142
skyworth        126
ctc             120
amtec            98
gld              95
syinix           58
unknown          57
lg               50
glaze            39
synix            39
sony             29
royal            26
solarmax         20
artel            14
ailyons          13
mg               13
globalstar       12
euroken          10
tornado           8
eoco              7
htc               6
iconix            5
haier             4
star x       

In [56]:
print("Frequency of available tv size:\n", unique_sizes_count)

Frequency of available tv size:
 size
32    1630
43    1105
50     381
55     358
65     152
24      96
40      95
22      55
75      51
19      40
26      18
70      10
85       8
90       1
Name: count, dtype: int64


In [57]:
print("Frequency of available tv type:\n", unique_tv_types_counts)

Frequency of available tv type:
 tv_type
smart      3042
digital     556
unknown     497
Name: count, dtype: int64


In [58]:
# explore rows with unknown brands
unknown_type = tv_data[tv_data["tv_type"] == "unknown"]
unknown_type.head()

Unnamed: 0,product_name,product_reviews,product_price,product_link,clean_product_name,brand,size,tv_type
65,"VITRON HTC5068US 50"" Inch FRAMELESS INBUILT DE...",(22),"KSh 35,995",https://www.kilimall.co.ke/listing/2807621-vit...,VITRON HTC5068US 50 Inch FRAMELESS INBUILT DED...,vitron,50,unknown
90,"BLUETOOTH VITRON 50 inch HTC5068US,50 Inch FRA...",(0),"KSh 35,999",https://www.kilimall.co.ke/listing/1001010149-...,"BLUETOOTH VITRON 50 inch HTC5068US,50 Inch FRA...",vitron,50,unknown
95,Skyworth 43” inch 43E FRAMELESS FHD ANDROID AI...,(42),"KSh 28,995",https://www.kilimall.co.ke/listing/2347912-sky...,Skyworth 43 inch 43E FRAMELESS FHD ANDROID AI ...,skyworth,43,unknown
102,"VITRON 50 inch HTC5068US,50 Inch BLUETOOTH FRA...",(30),"KSh 35,495",https://www.kilimall.co.ke/listing/2166055-vit...,"VITRON 50 inch HTC5068US,50 Inch BLUETOOTH FRA...",vitron,50,unknown
181,"TCL 43"" P635 4K HDR Google TV ,BLUETOOTH TV,FR...",(0),"KSh 32,899",https://www.kilimall.co.ke/listing/1000426054-...,"TCL 43 P635 4K HDR Google TV ,BLUETOOTH TV,FRA...",tcl,43,unknown


In [59]:
# remove brackets from product_reviews
tv_data["product_reviews"] = tv_data["product_reviews"].apply(lambda x: x.strip("()"))
tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,product_link,clean_product_name,brand,size,tv_type
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,1423,"KSh 10,999",https://www.kilimall.co.ke/listing/2517665-vit...,Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart
1,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,1200,"KSh 18,999",https://www.kilimall.co.ke/listing/2325142-vit...,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart
2,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",1005,"KSh 13,978",https://www.kilimall.co.ke/listing/2519402-vit...,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart
3,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",248,"KSh 12,998",https://www.kilimall.co.ke/listing/2055284-vit...,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital
4,GLD 32 Inch Frameless Smart Android TV（G3200R1...,366,"KSh 11,999",https://www.kilimall.co.ke/listing/2384332-gld...,GLD 32 Inch Frameless Smart Android TVG3200R1-...,vision,32,smart


In [60]:
# cleaning the product_price column
tv_data["product_price"] = tv_data["product_price"].apply(lambda x: int(x.replace("KSh", "").replace(",", "").strip()))
tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,product_link,clean_product_name,brand,size,tv_type
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,1423,10999,https://www.kilimall.co.ke/listing/2517665-vit...,Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart
1,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,1200,18999,https://www.kilimall.co.ke/listing/2325142-vit...,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart
2,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",1005,13978,https://www.kilimall.co.ke/listing/2519402-vit...,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart
3,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",248,12998,https://www.kilimall.co.ke/listing/2055284-vit...,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital
4,GLD 32 Inch Frameless Smart Android TV（G3200R1...,366,11999,https://www.kilimall.co.ke/listing/2384332-gld...,GLD 32 Inch Frameless Smart Android TVG3200R1-...,vision,32,smart


In [61]:
# drop product_name
tv_data.drop(columns=["product_name"], inplace=True)
tv_data.head()

Unnamed: 0,product_reviews,product_price,product_link,clean_product_name,brand,size,tv_type
0,1423,10999,https://www.kilimall.co.ke/listing/2517665-vit...,Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart
1,1200,18999,https://www.kilimall.co.ke/listing/2325142-vit...,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart
2,1005,13978,https://www.kilimall.co.ke/listing/2519402-vit...,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart
3,248,12998,https://www.kilimall.co.ke/listing/2055284-vit...,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital
4,366,11999,https://www.kilimall.co.ke/listing/2384332-gld...,GLD 32 Inch Frameless Smart Android TVG3200R1-...,vision,32,smart


In [62]:
tv_data.shape

(4095, 7)

In [63]:
# check and remove duplicates
tv_data_no_dupes = tv_data.drop_duplicates(keep="first")
tv_data_no_dupes.shape

(4093, 7)

In [64]:
# change column names
tv_data_no_dupes = tv_data_no_dupes.rename(columns={"clean_product_name": "name", "product_reviews": "reviews", "product_price": "price", "tv_type": "type", "product_link": "url"})

# add a source column
tv_data_no_dupes["source"] = "kilimall" 

# add id coumn based on df index
tv_data_no_dupes["id"] = tv_data_no_dupes.index

# restructure the dataframe to have the columns in a more logical order
tv_data_no_dupes = tv_data_no_dupes[["id", "source", "name", "brand", "size", "type", "reviews", "price", "url"]]

tv_data_no_dupes.head()

Unnamed: 0,id,source,name,brand,size,type,reviews,price,url
0,0,kilimall,Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart,1423,10999,https://www.kilimall.co.ke/listing/2517665-vit...
1,1,kilimall,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart,1200,18999,https://www.kilimall.co.ke/listing/2325142-vit...
2,2,kilimall,"VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart,1005,13978,https://www.kilimall.co.ke/listing/2519402-vit...
3,3,kilimall,VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital,248,12998,https://www.kilimall.co.ke/listing/2055284-vit...
4,4,kilimall,GLD 32 Inch Frameless Smart Android TVG3200R1-...,vision,32,smart,366,11999,https://www.kilimall.co.ke/listing/2384332-gld...


In [65]:
print(tv_data_no_dupes.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4093 entries, 0 to 4094
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       4093 non-null   int64 
 1   source   4093 non-null   object
 2   name     4093 non-null   object
 3   brand    4093 non-null   object
 4   size     3998 non-null   object
 5   type     4093 non-null   object
 6   reviews  4093 non-null   object
 7   price    4093 non-null   int64 
 8   url      4093 non-null   object
dtypes: int64(2), object(7)
memory usage: 319.8+ KB
None


In [66]:
# save the cleaned data to a csv file
tv_data_no_dupes.to_csv(r"..\data\clean\kilimall_tvs.csv", index=False)