## Data Cleaning

### 1. Importing libraries

In [1]:
import pandas as pd
import re

### 2. Data extraction

In [24]:
# Load the csv data
tv_data = pd.read_csv("kilimall_tvs_no_rating.csv")

# preview the data
tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1347),"KSh 11,199"
1,Vitron 32 inch Frameless Television HTC 3218 L...,(710),"KSh 11,099"
2,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1142),"KSh 18,499"
3,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(947),"KSh 13,978"
4,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(243),"KSh 12,998"


In [36]:
# the product name includes emojis, that need to be edited out before cleaning

# Function to remove emojis and special characters
def remove_emojis(text):
    return re.sub(r"[^\w\s,.-]", "", text)

# Apply the function to the "product_name" column
tv_data["clean_product_name"] = tv_data["product_name"].apply(remove_emojis)

# Save the "clean_product_name" column to a CSV file
tv_data[["clean_product_name"]].to_csv("no_emojis_tv_product_names.csv", index=False)

tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,clean_product_name
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1347),"KSh 11,199",Vitron 32 Inch Frameless Smart TV HD Netflix T...
1,Vitron 32 inch Frameless Television HTC 3218 L...,(710),"KSh 11,099",Vitron 32 inch Frameless Television HTC 3218 L...
2,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1142),"KSh 18,499",Vitron 43 inch Smart TV HTC 4388FS Android Tel...
3,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(947),"KSh 13,978","VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM..."
4,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(243),"KSh 12,998",VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...


The regex pattern [^\w\s,.-] matches anything that is not a word character (\w), whitespace (\s), comma (,), period (.), or hyphen (-). This helps remove emojis and special characters while keeping alphanumeric characters and common punctuation.

### Method 1: Extracting Brand, Size and Description

In [77]:
# data extraction logic
def extract_data(tv_data_column):
    # extract the brand name
    brand = tv_data_column.split(" ")[0]

    # Escape the brand for use in the regex
    escaped_brand = re.escape(brand)

    # extract size(e.g. 32 inch)
    size_match = re.search(r"(\d+)\s*inch", tv_data_column, re.IGNORECASE)
    size = size_match.group(1) if size_match else None

    # remove the brand and size from the product name
    description = re.sub(rf"{escaped_brand}|(\d+\s*inch)", "", tv_data_column, flags=re.IGNORECASE).strip()

    return brand, size, description
    

In [78]:
# apply the extraction logic to the product name column

tv_data[["brand", "size", "description"]] = tv_data["clean_product_name"].apply(
    lambda x: pd.Series(extract_data(x))
    )

# Save the updated dataset
tv_data.to_csv("tv_data_extracted.csv", index=False)

tv_data.head()

Unnamed: 0,product_name,product_reviews,product_price,clean_product_name,description,brand,size
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1347),"KSh 11,199",Vitron 32 Inch Frameless Smart TV HD Netflix T...,Frameless Smart TV HD Netflix TV HTC3200S Yout...,Vitron,32
1,Vitron 32 inch Frameless Television HTC 3218 L...,(710),"KSh 11,099",Vitron 32 inch Frameless Television HTC 3218 L...,Frameless Television HTC 3218 LED Digital TV D...,Vitron,32
2,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1142),"KSh 18,499",Vitron 43 inch Smart TV HTC 4388FS Android Tel...,Smart TV HTC 4388FS Android Television Full HD...,Vitron,43
3,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(947),"KSh 13,978","VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...","BLUETOOTH-ENABLED HTC3200S,FRAMELESS Smart An...",VITRON,32
4,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(243),"KSh 12,998",VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,DIGITAL TELEVISION FRAMELESS HTC3218 LED TV U...,VITRON,32


In [79]:
# shape
tv_data.shape

(4239, 7)

In [80]:
# exploring clean dataframe
unique_brands = tv_data["brand"].unique()
print(unique_brands)

#number of unique counts
unique_brands_counts = tv_data["brand"].value_counts()

# write the unique brand counts into a csv
## convert to a df
brand_count_df = unique_brands_counts.reset_index()

## rename column names for clarity
brand_count_df.columns = ["Brand", "Count"]

## write to csv
brand_count_df.to_csv("tv_brand_count.csv", index = False)

print("Frequency of available brands:\n", unique_brands_counts)


['Vitron' 'VITRON' 'GLD' 'BLUETOOTH' 'Special' 'Artel' 'Vision' 'LYONS'
 'EOCO' 'BLUETOOTH-ENABLED' 'CTC' 'AILYONS' 'Skyworth' 'TCL' 'AMTEC'
 'Hisense' 'Solarmax' 'Amtec' 'VISION' 'New' 'Synix' 'Ailyons' 'Wyinix'
 'Haier' 'NEW' 'VITRON50' 'OFFER' 'LG' 'Euroken' 'SOLARMAX' 'Infinix' 'MG'
 'HISENSE' 'CTC32' 'TOP' 'Syinix' 'Royal' 'vision' 'Vitron32' 'Gld'
 'CLEARANCE' 'HAIER' 'Samsung' 'OFFERVITRON' 'GLD32' 'HIFINIT' 'Best'
 'ROYAL' '' 'Hisense32' 'Solar' 'SAMSUNG' 'Glaze' 'Globalstar' 'SKYWORTH'
 'Latest' 'VITRON43' 'KONKA' 'SYINIX' 'Tcl' 'TV' '23' 'Hisense43inch'
 'Hisense43' 'VITRON55' '22quot' 'TORNADO' 'GLOBALSTAR' 'VITRON32'
 'Amtech' 'VITRON50Inch' 'Share' '32' 'GOOD' 'SKYWORTHQUALITY' '55' 'SAVE'
 'Sony' '43' 'TV.' 'The' '43inch' '40INCH' 'CRAZY' 'WK' 'ENJOY'
 'VisionPlus' 'ANNIVERSARY' 'STAR' 'ANNIVESARY' 'BLACK' 'Amtec32'
 'Vision32' 'SONAR' 'Television' 'TCL43' 'Infinix32' 'CL' 'BRING' 'ALYONS'
 'Premier' 'GLD,' 'FESTIVEOFFERSSMAET' 'VISION43' 'VISION32' 'Vitron,40'
 'TCL32' '

### Method 2: Extracting Brand & Size

In [58]:
# definine lists of available brands and size from kilimall website
tv_brands = ["vitron", "hisense","tcl", "vision", "gld", "samsung", "amtec", "ctv", "skyworth", "syinix","lg", "sony","ailyons", "zeemr", "jenovox", "artel", "eoco", "euroken", "infinix", "sonar", "armco", "glaze", "dangbei", "haier", "amazon", "edison", "touyinger", "iconix", "golden tech", "premier", "skywave"]
print("Available number of TV brands: ", len(tv_brands))

tv_sizes = ["32", "43", "50", "55", "65", "75", "24", "40", "19", "26","22", "85", "70", "17", "105", "90"]
print("Available number of TV sizes (inch): ", len(tv_sizes))

tv_types = ["smart", "digital", "semi-smart"]
print("Available number of TV type: ", len(tv_type))

Available number of TV brands:  31
Available number of TV sizes (inch):  16
Available number of TV type:  3


There are 31 different brands and 16 varied sizes across 3 television types

In [72]:
# extract brand, size and type from "clean_product_name" using above lists

# 1. functions to match brands
def match_brand(product_name):
    for brand in tv_brands:
        if brand.lower() in product_name.lower():
            return brand
    return "Unknown"

# 2. function to extract size by checking against available sizes list

def extract_size(product_name):
    for size in tv_sizes:
        if str(size) in product_name:  # Check if the size number exists in the title as a string
            return size
    return None

# 3. function to match TV type
def match_tv_type(product_name):
    for tv_type in tv_types:
        if tv_type.lower() in product_name.lower():
            return tv_type
    return "Unknown Type"

In [92]:
# Apply the functions to the DataFrame

tv_data_2['brand'] = tv_data_2['clean_product_name'].apply(match_brand)
tv_data_2['size'] = tv_data_2['clean_product_name'].apply(extract_size)
tv_data_2['tv_type'] = tv_data_2['clean_product_name'].apply(match_tv_type)
tv_data_2.head()

# convert the df to csv
#tv_data_2.to_csv("tv_data_2.csv", index=False)

Unnamed: 0,product_name,product_reviews,product_price,clean_product_name,brand,size,tv_type
0,Vitron 32 Inch Frameless Smart TV HD Netflix T...,(1347),"KSh 11,199",Vitron 32 Inch Frameless Smart TV HD Netflix T...,vitron,32,smart
1,Vitron 32 inch Frameless Television HTC 3218 L...,(710),"KSh 11,099",Vitron 32 inch Frameless Television HTC 3218 L...,vitron,32,digital
2,Vitron 43 inch Smart TV HTC 4388FS Android Tel...,(1142),"KSh 18,499",Vitron 43 inch Smart TV HTC 4388FS Android Tel...,vitron,43,smart
3,"VITRON BLUETOOTH-ENABLED 32"" INCH HTC3200S,FRA...",(947),"KSh 13,978","VITRON BLUETOOTH-ENABLED 32 INCH HTC3200S,FRAM...",vitron,32,smart
4,"VITRON 32"" Inch DIGITAL TELEVISION FRAMELESS H...",(243),"KSh 12,998",VITRON 32 Inch DIGITAL TELEVISION FRAMELESS HT...,vitron,32,digital


In [87]:
# shape
tv_data_2.shape

(4239, 7)

In [89]:
# exploring clean dataframe
unique_brands_1 = tv_data_2["brand"].unique()
print(unique_brands_1)

#number of unique counts
unique_brands_counts_1 = tv_data_2["brand"].value_counts()

# write the unique brand counts into a csv
## convert to a df
brand_count_df_1 = unique_brands_counts_1.reset_index()

## rename column names for clarity
brand_count_df_1.columns = ["Brand", "Count"]

## write to csv
brand_count_df_1.to_csv("tv_brand_count_1.csv", index = False)

print("Frequency of available brands:\n", unique_brands_counts_1)

['vitron' 'vision' 'artel' 'eoco' 'Unknown' 'skyworth' 'tcl' 'hisense'
 'amtec' 'syinix' 'ailyons' 'lg' 'euroken' 'gld' 'samsung' 'glaze'
 'premier' 'sony' 'sonar' 'haier' 'iconix' 'infinix']
Frequency of available brands:
 brand
vitron      1249
vision       950
tcl          498
hisense      461
Unknown      352
samsung      142
skyworth     123
gld          113
amtec        106
syinix        56
lg            56
glaze         37
sony          30
artel         15
ailyons       14
euroken       10
eoco           7
iconix         6
premier        5
haier          4
sonar          3
infinix        2
Name: count, dtype: int64


In [93]:
# find sum of tvs with matched brands
count_sum = brand_count_df_1["Count"].sum()
count_sum

np.int64(4239)