## Laptops Data.

In [2]:
# Import libraries
import pandas as pd
import re

In [4]:
# Data loading
laptops_df = pd.read_csv("../csv_files/laptops.csv")
laptops_df.head()

Unnamed: 0,Name,Price,Reviews,Ratings
0,"HP Refurbished EliteBook 640GB HDD, 8GB RAM, ...","KSh 20,300",3.7 out of 5(13),3.7 out of 5
1,"HP Refurbished EliteBook 820 Core I5, 8GB RAM ...","KSh 16,499",5 out of 5(1),5 out of 5
2,Lenovo ThinkPad T490 Touchscreen Core I5 -8th ...,"KSh 27,999",4 out of 5(9),4 out of 5
3,HP Refurbished EliteBook 840 G3 Intel Core I5 ...,"KSh 23,500",4 out of 5(1),4 out of 5
4,HP GAMING-LAPTOP -HP ELITEBOOK 755 AMD RYZEN 7...,"KSh 35,000",4.4 out of 5(9),4.4 out of 5


In [6]:
# Cleaning laptops data
# Extract brand name
def extract_brand(name):
    match = re.search(r'(HP|Lenovo|Dell|Acer|)', name, re.IGNORECASE)
    return match.group(0) if match else 'Unknown'

# Extract RAM
def extract_ram(name):
    match = re.search(r'(\d+GB)\s*RAM', name)
    return match.group(1) if match else 'Unknown'

# Extract ROM (HDD/SSD)
def extract_rom(name):
    match = re.search(r'(\d+GB|TB)\s*(HDD|SSD)', name)
    return match.group(0) if match else 'Uknown'

# Extract processor type
def extract_processor(name):
    match = re.search(r'Intel\s*(Core\s*I\d)', name)
    return match.group(1) if match else 'Unknown'
                    
def extract_screen_size(name):
    match = re.search(r'(\d+\.?\d*)"\s*', name)
    return match.group(1) if match else 'Unknown'

# Extract the price from the 'Price' column
def extract_price(price):
    match = re.search(r'KSh\s*(\d+([,]\d{3})*)', price)
    if match:
        return float(match.group(1).replace(',', ''))
    return None

# Extract reviews 
def extract_reviews(reviews):
    match = re.search(r'\((\d+)\)', reviews)
    if match:
        return int(match.group(1))
    return None

# Extract ratings (the number before "out of 5")
def extract_ratings(ratings):
    match = re.search(r'(\d+\.\d+)', ratings)
    if match:
        return float(match.group(1))
    return None

# Apply extraction functions to the DataFrame
laptops_df['name'] = laptops_df['Name']
laptops_df['brand'] = laptops_df['Name'].apply(extract_brand)
laptops_df['ram'] = laptops_df['Name'].apply(extract_ram)
laptops_df['rom'] = laptops_df['Name'].apply(extract_rom)
laptops_df['processor'] = laptops_df['Name'].apply(extract_processor)
laptops_df['screen_size'] = laptops_df['Name'].apply(extract_screen_size)
laptops_df['price'] = laptops_df['Price'].apply(extract_price)
laptops_df['reviews'] = laptops_df['Reviews'].apply(extract_reviews)
laptops_df['ratings'] = laptops_df['Ratings'].apply(extract_ratings)

# Create the new DataFrame with the extracted data
cleaned_data = laptops_df[['name','brand', 'ram', 'rom', 'processor', 'screen_size', 'price','reviews','ratings']]

# Save the cleaned data to a new CSV file
cleaned_data.to_csv('../csv_files/laptops_clean.csv', index=False)

In [7]:
laptops = pd.read_csv("../csv_files/laptops_clean.csv")
laptops.head()

Unnamed: 0,name,brand,ram,rom,processor,screen_size,price,reviews,ratings
0,"HP Refurbished EliteBook 640GB HDD, 8GB RAM, ...",HP,8GB,640GB HDD,Core I5,Unknown,20300.0,13,3.7
1,"HP Refurbished EliteBook 820 Core I5, 8GB RAM ...",HP,8GB,500GB HDD,Unknown,Unknown,16499.0,1,
2,Lenovo ThinkPad T490 Touchscreen Core I5 -8th ...,Lenovo,8GB,256GB SSD,Unknown,14,27999.0,9,
3,HP Refurbished EliteBook 840 G3 Intel Core I5 ...,HP,Unknown,500GB HDD,Core I5,Unknown,23500.0,1,
4,HP GAMING-LAPTOP -HP ELITEBOOK 755 AMD RYZEN 7...,HP,16GB,256GB SSD,Unknown,Unknown,35000.0,9,4.4


## Fridges Data

In [10]:
# Fridges data
df = pd.read_csv("../csv_files/fridges.csv")
df.head()

Unnamed: 0,Name,Price,Reviews,Ratings
0,Ramtons RF/335 - 85L Single Door Refrigerator ...,"KSh 15,999",4.1 out of 5(66),4.1 out of 5
1,Hisense 94 Liters Single Door Fridge REF094DR ...,"KSh 16,799",4.4 out of 5(87),4.4 out of 5
2,"Ramtons RF/203, 2 Door Direct Cool Fridge, 128...","KSh 28,099",4.2 out of 5(69),4.2 out of 5
3,Roch RFR-120S-I Single Door Refrigerator - 90 ...,"KSh 16,299",4.2 out of 5(477),4.2 out of 5
4,Nunix 138L Double Door Fridge Energy Efficient...,"KSh 27,500",3.9 out of 5(149),3.9 out of 5


In [12]:
# Extract brand name and model.
def extract_brand_and_model(name):
    match = re.match(r"([A-Za-z]+(?: [A-Za-z]+)*)(?:\s[RF|REF|FM|DF|D|]{2,4}[\d]+)?", name)
    if match:
        return match.group(1).strip()
    return ''

# Extract size in litres
def extract_size(name):
    match = re.search(r'(\d+)\s*Litres?', name)
    if match:
        return int(match.group(1))
    return None

# Extract number of doors
def extract_doors(name):
    match = re.search(r'(\d+)\s*Door', name)
    if match:
        return int(match.group(1))
    return None

# Extract color
def extract_color(name):
    color_keywords = ['Silver', 'White', 'Black', 'Grey', 'Red', 'Blue', 'Green', 'Beige', 'Stainless', 'Chrome']
    for color in color_keywords:
        if color.lower() in name.lower():
            return color
    return None

# Extract warranty
def extract_warranty(name):
    match = re.search(r'(\d+)\s*YRs?\s*WRTY', name)
    if match:
        return int(match.group(1))
    return None

# Extract the price from the 'Price' column
def extract_price(price):
    match = re.search(r'KSh\s*(\d+([,]\d{3})*)', price)
    if match:
        return float(match.group(1).replace(',', ''))
    return None

# Extract reviews 
def extract_reviews(reviews):
    match = re.search(r'\((\d+)\)', reviews)
    if match:
        return int(match.group(1))
    return None

# Extract ratings (the number before "out of 5")
def extract_ratings(ratings):
    match = re.search(r'(\d+\.\d+)', ratings)
    if match:
        return float(match.group(1))
    return None

# Apply the extraction functions to the DataFrame
df['name'] = df['Name']
df['brand'] = df['Name'].apply(extract_brand_and_model)
df['size_litres'] = df['Name'].apply(extract_size)
df['doors'] = df['Name'].apply(extract_doors)
df['color'] = df['Name'].apply(extract_color)
df['warranty_years'] = df['Name'].apply(extract_warranty)
df['price'] = df['Price'].apply(extract_price)
df['reviews'] = df['Reviews'].apply(extract_reviews)
df['ratings'] = df['Ratings'].apply(extract_ratings)

data = df[['name', 'brand', 'size_litres','doors', 'color', 'warranty_years', 'price', 'reviews', 'ratings']]

# Save the modified DataFrame to a new CSV file
data.to_csv('../csv_files/fridges_clean.csv', index=False)

print("Data extraction and CSV creation completed successfully!")

Data extraction and CSV creation completed successfully!


In [13]:
clean_df = pd.read_csv("../csv_files/fridges_clean.csv")
clean_df.head(10)

Unnamed: 0,name,brand,size_litres,doors,color,warranty_years,price,reviews,ratings
0,Ramtons RF/335 - 85L Single Door Refrigerator ...,Ramtons RF,,,Silver,1.0,15999.0,66,4.1
1,Hisense 94 Liters Single Door Fridge REF094DR ...,Hisense,,,,2.0,16799.0,87,4.4
2,"Ramtons RF/203, 2 Door Direct Cool Fridge, 128...",Ramtons RF,128.0,2.0,Silver,1.0,28099.0,69,4.2
3,Roch RFR-120S-I Single Door Refrigerator - 90 ...,Roch RFR,90.0,,Silver,,16299.0,477,4.2
4,Nunix 138L Double Door Fridge Energy Efficient...,Nunix,,,,1.0,27500.0,149,3.9
5,Smart Pro SFR-120S-I Single Door Refrigerator ...,Smart Pro SFR,90.0,,Silver,1.0,15799.0,22,4.4
6,Ramtons RF/217 - 2 Door Direct Cool Fridge - 2...,Ramtons RF,213.0,2.0,Silver,1.0,33176.0,22,4.7
7,Roch RFR-150DT-I Top-Mounted Defrost Fridge - ...,Roch RFR,,,,1.0,36000.0,106,4.2
8,"Nunix Fridge Double Door, 210L Energy Efficien...",Nunix Fridge Double Door,,,,,36850.0,4,
9,"VON VRT-138DRHX Double Door FRIDGE 136L,INOX, ...",VON VRT,,,,1.0,29009.0,68,
