### Data Preprocessing

In [2]:
import pandas as pd
import ast
import re
import numpy as np

In [3]:
# Load data
df = pd.read_csv("C:/Users/mahmu/Downloads/DataAnalyticsProjects/product-recommender-ai/data/mobiledokan_master.csv")

In [4]:
df

Unnamed: 0,url,name,price,specs
0,https://www.mobiledokan.com/mobile/samsung-gal...,Samsung Galaxy XCover7 Pro,90000,"{'Brand': 'Samsung', 'Model': 'Galaxy XCover7 ..."
1,https://www.mobiledokan.com/mobile/motorola-ed...,Motorola Edge 60s,50000,"{'Brand': 'Motorola', 'Model': 'Edge 60s', 'De..."
2,https://www.mobiledokan.com/mobile/samsung-gal...,Samsung Galaxy Z Fold7,250000,"{'brand': 'Samsung', 'model': 'Galaxy Z Fold7'..."
3,https://www.mobiledokan.com/mobile/oppo-reno14,Oppo Reno14,70000,"{'brand': 'Oppo', 'model': 'Reno14', 'device_t..."
4,https://www.mobiledokan.com/mobile/samsung-gal...,Samsung Galaxy F56,40000,"{'brand': 'Samsung', 'model': 'Galaxy F56', 'd..."
...,...,...,...,...
5022,https://www.mobiledokan.com/mobile/meizu-note-...,Meizu Note 16 Pro,25000,"{'brand': 'Meizu', 'model': 'Note 16 Pro', 'de..."
5023,https://www.mobiledokan.com/mobile/honor-gt-pr...,Honor GT Pro (16GB/512GB),90000,"{'brand': 'Honor', 'model': 'GT Pro (16GB/512G..."
5024,https://www.mobiledokan.com/mobile/honor-gt-pr...,Honor GT Pro (512GB),84500,"{'brand': 'Honor', 'model': 'GT Pro (512GB)', ..."
5025,https://www.mobiledokan.com/mobile/iqoo-z10-tu...,iQOO Z10 Turbo (512GB),45000,"{'brand': 'iQOO', 'model': 'Z10 Turbo (512GB)'..."


In [5]:
# Step 1: Convert 'specs' column to dictionary
def parse_specs(spec_str):
    try:
        return ast.literal_eval(spec_str)
    except Exception:
        return {}

df["specs"] = df["specs"].apply(parse_specs)

In [6]:
# Step 2: Flatten specs column
specs_df = pd.json_normalize(df["specs"])
df = pd.concat([df.drop(columns=["specs"]), specs_df], axis=1)

In [7]:
print(df.columns)

Index(['url', 'name', 'price', 'Brand', 'Model', 'Device Type', 'Release Date',
       'Status', 'Operating System', 'OS Version',
       ...
       'SAR Value', 'sar_value', 'User Available Storage',
       'user_available_storage', 'Image Stabilization', 'image_stabilization',
       'Music Play', 'music_play', 'Virtual RAM', 'virtual_ram'],
      dtype='object', length=226)


In [8]:
# Keep only columns that start with lowercase and do not contain spaces
df = df[[col for col in df.columns if col[0].islower() and ' ' not in col]]


In [9]:
print(df.columns.tolist())

['url', 'name', 'price', 'brand', 'model', 'device_type', 'release_date', 'status', 'operating_system', 'os_version', 'user_interface', 'chipset', 'cpu', 'cpu_cores', 'architecture', 'fabrication', 'gpu', 'display_type', 'screen_size', 'display_resolution', 'pixel_density', 'screen_to_body_ratio', 'screen_protection', 'bezel-less_display', 'touch_screen', 'refresh_rate', 'notch', 'camera_setup', 'primary_camera_resolution', 'autofocus', 'flash', 'image_resolution', 'settings', 'zoom', 'shooting_modes', 'primary_camera_aperture', 'camera_features', 'primary_camera_video_recording', 'video_fps', 'selfie_camera_resolution', 'selfie_camera_video_recording', 'selfie_camera_aperture', 'height', 'width', 'thickness', 'weight', 'colors', 'waterproof', 'ip_rating', 'ruggedness', 'battery_type', 'capacity', 'quick_charging', 'placement', 'usb_type-c', 'internal_storage', 'usb_otg', 'ram', 'network', 'sim_slot', 'sim_size', 'edge', 'gprs', 'volte', 'speed', 'wlan', 'bluetooth', 'gps', 'wi-fi_hots

In [10]:
# Define the final columns 
final_columns = [
    "name", "brand", "model", "price", "url",

    # üñ•Ô∏è Display
    "display_type", "screen_size", "display_resolution", "pixel_density",
    "refresh_rate", "screen_protection", "display_brightness",
    "screen_to_body_ratio", "aspect_ratio", "hdr_10_/_hdr_+_support",

    # üöÄ Performance
    "chipset", "cpu", "gpu", "ram", "ram_type",
    "internal_storage", "storage_type", "virtual_ram",

    # üì∏ Camera
    "camera_setup", "primary_camera_resolution", 'autofocus', 'flash', 'settings', 'zoom', 'shooting_modes', 
    'camera_features', "primary_camera_image_resolution", "primary_camera_video_recording",
    "video_fps", "selfie_camera_resolution", "selfie_camera_video_recording",
    "primary_camera_ois", "primary_camera_aperture", "selfie_camera_aperture",

    # üîã Battery & Charging
    "battery_type", "capacity", "quick_charging", 
    "wireless_charging", "reverse_charging",

    # üß± Build & Design
    "build", "weight", "thickness", "colors", 
    "waterproof", "ip_rating", "ruggedness",

    # üåê Network & Connectivity
    "network", "speed", "sim_slot", "volte", "bluetooth",
    "wlan", "gps", "nfc", "usb_type-c", "usb_otg",

    # üîê Security & Sensors
    "fingerprint_sensor", "finger_sensor_type", "finger_sensor_position",
    "face_unlock", "light_sensor", "sensor", "infrared", "fm_radio",

    # ‚öôÔ∏è OS & Software
    "operating_system", "os_version", "user_interface",

    # üìÖ Additional
    "release_date", "status", "made_by"
]

In [11]:
# Keep only the selected columns (if present)
available_columns = [col for col in final_columns if col in df.columns]
clean_df = df[available_columns].copy()

In [12]:
clean_df.head()

Unnamed: 0,name,brand,model,price,url,display_type,screen_size,display_resolution,pixel_density,refresh_rate,...,light_sensor,sensor,infrared,fm_radio,operating_system,os_version,user_interface,release_date,status,made_by
0,Samsung Galaxy XCover7 Pro,Samsung,Galaxy XCover7 Pro,90000,https://www.mobiledokan.com/mobile/samsung-gal...,IPS LCD,6.6 inches (16.76 cm),1080x2408 px (FHD+),400 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 7,Exp. 08 May 2025,Upcoming,South Korea
1,Motorola Edge 60s,Motorola,Edge 60s,50000,https://www.mobiledokan.com/mobile/motorola-ed...,P-OLED,6.67 inches (16.94 cm),1220x2712 px (FHD+),446 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,,Exp. 08 May 2025,Upcoming,USA
2,Samsung Galaxy Z Fold7,Samsung,Galaxy Z Fold7,250000,https://www.mobiledokan.com/mobile/samsung-gal...,Foldable Dynamic LTPO AMOLED 2X,8.2 inches (215.5 cm2),2224x2488 px,407 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 8,Not announced yet,Rumored,South Korea
3,Oppo Reno14,Oppo,Reno14,70000,https://www.mobiledokan.com/mobile/oppo-reno14,AMOLED,6.59 inches (16.74 cm),1256x2760 px (FHD+),460 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Android,v15,ColorOS 15,Exp. 23 May 2025,Upcoming,China
4,Samsung Galaxy F56,Samsung,Galaxy F56,40000,https://www.mobiledokan.com/mobile/samsung-gal...,Super AMOLED Plus,6.74 inches (17.12 cm),1080x2340 px (FHD+),382 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 7,Exp. 14 May 2025,Upcoming,South Korea


In [22]:
clean_df.to_csv("mobiledokan_master_cleaned.csv", index=False)

In [21]:
df = clean_df

In [22]:
df

Unnamed: 0,name,brand,model,price,url,display_type,screen_size,display_resolution,pixel_density,refresh_rate,...,light_sensor,sensor,infrared,fm_radio,operating_system,os_version,user_interface,release_date,status,made_by
0,Samsung Galaxy XCover7 Pro,Samsung,Galaxy XCover7 Pro,90000,https://www.mobiledokan.com/mobile/samsung-gal...,IPS LCD,6.6 inches (16.76 cm),1080x2408 px (FHD+),400 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 7,Exp. 08 May 2025,Upcoming,South Korea
1,Motorola Edge 60s,Motorola,Edge 60s,50000,https://www.mobiledokan.com/mobile/motorola-ed...,P-OLED,6.67 inches (16.94 cm),1220x2712 px (FHD+),446 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,,Exp. 08 May 2025,Upcoming,USA
2,Samsung Galaxy Z Fold7,Samsung,Galaxy Z Fold7,250000,https://www.mobiledokan.com/mobile/samsung-gal...,Foldable Dynamic LTPO AMOLED 2X,8.2 inches (215.5 cm2),2224x2488 px,407 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 8,Not announced yet,Rumored,South Korea
3,Oppo Reno14,Oppo,Reno14,70000,https://www.mobiledokan.com/mobile/oppo-reno14,AMOLED,6.59 inches (16.74 cm),1256x2760 px (FHD+),460 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Android,v15,ColorOS 15,Exp. 23 May 2025,Upcoming,China
4,Samsung Galaxy F56,Samsung,Galaxy F56,40000,https://www.mobiledokan.com/mobile/samsung-gal...,Super AMOLED Plus,6.74 inches (17.12 cm),1080x2340 px (FHD+),382 ppi,120 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,,,Android,v15,One UI 7,Exp. 14 May 2025,Upcoming,South Korea
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,Meizu Note 16 Pro,Meizu,Note 16 Pro,25000,https://www.mobiledokan.com/mobile/meizu-note-...,OLED,6.78 inches (17.22 cm),1224x2720 px (FHD+),440 ppi,144 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Flyme,AIOS 2,,Exp. 16 May 2025,Upcoming,Japan
5023,Honor GT Pro (16GB/512GB),Honor,GT Pro (16GB/512GB),90000,https://www.mobiledokan.com/mobile/honor-gt-pr...,LTPO AMOLED,6.78 inches (17.22 cm),1224x2800 px (FHD+),453 ppi,144 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Android,v15,MagicOS 9,23 April 2025,Available,China
5024,Honor GT Pro (512GB),Honor,GT Pro (512GB),84500,https://www.mobiledokan.com/mobile/honor-gt-pr...,LTPO AMOLED,6.78 inches (17.22 cm),1224x2800 px (FHD+),453 ppi,144 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Android,v15,MagicOS 9,23 April 2025,Available,China
5025,iQOO Z10 Turbo (512GB),iQOO,Z10 Turbo (512GB),45000,https://www.mobiledokan.com/mobile/iqoo-z10-tu...,AMOLED,6.78 inches (17.22 cm),1260x2800 px (FHD+),453 ppi,144 Hz,...,"Light sensor, Proximity sensor, Accelerometer,...",,Yes,,Android,v15,OriginOS 5,28 April 2025,Available,China


### Data Cleaning

In [23]:
# Strip whitespaces from all string cells
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [24]:
# Convert all column names to snake_case if needed (skip if already done)
df.columns = [col.strip() for col in df.columns]  # remove leading/trailing spaces

In [25]:
# Columns that should be lowercased (e.g., brand, cpu, gpu, os, camera setup)
lowercase_cols = [
    "brand", "chipset", "cpu", "gpu", "operating_system", "camera_setup",
    "user_interface", "network", "build", "usb_type-c", "wlan", "bluetooth", "gps"
]
for col in lowercase_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().str.strip()

In [26]:
# Columns that should be title case (e.g., model, colors, build, display_type)
titlecase_cols = [
    "model", "colors", "display_type", "build", "storage_type", "battery_type",
    "ram_type", "sensor"
]

for col in titlecase_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.title().str.strip()

In [27]:
# Optional: Normalize 'brand' names (e.g., xiomi ‚Üí xiaomi)
brand_mapping = {
    "xiomi": "xiaomi",
    "samsung ": "samsung",
    "realme ": "realme",
    "redmi": "xiaomi",
    "apple inc.": "apple"
    # add more if needed
}

if "brand" in df.columns:
    df["brand"] = df["brand"].replace(brand_mapping)

In [28]:
# Optional: Fix common typos or variants in OS names
os_mapping = {
    "android 13": "android 13",
    "android13": "android 13",
    "ios 17": "ios 17"
}

if "operating_system" in df.columns:
    df["operating_system"] = df["operating_system"].replace(os_mapping)