In [1]:
pip install beautifulsoup4 requests fake-useragent

Defaulting to user installation because normal site-packages is not writeable
Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
Installing collected packages: fake-useragent
Successfully installed fake-useragent-2.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd
import re

# Read the cleaned smartphone listing into a DataFrame
df = pd.read_csv('smartphones_cleaned.csv')

# Quick orientation: dimensions, column labels and the first few records
for overview in (df.shape, df.columns, df.head()):
    print(overview)

def clean_price(price):
    """Convert a raw price value into a whole number of currency units.

    Currency symbols, thousands separators and any other non-digit
    characters are discarded. A fractional part is truncated instead of
    being merged into the integer digits, so ``54999.0`` and
    ``"54,999.00"`` both yield ``54999``.

    Args:
        price: Raw cell value, e.g. ``"54,999"``, ``"Rs. 1,299"`` or a number.

    Returns:
        The price as an ``int``, or ``None`` when the value is missing or
        contains no digits at all.
    """
    if pd.isnull(price):
        return None

    if isinstance(price, float):
        # Infinite values carry no usable price.
        if price in (float('inf'), float('-inf')):
            return None
        # Truncate real floats numerically; going through str() would turn
        # 54999.0 into the digits "549990".
        price = int(price)

    text = str(price)
    # Drop a one- or two-digit decimal fraction (".5", ".00") that follows a
    # digit. Dots used as thousands separators ("1.299") are followed by three
    # digits and are therefore left for the digit filter below.
    text = re.sub(r'(?<=\d)\.\d{1,2}(?!\d)', '', text)
    digits = re.sub(r'\D', '', text)  # keep digits only
    return int(digits) if digits else None

# Derive the numeric price column by parsing every raw 'price' entry
df['price_clean'] = df['price'].map(clean_price)

def clean_ram(ram):
    """Extract the RAM quantity from a free-text memory description.

    The number attached to the word "RAM" is preferred (matched
    case-insensitively), so ``"256 GB inbuilt, 8 GB RAM"`` yields ``8``
    rather than the storage size. When the text carries no "RAM" label
    (e.g. ``"6GB"``), the first number found is used.

    The unit is not normalised: ``"512 MB RAM"`` yields ``512``.
    Fractional quantities are truncated (``"1.5 GB RAM"`` -> ``1``).

    Args:
        ram: Raw cell value such as ``"12 GB RAM, 256 GB inbuilt"``.

    Returns:
        The quantity as an ``int``, or ``None`` when the value is missing
        or contains no number.
    """
    if pd.isnull(ram):
        return None

    text = str(ram).upper()  # upper-case so the unit/label match ignores case

    # A quantity, an optional unit, then the RAM label: "12 GB RAM", "8GB RAM".
    labelled = re.search(r'(\d+(?:\.\d+)?)\s*(?:[KMGT]B)?\s*RAM', text)
    if labelled:
        return int(float(labelled.group(1)))

    # No RAM label present: fall back to the first number in the text.
    first_number = re.search(r'\d+', text)
    return int(first_number.group()) if first_number else None

# Attach the numeric RAM figure parsed from each free-text 'ram' entry
df['ram_clean'] = df['ram'].map(clean_ram)

# Similarly, clean storage, battery, camera specs as needed

# Show raw and parsed values next to each other as a sanity check
preview_columns = ['price', 'price_clean', 'ram', 'ram_clean']
print(df[preview_columns].head())

(1020, 11)
Index(['model', 'price', 'rating', 'sim', 'processor', 'ram', 'battery',
       'display', 'camera', 'card', 'os'],
      dtype='object')
                       model   price  rating  \
0              OnePlus 11 5G  54,999    89.0   
1  OnePlus Nord CE 2 Lite 5G  19,989    81.0   
2      Samsung Galaxy A14 5G  16,499    75.0   
3       Motorola Moto G62 5G  14,999    81.0   
4         Realme 10 Pro Plus  24,999    82.0   

                                       sim  \
0  Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC   
1       Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi   
2       Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi   
3       Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi   
4       Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi   

                                         processor                        ram  \
0  Snapdragon 8 Gen2, Octa Core, 3.2 GHz Processor  12 GB RAM, 256 GB inbuilt   
1     Snapdragon 695, Octa Core, 2.2 GHz Processor   6 GB RAM, 128 GB inbuilt   
2        Exynos 1330, Octa Core, 2.4 GHz P

In [None]:
# Count the missing entries in every column
print(df.isna().sum())

model            0
price            0
rating         141
sim              0
processor        0
ram              0
battery          0
display          0
camera           1
card             7
os              17
price_clean      0
ram_clean        1
dtype: int64


In [14]:
# Fill missing rating with the average rating.
# The column is kept numeric on purpose: a text placeholder such as "N/A"
# would turn it into a mixed object column and break numeric work on it.
# pd.to_numeric(..., errors='coerce') also converts any non-numeric text
# already present in the column back to NaN, so this cell can be re-run.
rating_numeric = pd.to_numeric(df['rating'], errors='coerce')
df['rating'] = rating_numeric.fillna(rating_numeric.mean())

# Fill missing camera, card, os with "Unknown"
for column in ('camera', 'card', 'os'):
    df[column] = df[column].fillna('Unknown')

# Fill missing ram_clean with median RAM value
df['ram_clean'] = df['ram_clean'].fillna(df['ram_clean'].median())

# Final check
print(df.isnull().sum())


model          0
price          0
rating         0
sim            0
processor      0
ram            0
battery        0
display        0
camera         0
card           0
os             0
price_clean    0
ram_clean      0
dtype: int64


In [5]:
import csv

# Convert the "key: value" text dump in phones_formatted.txt into a CSV file
# with one row per phone and one column per distinct key.
phones = []            # finished phone records, in file order
current_phone = {}     # record currently being assembled
fieldnames_order = []  # Keep track of the original feature order

with open("phones_formatted.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # A new phone entry begins on a line whose key is "model". The check
        # is anchored to the start of the line (stray quotes ignored, as for
        # keys below) so that a line merely containing "model:" somewhere,
        # e.g. "processor model: ...", does not split a record in two.
        if line.replace('"', '').startswith("model:"):
            if current_phone:  # Save previous phone entry
                phones.append(current_phone)
            current_phone = {}  # Reset for new phone

        # Split on the first ": " only, so values may themselves contain ": ".
        key_value = line.split(": ", 1)
        if len(key_value) == 2:
            key = key_value[0].strip().replace('"', '')  # Remove unexpected quotes
            value = key_value[1].strip()

            if key not in fieldnames_order:
                fieldnames_order.append(key)  # Preserve order of first appearance

            current_phone[key] = value

    if current_phone:  # Save last phone entry
        phones.append(current_phone)

# Write to CSV file with fieldnames in preserved order; keys a phone does not
# have are written as empty cells (csv.DictWriter default).
with open("phones_data.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames_order)
    writer.writeheader()
    writer.writerows(phones)

print("Conversion successful! 'phones_data.csv' has been created.")

Conversion successful! 'phones_data.csv' has been created.


In [6]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("phones_data.csv")

# Remove columns that carry no real information.
# A cell counts as informative only when it is neither empty (read back as
# NaN by read_csv) nor the "Not specified" placeholder. Comparing with `!=`
# alone treats NaN as a real value, so a column made of placeholders plus
# empty cells would never be dropped.
has_real_value = df.notna() & df.ne("Not specified")
df_cleaned = df.loc[:, has_real_value.any()]

# Save the cleaned CSV
df_cleaned.to_csv("phones_data_filtered.csv", index=False)

print("Columns with only 'Not specified' values have been removed. Saved as 'phones_data_filtered.csv'.")

Columns with only 'Not specified' values have been removed. Saved as 'phones_data_filtered.csv'.
