In [2]:
import requests
from bs4 import BeautifulSoup

## read the html documents

with open("data/samsung25s.html", "r") as f:
    html_docs = f.read()
    
soup = BeautifulSoup(html_docs, "html.parser")

In [7]:
soup.find(class_="specs-phone-name-title").get_text().strip()

'Samsung Galaxy S25 Edge'

In [None]:
for spec in soup.find(id="specs-list").children:
    print(spec)

In [9]:
import requests
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena URL and returns them in a structured dictionary.

    Args:
        url (str): The URL of the GSMArena phone specs page.

    Returns:
        dict: A dictionary containing the phone's specifications, organized by category.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r") as f:
            html_docs = f.read()
        
        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"Name": phone_name}
        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")  # Find the category header
                if category:
                    category_name = category.get_text().strip()
                    specs_data[category_name] = {}

                for row in table.find_all("tr"):
                    title_cell = row.find("td", class_="ttl")
                    info_cell = row.find("td", class_="nfo")

                    if title_cell and info_cell:
                        title = title_cell.get_text().strip()
                        info = info_cell.get_text().strip()
                        if category_name:
                            specs_data[category_name][title] = info
                        else:
                            specs_data[title] = info # if there is no category add it directoy to the main dict.
        return specs_data

    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}

# Example usage:

phone_data = scrape_phone_specs()

if phone_data:
    for category, specs in phone_data.items():
        print(f"--- {category} ---")
        if isinstance(specs, dict):
            for title, info in specs.items():
                print(f"{title}: {info}")
        else:
            print(specs) # print data that is not dict
        print()
else:
    print("Failed to scrape phone data.")

--- Name ---
Samsung Galaxy S25 Edge

--- Network ---
Technology: GSM / HSPA / LTE / 5G
2G bands: GSM 850 / 900 / 1800 / 1900
3G bands: HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100
4G bands: 1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 25, 26, 28,
                      32, 38, 39, 40, 41, 66 - International
: 1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 258, 260, 261
                      SA/NSA/Sub6/mmWave - USA
5G bands: 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, 41, 66, 75,
                      77, 78 SA/NSA/Sub6 - International
Speed: HSPA, LTE (CA), 5G

--- Launch ---
Announced: 2025, May 13
Status: Available. Released 2025, May 29

--- Body ---
Dimensions: 158.2 x 75.6 x 5.8 mm (6.23 x 2.98 x 0.23 in)
Weight: 163 g (5.75 oz)
Build: Glass front (Gorilla Glass Ceramic 2), titanium frame,
                      glass back (Gorilla Glass Victus 2)
SIM: Nano-SIM + Nano-SIM +
                      eSIM + eSIM (max 2
                      at a time)Nano-SIM + eSIM + eSIM (max 2 at a tim

In [10]:
import requests
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena URL and returns them in a structured dictionary
    optimized for database storage.

    Args:
        url (str): The URL of the GSMArena phone specs page.

    Returns:
        dict: A dictionary containing the phone's specifications, ready for database insertion.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r") as f:
            html_docs = f.read()
        
        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}  # Consistent naming for DB

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")  # DB-friendly category name
                    # Use a prefix for category columns to avoid naming conflicts
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_") #DB-friendly title
                            info = info_cell.get_text().strip()
                            specs_data[category_prefix + title] = info # Added prefix to column name
                else:
                   #Handling table without category, adding each row in root.
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_") #DB-friendly title
                            info = info_cell.get_text().strip()
                            specs_data[title] = info # Added prefix to column name

        return specs_data

    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}

# Example usage:
 # Replace with the actual URL
phone_data = scrape_phone_specs()

if phone_data:
    for column, value in phone_data.items():
        print(f"{column}: {value}") #Print column names and values for debugging.
else:
    print("Failed to scrape phone data.")

phone_name: Samsung Galaxy S25 Edge
network_technology: GSM / HSPA / LTE / 5G
network_2g_bands: GSM 850 / 900 / 1800 / 1900
network_3g_bands: HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100
network_4g_bands: 1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 25, 26, 28,
                      32, 38, 39, 40, 41, 66 - International
network_: 1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 258, 260, 261
                      SA/NSA/Sub6/mmWave - USA
network_5g_bands: 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, 41, 66, 75,
                      77, 78 SA/NSA/Sub6 - International
network_speed: HSPA, LTE (CA), 5G
launch_announced: 2025, May 13
launch_status: Available. Released 2025, May 29
body_dimensions: 158.2 x 75.6 x 5.8 mm (6.23 x 2.98 x 0.23 in)
body_weight: 163 g (5.75 oz)
body_build: Glass front (Gorilla Glass Ceramic 2), titanium frame,
                      glass back (Gorilla Glass Victus 2)
body_sim: Nano-SIM + Nano-SIM +
                      eSIM + eSIM (max 2
                      at a t

In [11]:
import csv
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena HTML file and returns them in a structured dictionary
    optimized for CSV storage.

    Returns:
        dict: A dictionary containing the phone's specifications, ready for CSV writing.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r", encoding="utf-8") as f:  # Explicit encoding
            html_docs = f.read()

        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[category_prefix + title] = info
                else:
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[title] = info

        return specs_data

    except FileNotFoundError:
        print("Error: The file 'data/samsung25s.html' was not found.")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}


def save_to_csv(data, filename="phone_specs.csv"):
    """
    Saves the phone specifications data to a CSV file.

    Args:
        data (dict): The dictionary containing the phone specifications.
        filename (str): The name of the CSV file to create.
    """
    if not data:
        print("No data to save to CSV.")
        return

    try:
        with open(filename, "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = data.keys()  # Column headers from dictionary keys
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()  # Write the header row
            writer.writerow(data)  # Write the data row

        print(f"Data saved to {filename}")

    except Exception as e:
        print(f"Error writing to CSV: {e}")

# Example usage:
phone_data = scrape_phone_specs()

if phone_data:
    save_to_csv(phone_data)
else:
    print("Failed to scrape phone data.")

Data saved to phone_specs.csv


In [12]:
import pandas as pd
df = pd.read_csv("phone_specs.csv")
df

Unnamed: 0,phone_name,network_technology,network_2g_bands,network_3g_bands,network_4g_bands,network_,network_5g_bands,network_speed,launch_announced,launch_status,...,battery_charging,misc_colors,misc_models,misc_sar,misc_sar_eu,misc_price,tests_performance,tests_display,tests_loudspeaker,tests_battery_(new)
0,Samsung Galaxy S25 Edge,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 2...","1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 2...","1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, ...","HSPA, LTE (CA), 5G","2025, May 13","Available. Released 2025, May 29",...,"25W wired, PD, QC2.0, 55% in 30 min\n ...","Titanium Icyblue, Titanium Silver, Titanium Je...","SM-S937U, SM-S937U1, SM-S937B, SM-S937B/DS, SM...",1.21 W/kg (head) 1.23 W/kg (body),1.24 W/kg (head) 1.25 W/kg (body),"$ 1,055.99 / C$ 1,299.99 / £ 930.13 / € 999.00...",AnTuTu: 2147084 (v10)\n G...,1416 nits max brightness (measured),-26.4 LUFS (Good),Active use score 12:06h


In [13]:
import mysql.connector
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='',
    database='samsung_phones'
)


In [14]:
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:@localhost/samsung_phones')


In [15]:
import pandas as pd

# Read CSV
df = pd.read_csv('phone_specs.csv')  # Adjust path as needed

# Save to MySQL table named 'my_table' (replace as you wish)
df.to_sql(
    name='my_table',
    con=engine,
    if_exists='replace',  # or 'append' if you want to add without deleting
    index=False
)

print("Data successfully loaded into MySQL!")


Data successfully loaded into MySQL!


## **All file name**

In [1]:
all_html_file = [
    "samsung_galaxy_a06_5g.html",
    "samsung_galaxy_a16_5g.html",
    "samsung_galaxy_a16.html",
    "samsung_galaxy_a26.html",
    "samsung_galaxy_a35.html",
    "samsung_galaxy_a36.html",
    "samsung_galaxy_a55.html",
    "samsung_galaxy_a56.html",
    "samsung_galaxy_c55.html",
    "samsung_galaxy_f06_5g.html",
    "samsung_galaxy_f15.html",
    "samsung_galaxy_f16.html",
    "samsung_galaxy_m06.html",
    "samsung_galaxy_m15.html",
    "samsung_galaxy_m16.html",
    "samsung_galaxy_m55.html",
    "samsung_galaxy_s23_ultra.html",
    "samsung_galaxy_s23.html",
    "samsung_galaxy_s24.html",
    "samsung_galaxy_s25.html",
    "samsung_galaxy_s25_ultra.html"
]

In [8]:
import csv
import os
from bs4 import BeautifulSoup

def scrape_phone_specs(html_file):
    """
    Scrapes phone specifications from a given GSMArena HTML file and returns them in a structured dictionary.

    Args:
        html_file (str): The path to the GSMArena HTML file.

    Returns:
        dict: A dictionary containing the phone's specifications, including the 'phone_name'.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open(html_file, "r", encoding="utf-8") as f:  # Explicit encoding
            html_docs = f.read()

        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}  # Explicitly add phone_name here

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[category_prefix + title] = info
                else:
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[title] = info

        return specs_data

    except FileNotFoundError:
        print(f"Error: The file '{html_file}' was not found.")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}

def save_to_csv(all_data, filename="all_phone_specs_data.csv"):
    """
    Saves the phone specifications data to a CSV file.

    Args:
        all_data (list): A list of dictionaries, where each dictionary contains
                         the phone specifications for one phone.
        filename (str): The name of the CSV file to create.
    """
    if not all_data:
        print("No data to save to CSV.")
        return

    try:
        with open(filename, "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = set()  # Use a set to collect all unique keys
            for data in all_data:
                fieldnames.update(data.keys())
            fieldnames = sorted(list(fieldnames))  # Convert to list and sort

            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()  # Write the header row
            writer.writerows(all_data)  # Write all data rows

        print(f"Data saved to {filename}")

    except Exception as e:
        print(f"Error writing to CSV: {e}")



# List of HTML files to process
all_html_file = [
    "samsung_galaxy_a06_5g.html",
    "samsung_galaxy_a16_5g.html",
    "samsung_galaxy_a16.html",
    "samsung_galaxy_a26.html",
    "samsung_galaxy_a35.html",
    "samsung_galaxy_a36.html",
    "samsung_galaxy_a55.html",
    "samsung_galaxy_a56.html",
    "samsung_galaxy_c55.html",
    "samsung_galaxy_f06_5g.html",
    "samsung_galaxy_f15.html",
    "samsung_galaxy_f16.html",
    "samsung_galaxy_m06.html",
    "samsung_galaxy_m15.html",
    "samsung_galaxy_m16.html",
    "samsung_galaxy_m55.html",
    "samsung_galaxy_s23_ultra.html",
    "samsung_galaxy_s23.html",
    "samsung_galaxy_s24.html",
    "samsung_galaxy_s25.html",
    "samsung_galaxy_s25_ultra.html"
]

# Directory where HTML files are located
data_directory = "data"

# Collect all phone data into a list
all_phone_data = []

# Loop through each HTML file and scrape the data
for html_file in all_html_file:
    # Construct the full path to the HTML file
    full_html_path = os.path.join(data_directory, html_file)

    # Scrape the phone specifications
    phone_data = scrape_phone_specs(full_html_path)

    if phone_data:
        all_phone_data.append(phone_data) # append all data into a list
    else:
        print(f"Failed to scrape data from {html_file}")

# Save all the data to a single CSV file
save_to_csv(all_phone_data, "all_phone_specs_data.csv") #call the save func and add name of csv file

print("Finished processing all HTML files and saving to all_phone_specs_data.csv")

Data saved to all_phone_specs_data.csv
Finished processing all HTML files and saving to all_phone_specs_data.csv


In [15]:
import pandas as pd
df = pd.read_csv("all_phone_specs_data.csv")
df.shape

(21, 59)

In [17]:
import pandas as pd
df = pd.read_csv("all_phone_specs.csv")
df.sample(3)

Unnamed: 0,phone_name,network_technology,network_2g_bands,network_3g_bands,network_4g_bands,network_5g_bands,network_speed,launch_announced,launch_status,body_dimensions,...,tests_loudspeaker,tests_battery_(new),display_protection,memory_,main_camera_quad,selfie_camera_features,sound_,tests_camera,tests_battery_(old),display_
12,Samsung Galaxy M06,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 26, 28, 38, 4...","1, 3, 5, 7, 8, 26, 28, 40, 41, 66, 77, 78 SA/N...","HSPA, LTE, 5G","2025, February 27","Available. Released 2025, March 07",167.4 x 77.4 x 8 mm (6.59 x 3.05 x 0.31 in),...,,,,,,,,,,
15,Samsung Galaxy M55,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 25, 26, 28, 3...","1, 3, 5, 7, 8, 20, 28, 38, 40, 41, 66, 77, 78 ...","HSPA, LTE, 5G","2024, March 28","Available. Released 2024, March 28",163.9 x 76.5 x 7.8 mm (6.45 x 3.01 x 0.31 in),...,,,,,,,,,,
6,Samsung Galaxy A55,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 25, 26, 28, 3...","1, 3, 5, 7, 8, 20, 28, 38, 40, 41, 66, 77, 78 ...","HSPA, LTE, 5G","2024, March 11","Available. Released 2024, March 15",161.1 x 77.4 x 8.2 mm (6.34 x 3.05 x 0.32 in),...,-25.7 LUFS (Very good),Active use score 13:27h,Corning Gorilla Glass Victus+,,,,,,,


## Transform all Scraping data into pandas dataFrame to csv

In [18]:
import csv
from bs4 import BeautifulSoup
import os

all_html_files = [
    "samsung_galaxy_a06_5g.html",
    "samsung_galaxy_a16_5g.html",
    "samsung_galaxy_a16.html",
    "samsung_galaxy_a26.html",
    "samsung_galaxy_a35.html",
    "samsung_galaxy_a36.html",
    "samsung_galaxy_a55.html",
    "samsung_galaxy_a56.html",
    "samsung_galaxy_c55.html",
    "samsung_galaxy_f06_5g.html",
    "samsung_galaxy_f15.html",
    "samsung_galaxy_f16.html",
    "samsung_galaxy_m06.html",
    "samsung_galaxy_m15.html",
    "samsung_galaxy_m16.html",
    "samsung_galaxy_m55.html",
    "samsung_galaxy_s23_ultra.html",
    "samsung_galaxy_s23.html",
    "samsung_galaxy_s24.html",
    "samsung_galaxy_s25.html",
    "samsung_galaxy_s25_ultra.html"
]

def scrape_phone_specs(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            html_docs = f.read()

        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}
        field_order = ["phone_name"]  # track order

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")
                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            key = category_prefix + title
                            specs_data[key] = info
                            if key not in field_order:
                                field_order.append(key)
                else:
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")
                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[title] = info
                            if title not in field_order:
                                field_order.append(title)

        return specs_data, field_order

    except Exception as e:
        print(f"Error scraping {file_path}: {e}")
        return {}, []

# Collect all data
all_data = []
fieldnames_in_order = []

for file_name in all_html_files:
    file_path = os.path.join("data", file_name)
    data, fields = scrape_phone_specs(file_path)
    if data:
        all_data.append(data)
        for f in fields:
            if f not in fieldnames_in_order:
                fieldnames_in_order.append(f)

# Save all to CSV
csv_filename = "all_phone_specs_up.csv"
try:
    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames_in_order)
        writer.writeheader()
        for phone in all_data:
            writer.writerow(phone)
    print(f"✅ Successfully wrote {len(all_data)} phones to {csv_filename}")
except Exception as e:
    print(f"Error writing to CSV: {e}")


✅ Successfully wrote 21 phones to all_phone_specs_up.csv


In [1]:
import pandas as pd
df = pd.read_csv("all_phone_specs_up.csv")
df.sample(3)

Unnamed: 0,phone_name,network_technology,network_2g_bands,network_3g_bands,network_4g_bands,network_5g_bands,network_speed,launch_announced,launch_status,body_dimensions,...,tests_loudspeaker,tests_battery_(new),display_protection,memory_,main_camera_quad,selfie_camera_features,sound_,tests_camera,tests_battery_(old),display_
3,Samsung Galaxy A26,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 26, 28, 32, 3...","1, 3, 5, 7, 8, 20, 28, 38, 40, 41, 66, 77, 78 ...","HSPA, LTE, 5G","2025, March 02","Available. Released 2025, March 19",164 x 77.5 x 7.7 mm (6.46 x 3.05 x 0.30 in),...,-27.1 LUFS (Good),Active use score 10:44h,Corning Gorilla Glass Victus+,,,,,,,
14,Samsung Galaxy M16,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 26, 28, 38, 4...","1, 3, 5, 7, 8, 28, 40, 41, 66, 77, 78 SA/NSA/Sub6","HSPA, LTE, 5G","2025, February 27","Available. Released 2025, March 05",164.4 x 77.9 x 7.9 mm (6.47 x 3.07 x 0.31 in),...,,,,,,,,,,
15,Samsung Galaxy M55,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 25, 26, 28, 3...","1, 3, 5, 7, 8, 20, 28, 38, 40, 41, 66, 77, 78 ...","HSPA, LTE, 5G","2024, March 28","Available. Released 2024, March 28",163.9 x 76.5 x 7.8 mm (6.45 x 3.01 x 0.31 in),...,,,,,,,,,,


## Data Preprocessing

In [2]:
df.isnull().sum()

phone_name                 0
network_technology         0
network_2g_bands           0
network_3g_bands           0
network_4g_bands           0
network_5g_bands           1
network_speed              0
launch_announced           0
launch_status              0
body_dimensions            0
body_weight                0
body_build                 1
body_sim                   0
body_                      6
display_type               0
display_size               0
display_resolution         0
platform_os                0
platform_chipset           0
platform_cpu               0
platform_gpu               0
memory_card_slot           0
memory_internal            0
main_camera_dual          18
main_camera_features       0
main_camera_video          0
selfie_camera_single       0
selfie_camera_video        0
sound_loudspeaker          0
sound_3.5mm_jack           0
comms_wlan                 0
comms_bluetooth            0
comms_positioning          0
comms_nfc                  0
comms_radio   

In [3]:
cols_to_drop = [
    'network_', 'body_', 'features_', 'sound_', 'tests_', 
    'display_', 'memory_', 'main_camera_quad', 'tests_camera', 'tests_battery_(old)'
]
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
df.shape

(21, 50)

In [4]:
# Drop columns whose names end with an underscore
df = df.loc[:, ~df.columns.str.endswith('_')]

# (optional) Display the columns after dropping
print(df.columns)


Index(['phone_name', 'network_technology', 'network_2g_bands',
       'network_3g_bands', 'network_4g_bands', 'network_5g_bands',
       'network_speed', 'launch_announced', 'launch_status', 'body_dimensions',
       'body_weight', 'body_build', 'body_sim', 'display_type', 'display_size',
       'display_resolution', 'platform_os', 'platform_chipset', 'platform_cpu',
       'platform_gpu', 'memory_card_slot', 'memory_internal',
       'main_camera_dual', 'main_camera_features', 'main_camera_video',
       'selfie_camera_single', 'selfie_camera_video', 'sound_loudspeaker',
       'sound_3.5mm_jack', 'comms_wlan', 'comms_bluetooth',
       'comms_positioning', 'comms_nfc', 'comms_radio', 'comms_usb',
       'features_sensors', 'battery_type', 'battery_charging', 'misc_colors',
       'misc_models', 'misc_sar', 'misc_sar_eu', 'misc_price',
       'main_camera_triple', 'tests_performance', 'tests_display',
       'tests_loudspeaker', 'tests_battery_(new)', 'display_protection',
       'sel

In [5]:
df.shape

(21, 50)

In [6]:
df.isnull().sum()

phone_name                 0
network_technology         0
network_2g_bands           0
network_3g_bands           0
network_4g_bands           0
network_5g_bands           1
network_speed              0
launch_announced           0
launch_status              0
body_dimensions            0
body_weight                0
body_build                 1
body_sim                   0
display_type               0
display_size               0
display_resolution         0
platform_os                0
platform_chipset           0
platform_cpu               0
platform_gpu               0
memory_card_slot           0
memory_internal            0
main_camera_dual          18
main_camera_features       0
main_camera_video          0
selfie_camera_single       0
selfie_camera_video        0
sound_loudspeaker          0
sound_3.5mm_jack           0
comms_wlan                 0
comms_bluetooth            0
comms_positioning          0
comms_nfc                  0
comms_radio                0
comms_usb     

In [7]:
cols_to_drop = [
    "main_camera_dual", "display_protection", "selfie_camera_features"
]
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
df.shape

(21, 47)

In [9]:
# Save the updated DataFrame to a CSV file
df.to_csv('updated_phone_specs.csv', index=False)

print("✅ Saved cleaned data to updated_phone_specs.csv")

✅ Saved cleaned data to updated_phone_specs.csv


# **My Final Data**

In [14]:
df = pd.read_csv("updated_phone_specs.csv")
df.isnull().sum()

phone_name              0
network_technology      0
network_2g_bands        0
network_3g_bands        0
network_4g_bands        0
network_5g_bands        0
network_speed           0
launch_announced        0
launch_status           0
body_dimensions         0
body_weight             0
body_build              0
body_sim                0
display_type            0
display_size            0
display_resolution      0
platform_os             0
platform_chipset        0
platform_cpu            0
platform_gpu            0
memory_card_slot        0
memory_internal         0
main_camera_features    0
main_camera_video       0
selfie_camera_single    0
selfie_camera_video     0
sound_loudspeaker       0
sound_3.5mm_jack        0
comms_wlan              0
comms_bluetooth         0
comms_positioning       0
comms_nfc               0
comms_radio             0
comms_usb               0
features_sensors        0
battery_type            0
battery_charging        0
misc_colors             0
misc_models 

In [13]:
# Save the updated DataFrame to a CSV file
df.to_csv('updated_phone_specs.csv', index=False)

print("✅ Saved cleaned data to updated_phone_specs.csv")

✅ Saved cleaned data to updated_phone_specs.csv
