In [2]:
import requests
from bs4 import BeautifulSoup

## read the html documents

with open("data/samsung25s.html", "r") as f:
    html_docs = f.read()
    
soup = BeautifulSoup(html_docs, "html.parser")

In [7]:
soup.find(class_="specs-phone-name-title").get_text().strip()

'Samsung Galaxy S25 Edge'

In [None]:
for spec in soup.find(id="specs-list").children:
    print(spec)

In [9]:
import requests
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena URL and returns them in a structured dictionary.

    Args:
        url (str): The URL of the GSMArena phone specs page.

    Returns:
        dict: A dictionary containing the phone's specifications, organized by category.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r") as f:
            html_docs = f.read()
        
        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"Name": phone_name}
        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")  # Find the category header
                if category:
                    category_name = category.get_text().strip()
                    specs_data[category_name] = {}

                for row in table.find_all("tr"):
                    title_cell = row.find("td", class_="ttl")
                    info_cell = row.find("td", class_="nfo")

                    if title_cell and info_cell:
                        title = title_cell.get_text().strip()
                        info = info_cell.get_text().strip()
                        if category_name:
                            specs_data[category_name][title] = info
                        else:
                            specs_data[title] = info # if there is no category add it directoy to the main dict.
        return specs_data

    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}

# Example usage:

phone_data = scrape_phone_specs()

if phone_data:
    for category, specs in phone_data.items():
        print(f"--- {category} ---")
        if isinstance(specs, dict):
            for title, info in specs.items():
                print(f"{title}: {info}")
        else:
            print(specs) # print data that is not dict
        print()
else:
    print("Failed to scrape phone data.")

--- Name ---
Samsung Galaxy S25 Edge

--- Network ---
Technology: GSM / HSPA / LTE / 5G
2G bands: GSM 850 / 900 / 1800 / 1900
3G bands: HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100
4G bands: 1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 25, 26, 28,
                      32, 38, 39, 40, 41, 66 - International
: 1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 258, 260, 261
                      SA/NSA/Sub6/mmWave - USA
5G bands: 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, 41, 66, 75,
                      77, 78 SA/NSA/Sub6 - International
Speed: HSPA, LTE (CA), 5G

--- Launch ---
Announced: 2025, May 13
Status: Available. Released 2025, May 29

--- Body ---
Dimensions: 158.2 x 75.6 x 5.8 mm (6.23 x 2.98 x 0.23 in)
Weight: 163 g (5.75 oz)
Build: Glass front (Gorilla Glass Ceramic 2), titanium frame,
                      glass back (Gorilla Glass Victus 2)
SIM: Nano-SIM + Nano-SIM +
                      eSIM + eSIM (max 2
                      at a time)Nano-SIM + eSIM + eSIM (max 2 at a tim

In [10]:
import requests
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena URL and returns them in a structured dictionary
    optimized for database storage.

    Args:
        url (str): The URL of the GSMArena phone specs page.

    Returns:
        dict: A dictionary containing the phone's specifications, ready for database insertion.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r") as f:
            html_docs = f.read()
        
        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}  # Consistent naming for DB

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")  # DB-friendly category name
                    # Use a prefix for category columns to avoid naming conflicts
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_") #DB-friendly title
                            info = info_cell.get_text().strip()
                            specs_data[category_prefix + title] = info # Added prefix to column name
                else:
                   #Handling table without category, adding each row in root.
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_") #DB-friendly title
                            info = info_cell.get_text().strip()
                            specs_data[title] = info # Added prefix to column name

        return specs_data

    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}

# Example usage:
 # Replace with the actual URL
phone_data = scrape_phone_specs()

if phone_data:
    for column, value in phone_data.items():
        print(f"{column}: {value}") #Print column names and values for debugging.
else:
    print("Failed to scrape phone data.")

phone_name: Samsung Galaxy S25 Edge
network_technology: GSM / HSPA / LTE / 5G
network_2g_bands: GSM 850 / 900 / 1800 / 1900
network_3g_bands: HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100
network_4g_bands: 1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 25, 26, 28,
                      32, 38, 39, 40, 41, 66 - International
network_: 1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 258, 260, 261
                      SA/NSA/Sub6/mmWave - USA
network_5g_bands: 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, 41, 66, 75,
                      77, 78 SA/NSA/Sub6 - International
network_speed: HSPA, LTE (CA), 5G
launch_announced: 2025, May 13
launch_status: Available. Released 2025, May 29
body_dimensions: 158.2 x 75.6 x 5.8 mm (6.23 x 2.98 x 0.23 in)
body_weight: 163 g (5.75 oz)
body_build: Glass front (Gorilla Glass Ceramic 2), titanium frame,
                      glass back (Gorilla Glass Victus 2)
body_sim: Nano-SIM + Nano-SIM +
                      eSIM + eSIM (max 2
                      at a t

In [11]:
import csv
from bs4 import BeautifulSoup

def scrape_phone_specs():
    """
    Scrapes phone specifications from a given GSMArena HTML file and returns them in a structured dictionary
    optimized for CSV storage.

    Returns:
        dict: A dictionary containing the phone's specifications, ready for CSV writing.
              Returns an empty dictionary if scraping fails.
    """
    try:
        with open("data/samsung25s.html", "r", encoding="utf-8") as f:  # Explicit encoding
            html_docs = f.read()

        soup = BeautifulSoup(html_docs, "html.parser")

        phone_name = soup.find(class_="specs-phone-name-title").get_text().strip()
        specs_data = {"phone_name": phone_name}

        specs_list = soup.find(id="specs-list")

        if specs_list:
            for table in specs_list.find_all("table"):
                category = table.find("th", scope="row")
                if category:
                    category_name = category.get_text().strip().lower().replace(" ", "_")
                    category_prefix = category_name + "_"
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[category_prefix + title] = info
                else:
                    for row in table.find_all("tr"):
                        title_cell = row.find("td", class_="ttl")
                        info_cell = row.find("td", class_="nfo")

                        if title_cell and info_cell:
                            title = title_cell.get_text().strip().lower().replace(" ", "_")
                            info = info_cell.get_text().strip()
                            specs_data[title] = info

        return specs_data

    except FileNotFoundError:
        print("Error: The file 'data/samsung25s.html' was not found.")
        return {}
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
        return {}


def save_to_csv(data, filename="phone_specs.csv"):
    """
    Saves the phone specifications data to a CSV file.

    Args:
        data (dict): The dictionary containing the phone specifications.
        filename (str): The name of the CSV file to create.
    """
    if not data:
        print("No data to save to CSV.")
        return

    try:
        with open(filename, "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = data.keys()  # Column headers from dictionary keys
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()  # Write the header row
            writer.writerow(data)  # Write the data row

        print(f"Data saved to {filename}")

    except Exception as e:
        print(f"Error writing to CSV: {e}")

# Example usage:
phone_data = scrape_phone_specs()

if phone_data:
    save_to_csv(phone_data)
else:
    print("Failed to scrape phone data.")

Data saved to phone_specs.csv


In [12]:
import pandas as pd
df = pd.read_csv("phone_specs.csv")
df

Unnamed: 0,phone_name,network_technology,network_2g_bands,network_3g_bands,network_4g_bands,network_,network_5g_bands,network_speed,launch_announced,launch_status,...,battery_charging,misc_colors,misc_models,misc_sar,misc_sar_eu,misc_price,tests_performance,tests_display,tests_loudspeaker,tests_battery_(new)
0,Samsung Galaxy S25 Edge,GSM / HSPA / LTE / 5G,GSM 850 / 900 / 1800 / 1900,HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100,"1, 2, 3, 4, 5, 7, 8, 12, 13, 17, 18, 19, 20, 2...","1, 2, 5, 7, 25, 28, 41, 66, 71, 77, 78, 257, 2...","1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 38, 40, ...","HSPA, LTE (CA), 5G","2025, May 13","Available. Released 2025, May 29",...,"25W wired, PD, QC2.0, 55% in 30 min\n ...","Titanium Icyblue, Titanium Silver, Titanium Je...","SM-S937U, SM-S937U1, SM-S937B, SM-S937B/DS, SM...",1.21 W/kg (head) 1.23 W/kg (body),1.24 W/kg (head) 1.25 W/kg (body),"$ 1,055.99 / C$ 1,299.99 / £ 930.13 / € 999.00...",AnTuTu: 2147084 (v10)\n G...,1416 nits max brightness (measured),-26.4 LUFS (Good),Active use score 12:06h


In [13]:
import mysql.connector
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='',
    database='samsung_phones'
)


In [14]:
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:@localhost/samsung_phones')


In [15]:
import pandas as pd

# Read CSV
df = pd.read_csv('phone_specs.csv')  # Adjust path as needed

# Save to MySQL table named 'my_table' (replace as you wish)
df.to_sql(
    name='my_table',
    con=engine,
    if_exists='replace',  # or 'append' if you want to add without deleting
    index=False
)

print("Data successfully loaded into MySQL!")


Data successfully loaded into MySQL!
