We used Python version 3.11.5.

# 0. Loading Packages

In [1]:
# Create a virtual environment in the folder "ENV" (relative to the current working directory).
# NOTE(review): this only creates the environment; it presumably does not switch the running
# kernel to it, so the imports below would still come from the kernel's own interpreter - confirm.
!python -m venv ENV # creating virtual environment

In [2]:
import requests, re
from bs4 import BeautifulSoup

# 1. Scraping URLs of Phone Products

#### DatArt

In [3]:
# Step 1: base URL of the shop; relative product links are appended to it
DatArt_basic_url = "https://www.datart.cz"

# Step 2: category page listing all products at once (the product limit was raised by hand via the "limit" query parameter)
DataArt_category_url = DatArt_basic_url + "/mobilni-telefony.html?limit=1000" # (17.01.2024: there are 946 phones listed)

# 3 Defining a function to scrape all product URLs from the category website
def get_product_urls_DatArt(basic_url: str, category_url: str):
    """Collect the product URLs listed on a single DatArt category page.

    Args:
        basic_url: URL prefix that is prepended to every product href.
        category_url: URL of the category page to download.

    Returns:
        A list of product URLs (basic_url + href of the anchor inside each
        '.item-title' element). The list is empty when the request does not
        return status code 200.

    Raises:
        TypeError: if either argument is not a string.
    """
    # Check data types of arguments
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")

    if not isinstance(category_url, str):
        raise TypeError("Category URL must be a string.")

    response = requests.get(category_url)

    # Start from an empty result so that a failed request still returns a list
    # instead of raising UnboundLocalError further down
    product_urls = []

    # Check status code of the response
    if response.status_code == 200: # 200 -> possible to get data
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract product URLs using appropriate selectors
        product_urls = [basic_url + item.a.get('href') for item in soup.select('.item-title')]
    else:
        print(f"Failed fetching. Response error - {response.status_code}: {response.reason}")

    print(f"{len(product_urls)} products found.")
    return product_urls

# Run the DatArt scraper and display the collected product URLs
product_urls_DatArt = get_product_urls_DatArt(
    basic_url=DatArt_basic_url,
    category_url=DataArt_category_url,
)
product_urls_DatArt

953 products found.


['https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-natural-titanium-mtux3sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-black-titanium-mtuv3sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-pink-mtp13sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-black-titanium-mtv13sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-12-64-gb-blue-mgj83cn-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-max-256gb-black-titanium-mu773sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-black-mtp03sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-natural-titanium-mtv53sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-blue-titanium-mtv03sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-blue-mtp43sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-wh

#### CZC

In [118]:
# Step 1: base URL of the shop; relative product links are appended to it
CZC_basic_url = "https://www.czc.cz"

# Step 2: category page with the product listing (the number of products per page cannot be raised by hand here)
CZC_category_url = "https://www.czc.cz/mobilni-telefony/produkty"

# Step 3: every page of the category has to be scraped on its own;
# the plain category URL is the first page, so the list starts with it
CZC_category_urls = [CZC_category_url]

# Step 4: build the URL of every further page

# Offsets of the first product on each further page
# Currently (17.01.2024) there are 818 products; every page holds 27 products
page_limits = [*range(27, 1000, 27)]

# One URL per offset, appended after the first page
for page_limit in page_limits:
    url_with_page_limit = CZC_category_url + "?q-first=" + str(page_limit)
    CZC_category_urls += [url_with_page_limit]

# 5 Defining a function to scrape all product URLs from each page of the category website
def get_product_urls_CZC(basic_url: str, category_urls: list):
    """Collect the product URLs from a list of CZC category pages.

    Args:
        basic_url: URL prefix that is prepended to every product href.
        category_urls: URLs of the category pages, downloaded in order.

    Returns:
        A flat list of product URLs (basic_url + href of the anchor inside
        each '.tile-title' element). Pages that do not return status code 200
        are reported and skipped.

    Raises:
        TypeError: if basic_url is not a string or category_urls is not a list.
    """
    # Check data types of arguments
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")

    if not isinstance(category_urls, list):
        raise TypeError("Category URLs must be a list.")

    product_urls = []

    # Scrape product URLs across all product pages
    for page_num, url in enumerate(category_urls, start=1):

        response = requests.get(url)

        # Check status code of the response
        if response.status_code == 200: # 200 -> possible to get data
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extend the flat result directly; no nested lists that need flattening afterwards
            product_urls.extend(basic_url + item.a.get('href') for item in soup.select('.tile-title'))
            print(f"Page {page_num} complete")

        else:
            # f-string so that page number, status code and reason are actually filled in
            print(f"Failed fetching page {page_num}. Response error - {response.status_code}: {response.reason}")

    print(f"{len(product_urls)} products found.")
    return product_urls


# Run the CZC scraper over all generated page URLs and display the result
product_urls_CZC = get_product_urls_CZC(
    basic_url=CZC_basic_url,
    category_urls=CZC_category_urls,
)
product_urls_CZC

Page 1 complete
Page 2 complete
Page 3 complete
Page 4 complete
Page 5 complete
Page 6 complete
Page 7 complete
Page 8 complete
Page 9 complete
Page 10 complete
Page 11 complete
Page 12 complete
Page 13 complete
Page 14 complete
Page 15 complete
Page 16 complete
Page 17 complete
Page 18 complete
Page 19 complete
Page 20 complete
Page 21 complete
Page 22 complete
Page 23 complete
Page 24 complete
Page 25 complete
Page 26 complete
Page 27 complete
Page 28 complete
Page 29 complete
Page 30 complete
Page 31 complete
Page 32 complete
Page 33 complete
Page 34 complete
Page 35 complete
Page 36 complete
Page 37 complete
Page 38 complete
818 products found.


['https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-lime/369599/produkt',
 'https://www.czc.cz/apple-iphone-15-128gb-black/383646/produkt',
 'https://www.czc.cz/apple-iphone-15-plus-128gb-black/383647/produkt',
 'https://www.czc.cz/apple-iphone-15-pro-128gb-black-titanium/383648/produkt',
 'https://www.czc.cz/apple-iphone-15-pro-max-256gb-black-titanium/383649/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-white/369606/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-graphite/369605/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesone-violet/369604/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-lime/369603/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-white/369602/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-graphite/369601/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesone-violet/369600/produkt',
 'https://www.czc.cz/go

#### Electroworld

In [4]:
# Step 1: base URL of the shop; relative product links are appended to it
electroworld_basic_url = "https://www.electroworld.cz"

# Step 2: category page with the product listing (the number of products per page cannot be raised by hand here)
# sorted by most expensive gives unique products for each page
electroworld_category_url = electroworld_basic_url + "/chytre-mobily/sort-by_mostExpensive"

# Step 3: every page of the category has to be scraped on its own;
# the plain category URL is the first page, so the list starts with it
electroworld_category_urls = [electroworld_category_url]

# Step 4: build the URL of every further page

# Page numbers after the first page
# Currently (17.01.2024) there are 42 pages with 740 products
pages = [*range(2, 43)]

# One URL per page number, appended after the first page
for page in pages:
    url_with_page = electroworld_category_url + "?page=" + str(page)
    electroworld_category_urls += [url_with_page]

# 5 Scraper for the product URLs on every page of the Electroworld category
def get_product_urls_electroworld(basic_url, category_urls):
    """Collect the product URLs from a list of Electroworld category pages.

    Args:
        basic_url: URL prefix (string) prepended to every product href.
        category_urls: list of category page URLs, downloaded in order.

    Returns:
        A flat list of product URLs (basic_url + href of every
        '.product-box__link' element), or an empty list as soon as one page
        does not return status code 200.

    Raises:
        TypeError: if basic_url is not a string or category_urls is not a list.
    """
    # Validate argument types first
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")
    if not isinstance(category_urls, list):
        raise TypeError("Category URLs must be a list.")

    collected_urls = []

    for page_num, url in enumerate(category_urls, start=1):
        response = requests.get(url)

        # Any page that cannot be fetched aborts the whole run with an empty result
        if response.status_code != 200:
            print(f"Failed fetching page {page_num}. Response error - {response.status_code}: {response.reason}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        page_urls = [basic_url + link.get('href') for link in soup.select('.product-box__link')]
        collected_urls += page_urls
        print(f"Page {page_num} complete")

    print(f"{len(collected_urls)} products found.")
    return collected_urls


# Run the Electroworld scraper over all generated page URLs and display the result
product_urls_electroworld = get_product_urls_electroworld(
    basic_url=electroworld_basic_url,
    category_urls=electroworld_category_urls,
)
product_urls_electroworld

Page 1 complete
Page 2 complete
Page 3 complete
Page 4 complete
Page 5 complete
Page 6 complete
Page 7 complete
Page 8 complete
Page 9 complete
Page 10 complete
Page 11 complete
Page 12 complete
Page 13 complete
Page 14 complete
Page 15 complete
Page 16 complete
Page 17 complete
Page 18 complete
Page 19 complete
Page 20 complete
Page 21 complete
Page 22 complete
Page 23 complete
Page 24 complete
Page 25 complete
Page 26 complete
Page 27 complete
Page 28 complete
Page 29 complete
Page 30 complete
Page 31 complete
Page 32 complete
Page 33 complete
Page 34 complete
Page 35 complete
Page 36 complete
Page 37 complete
Page 38 complete
Page 39 complete
Page 40 complete
Page 41 complete
Page 42 complete
755 products found.


['https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-natural-titanium-prirodni-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-white-titanium-bily-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-blue-titanium-modry-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-natural-titanium-prirodni-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-blue-titanium-modry-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-white-titanium-bily-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-natural-titanium-prirodni-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-white-titanium-bily-titan',
 'https://www.electroworld.cz/ap

Improvements:
- For the CZC code: handle the pages automatically instead of manually setting the product number limit of 1000 above. The page size of 27 can stay if necessary (or try out the numbers 1 to 50 until data is obtained).

# 2. Scraping Product Information 

#### DatArt

In [40]:
# Alternative product pages that can be swapped in for manual testing:
#product_url = 'https://www.datart.cz/mobilni-telefon-samsung-galaxy-a33-5g-6-gb-128-gb-sm-a336bzkgeee-cerny.html'
#product_url = 'https://www.datart.cz/bazar/mobilni-telefon-motorola-edge-40-5g-8-gb-256-gb-eclipse-black-pay40006pl-rozbaleno-24-mesicu-zaruka.html'
product_url = 'https://www.datart.cz/mobilni-telefon-samsung-galaxy-s23-5g-8-gb-128-gb-sm-s911bzkdeue-cerny.html'

# Download the sample product page
page = requests.get(product_url)
# Stop with an HTTP error instead of parsing an error page when the download failed
page.raise_for_status()
# Parsed sample page, stored in the notebook-level variable `soup`
soup = BeautifulSoup(page.text, 'html.parser')

In [53]:
# Functions to scrape the individual product characteristics from a given product (page)
# For some variables (product colour, number of cores) the Czech labels are translated into English labels

# Extract the title of a given product (page) as a string
def get_product_title_DatArt(soup_product_page):
    """Return the text of the first '.product-detail-title' element as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        title_nodes = soup_product_page.select('.product-detail-title')
        first_title = title_nodes[0]
        return str(first_title.text)
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Function to return the price (in CZK) for a given product (page) as an integer
def get_product_price_DatArt(soup_product_page):
    """Return the price shown in the first '.actual' element as an integer.

    Newlines, tabs, non-breaking spaces and the 'Kč' currency marker are
    removed from the element text before it is converted.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    # Read from the page that was passed in, not from a notebook-level `soup` variable
    price_text = soup_product_page.select('.actual')[0].text
    for noise in ('\n', '\t', '\xa0', 'Kč'):
        price_text = price_text.replace(noise, '')
    return int(price_text.strip())

# Function to return the rating for a given product (page) as a float number
def get_product_rating_DatArt(soup_product_page):
    """Return the rating value found in the first '.rating-wrap' element.

    After whitespace is removed, the element text is expected to look like
    '<rating>(<number of ratings>)', e.g. '4.8(25)'.

    Returns:
        The rating as a float, or None when the page has no '.rating-wrap'
        element or its text does not have the expected format.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    # Read from the page that was passed in, not from a notebook-level `soup` variable
    rating_nodes = soup_product_page.select('.rating-wrap')
    if not rating_nodes:
        # No rating element on the page
        return None
    rating_text = rating_nodes[0].text.replace('\n', '').replace('\t', '').strip().replace(" ", "")
    parsed_rating = re.match(r'(\d+\.\d+)\((\d+)\)', rating_text)

    if parsed_rating is None:
        return None
    return float(parsed_rating.group(1))

# Function to return the number of ratings for a given product (page) as an integer
def get_product_no_ratings_DatArt(soup_product_page):
    """Return the number of ratings found in the first '.rating-wrap' element.

    After whitespace is removed, the element text is expected to look like
    '<rating>(<number of ratings>)', e.g. '4.8(25)'.

    Returns:
        The number of ratings as an integer, or None when the page has no
        '.rating-wrap' element or its text does not have the expected format.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    # Read from the page that was passed in, not from a notebook-level `soup` variable
    rating_nodes = soup_product_page.select('.rating-wrap')
    if not rating_nodes:
        # No rating element on the page
        return None
    rating_text = rating_nodes[0].text.replace('\n', '').replace('\t', '').strip().replace(" ", "")
    parsed_rating = re.match(r'(\d+\.\d+)\((\d+)\)', rating_text)

    # Return None explicitly when the text does not match, instead of
    # failing on an unassigned result variable
    if parsed_rating is None:
        return None
    return int(parsed_rating.group(2))

# Extract the display size (in inches) of a given product (page) as a float number
def get_product_display_size_DatArt(soup_product_page):
    """Return the value of the 'Úhlopříčka displeje' table row as a float.

    The cell text is stripped and its decimal comma is replaced by a dot
    before conversion.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        size_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Úhlopříčka displeje") + td')
        size_text = size_cell.get_text(strip=True)
        return float(size_text.replace(',', '.'))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the display resolution (width in pixels) of a given product (page) as an integer
def get_product_resolution_w_DatArt(soup_product_page):
    """Return the first number of the '<number> × <number>' pair in the 'Rozlišení displeje' row.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        resolution_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Rozlišení displeje") + td')
        resolution_match = re.search(r'(\d+) × (\d+)', resolution_cell.get_text())
        width_text = resolution_match.group(1)
        return int(width_text)
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the display resolution (height in pixels) of a given product (page) as an integer
def get_product_resolution_h_DatArt(soup_product_page):
    """Return the second number of the '<number> × <number>' pair in the 'Rozlišení displeje' row.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        resolution_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Rozlišení displeje") + td')
        resolution_match = re.search(r'(\d+) × (\d+)', resolution_cell.get_text())
        height_text = resolution_match.group(2)
        return int(height_text)
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the total display resolution (width times height in pixels) of a given product (page) as an integer
def get_product_resolution_tot_DatArt(soup_product_page):
    """Return the product of both numbers of the '<number> × <number>' pair in the 'Rozlišení displeje' row.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        resolution_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Rozlišení displeje") + td')
        resolution_match = re.search(r'(\d+) × (\d+)', resolution_cell.get_text())
        width_px, height_px = (int(resolution_match.group(position)) for position in (1, 2))
        return width_px * height_px
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the cut-out shape of a given product (page) as an English string
def get_product_cutout_shape_DatArt(soup_product_page):
    """Return the translated value of the 'Tvar výřezu' table row.

    The Czech labels 'kapka', 'průstřel' and 'bez výřezu' are translated to
    'drop', 'bullet hole' and 'without cut-out'; every value that is neither
    one of these labels nor already one of the English labels becomes
    'Other cutout-shape'.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    shape_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Tvar výřezu") + td')
    raw_shape = str(shape_cell.get_text())
    czech_to_english = {
        "kapka": "drop",
        "průstřel": "bullet hole",
        "bez výřezu": "without cut-out",
    }
    # Values that are already English labels pass through unchanged
    shape = czech_to_english.get(raw_shape, raw_shape)
    return shape if shape in czech_to_english.values() else "Other cutout-shape"

# Function to return the display refresh rate (in Hertz) for a given product (page) as an integer
def get_product_display_refresh_rate_DatArt(soup_product_page):
    """Return the first number in the 'Obnovovací frekvence displeje' table row as an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    # Use ':-soup-contains' like the other extractors; the bare ':contains' form is deprecated
    soup_display_refresh_rate = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Obnovovací frekvence displeje") + td').get_text()
    display_refresh_rate = re.search(r'(\d+)', soup_display_refresh_rate).group(1)
    return int(display_refresh_rate)

# Extract the processor manufacturer of a given product (page) as a string
def get_product_processor_manufacturer_DatArt(soup_product_page):
    """Return the text of the 'Výrobce procesoru' table row as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        manufacturer_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Výrobce procesoru") + td')
        return str(manufacturer_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the processor model of a given product (page) as a string
def get_product_processor_model_DatArt(soup_product_page):
    """Return the text of the 'Model procesoru' table row as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        model_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Model procesoru") + td')
        return str(model_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Function to return the number of cores for a given product (page) as a string
def get_product_no_cores_DatArt(soup_product_page):
    """Return the translated value of the 'Počet jader' table row.

    The Czech labels 'osmijádrový', 'šestijádrový' and 'čtyřjádrový' are
    translated to 'Octa-core', 'Hexa-core' and 'Quad-Core'; every value that
    is neither one of these labels nor already one of the English labels
    becomes 'Other Core'.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    no_cores = str(soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Počet jader") + td').get_text())
    czech_to_english = {
        "osmijádrový": "Octa-core",
        "šestijádrový": "Hexa-core",
        "čtyřjádrový": "Quad-Core",
    }
    # Translate by assignment (the comparisons used before had no effect)
    no_cores = czech_to_english.get(no_cores, no_cores)
    if no_cores not in czech_to_english.values():
        no_cores = "Other Core"
    return no_cores

# Extract the processor frequency (in GHz) of a given product (page) as a float number
def get_product_processor_freq_DatArt(soup_product_page):
    """Return the parenthesised '(<number> GHz)' value of the 'Frekvence procesoru' row as a float.

    The decimal comma of the matched number is replaced by a dot before
    conversion.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        frequency_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Frekvence procesoru") + td')
        frequency_match = re.search(r'\(([\d,]+) GHz\)', frequency_cell.get_text())
        frequency_text = frequency_match.group(1)
        return float(frequency_text.replace(",", "."))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")


# Extract the SIM card type of a given product (page) as a string
def get_product_SIM_card_type_DatArt(soup_product_page):
    """Return the text of the 'Typ Sim karty' table row as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        sim_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Typ Sim karty") + td')
        return str(sim_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the degree of protection of a given product (page) as a string
def get_product_degree_of_protection_DatArt(soup_product_page):
    """Return the text of the 'Stupeň krytí' table row as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        protection_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Stupeň krytí") + td')
        return str(protection_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the operating system entry of a given product (page) as a string
def get_product_OS_DatArt(soup_product_page):
    """Return the text of the 'Nadstavba systému' table row as a string.

    NOTE(review): the label 'Nadstavba systému' presumably names the vendor's
    system overlay rather than the base operating system - confirm.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        system_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Nadstavba systému") + td')
        return str(system_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the internal memory (in GB) of a given product (page) as an integer
def get_product_int_memory_DatArt(soup_product_page):
    """Return the first number in the 'Interní paměť' table row as an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        memory_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Interní paměť") + td')
        first_number = re.search(r'(\d+)', memory_cell.get_text())
        return int(first_number.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the amount of RAM (in GB) of a given product (page) as an integer
def get_product_RAM_DatArt(soup_product_page):
    """Return the first number in the 'Velikost paměti RAM' table row as an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        ram_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Velikost paměti RAM") + td')
        first_number = re.search(r'(\d+)', ram_cell.get_text())
        return int(first_number.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the maximum memory card size of a given product (page) as an integer, or a text when no number is given
def get_product_max_memory_card_size_DatArt(soup_product_page):
    """Return the value of the 'Maximální velikost paměťové karty' table row.

    Returns:
        The first number in the cell as an integer (the unit is not checked;
        presumably TB - confirm). When the cell holds no number, the cell
        text is returned as a string, with 'nepodporuje paměťové karty'
        translated to "Doesn't support memory cards".

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    card_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Maximální velikost paměťové karty") + td')
    card_text = card_cell.get_text()
    first_number = re.search(r'(\d+)', card_text)
    if first_number is not None:
        return int(first_number.group(1))
    # No number in the cell: fall back to the (translated) text
    fallback_text = str(card_text)
    if fallback_text == "nepodporuje paměťové karty":
        return "Doesn't support memory cards"
    return fallback_text


# Extract the wireless technologies of a given product (page) as a list of strings
def get_product_wireless_tech_list_DatArt(soup_product_page):
    """Return the 'Bezdrátové technologie' table row split on ', ' into a list of strings.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        wireless_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Bezdrátové technologie") + td')
        wireless_text = str(wireless_cell.get_text())
        return wireless_text.split(', ')
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the number of rear camera lenses of a given product (page) as an integer
def get_product_no_rear_cam_lenses_DatArt(soup_product_page):
    """Return the 'Počet objektivů zadního fotoaparátu' table row converted to an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        lenses_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Počet objektivů zadního fotoaparátu") + td')
        lenses_text = lenses_cell.get_text()
        return int(lenses_text)
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the rear camera resolution (in Megapixels) of a given product (page) as a float number
def get_product_rear_cam_resolution_DatArt(soup_product_page):
    """Return the largest stand-alone whole number in the 'Rozlišení zadního fotoaparátu' row as a float.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        resolution_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Rozlišení zadního fotoaparátu") + td')
        resolution_text = str(resolution_cell.get_text())
        # One value per lens may be listed; the highest one is reported
        lens_values = [float(number_text) for number_text in re.findall(r'\b\d+\b', resolution_text)]
        return max(lens_values)
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the front camera resolution (in Megapixels) of a given product (page) as a float number
def get_product_front_cam_resolution_DatArt(soup_product_page):
    """Return the first number in the 'Rozlišení předního fotoaparátu' table row as a float.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        resolution_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Rozlišení předního fotoaparátu") + td')
        first_number = re.search(r'(\d+)', resolution_cell.get_text())
        return float(first_number.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Function to return the camera features for a given product (page) as a list of strings
def get_product_camera_features_DatArt(soup_product_page):
    """Return the translated entries of the 'Funkce fotoaparátu' table row.

    The cell text is split on ', ' and the known Czech feature labels are
    replaced by their English counterparts; unknown entries are kept as they
    are.

    Returns:
        A list of feature strings, or the string 'Other camera feature(s)'
        when none of the known English feature labels is in the list.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    raw_features = str(soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Funkce fotoaparátu") + td').get_text()).split(', ')

    czech_to_english = {
        "širokoúhlý objektiv": "wide angle lens",
        "noční režim": "night mode",
        "automatické ostření": "auto focus",
        "přisvětlovací dioda": "flash diode",
        "Bokeh efekt": "Bokeh effect",
        "optický zoom": "optical zoom",
        "macro režim": "macro mode",
        "teleobjektiv": "telephoto",
    }
    camera_feature_list = [czech_to_english.get(feature, feature) for feature in raw_features]

    # Fall back to a generic camera label (not the battery one) when no known feature is present
    if camera_feature_list and not any(known in camera_feature_list for known in czech_to_english.values()):
        camera_feature_list = "Other camera feature(s)"
    return camera_feature_list

# Extract the battery type of a given product (page) as a string
def get_product_battery_type_DatArt(soup_product_page):
    """Return the text of the 'Typ akumulátoru' table row as a string.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        battery_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Typ akumulátoru") + td')
        return str(battery_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")


# Extract the battery capacity (in mAh) of a given product (page) as an integer
def get_product_battery_capacity_DatArt(soup_product_page):
    """Return the first number in the 'Kapacita akumulátoru' table row as an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        capacity_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Kapacita akumulátoru") + td')
        first_number = re.search(r'(\d+)', capacity_cell.get_text())
        return int(first_number.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the battery features of a given product (page) as a list of strings
def get_product_battery_features_DatArt(soup_product_page):
    """Return the translated entries of the 'Vlastnosti baterie' table row.

    The cell text is split on ', '; for each known Czech feature label its
    first occurrence in the list is replaced by the English label.

    Returns:
        A list of feature strings, or the string 'Other battery feature(s)'
        when none of the known English feature labels is in the list.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    feature_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Vlastnosti baterie") + td')
    features = str(feature_cell.get_text()).split(', ')

    czech_to_english = (
        ("rychlé nabíjení", "fast charging"),
        ("bezdrátové nabíjení", "wireless charging"),
        ("vyjímatelná baterie", "removable battery"),
        ("reverzní bezdrátové nabíjení", "reverse wireless charging"),
    )
    # Translate the first occurrence of every known Czech label in place
    for czech_label, english_label in czech_to_english:
        if czech_label in features:
            features[features.index(czech_label)] = english_label

    known_labels = [english_label for _, english_label in czech_to_english]
    if features and not any(label in features for label in known_labels):
        return "Other battery feature(s)"
    return features


# Extract the charging power (in Watt) of a given product (page) as an integer
def get_product_charging_power_DatArt(soup_product_page):
    """Return the first number in the 'Výkon nabíjení' table row as an integer.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        power_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Výkon nabíjení") + td')
        power_text = str(power_cell.get_text())
        first_number = re.search(r'(\d+)', power_text)
        return int(first_number.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Extract the security options of a given product (page) as a list of strings
def get_product_security_DatArt(soup_product_page):
    """Return the translated entries of the 'Zabezpečení' table row.

    The cell text is split on ', '; for each known Czech security label its
    first occurrence in the list is replaced by the English label.

    Returns:
        A list of security option strings, or the string
        'Other security option(s)' when none of the known English labels is
        in the list.

    Raises:
        TypeError: if soup_product_page is not a BeautifulSoup object.
    """
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    security_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Zabezpečení") + td')
    options = str(security_cell.get_text()).split(', ')

    czech_to_english = (
        ("čtečka otisku prstů na těle", "body fingerprint reader"),
        ("čtečka otisku prstů v displeji", "in-display fingerprint reader"),
        ("odemykání obličejem", "face unlock"),
    )
    # Translate the first occurrence of every known Czech label in place
    for czech_label, english_label in czech_to_english:
        if czech_label in options:
            options[options.index(czech_label)] = english_label

    known_labels = [english_label for _, english_label in czech_to_english]
    if options and not any(label in options for label in known_labels):
        return "Other security option(s)"
    return options

# Function to return the connector for a given product (page) as a string
def get_product_connector_DatArt(soup_product_page):
    """Read the connector type from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        str: the text of the "Konektor" table cell, exactly as scraped.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        connector_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Konektor") + td')
        return str(connector_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Function to return the warranty (in months) for a given product (page) as an integer
def get_product_warranty_DatArt(soup_product_page):
    """Read the warranty period from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        int: the first run of digits found in the "Záruka" table cell.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing or its cell holds no digits.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        warranty_cell = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Záruka") + td')
        warranty_text = warranty_cell.get_text()
        digits = re.search(r'(\d+)', warranty_text)
        return int(digits.group(1))
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Czech colour names as listed on the product pages, mapped to the English category names
_COLOUR_TRANSLATIONS_DATART = {
    "modrá": "blue",
    "zelená": "green",
    "černá": "black",
    "fialová": "purple",
    "šedá": "grey",
    "béžová": "beige",
    "stříbrná": "silver",
    "zlatá": "gold",
    "krémová": "cream",
    "žlutá": "yellow",
    "bílá": "white",
    "růžová": "pink",
}


# Function to return the colour for a given product (page) as a string
def get_product_colour_DatArt(soup_product_page):
    """Read the phone colour of a product page and translate it to English.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        str: one of the English colour names in ``_COLOUR_TRANSLATIONS_DATART``
        (a value that is already one of these English names is passed through),
        or "Other colour" for anything else.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the "Barva telefonu" table row is missing.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

    # Strip surrounding whitespace so that padded cells still match a table key;
    # a key written with a trailing space ("stříbrná ") previously made silver unreachable.
    colour = str(soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Barva telefonu") + td').get_text()).strip()
    colour = _COLOUR_TRANSLATIONS_DATART.get(colour, colour)

    if colour not in _COLOUR_TRANSLATIONS_DATART.values():
        colour = "Other colour"
    return colour
    

# Function to return the brand for a given product (page) as a string
def get_product_brand_DatArt(soup_product_page):
    """Read the brand from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        str: the text of the "Značky" table cell, exactly as scraped.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing.
    """
    if isinstance(soup_product_page, BeautifulSoup):
        brand_selector = '.table-borderless tbody th:-soup-contains("Značky") + td'
        brand_cell = soup_product_page.select_one(brand_selector)
        return str(brand_cell.get_text())
    raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

# Function to return the width (in cm) for a given product (page) as a float number
def get_product_width_DatArt(soup_product_page):
    """Read the product width from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        float: the first number found in the "Šířka výrobku" table cell.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing or its cell holds no number.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    soup_product_width = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Šířka výrobku") + td').get_text()
    # Accept "7.06", "7,06" and plain "7": a pattern that insists on a dot
    # would find no match for whole numbers or comma decimals.
    number = re.search(r'(\d+(?:[.,]\d+)?)', soup_product_width).group(1)
    return float(number.replace(',', '.'))


# Function to return the height (in cm) for a given product (page) as a float number
def get_product_height_DatArt(soup_product_page):
    """Read the product height from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        float: the first number found in the "Výška výrobku" table cell.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing or its cell holds no number.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    soup_product_height = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Výška výrobku") + td').get_text()
    # Accept "14.63", "14,63" and plain "15": a pattern that insists on a dot
    # would find no match for whole numbers or comma decimals.
    number = re.search(r'(\d+(?:[.,]\d+)?)', soup_product_height).group(1)
    return float(number.replace(',', '.'))

# Function to return the depth (in cm) for a given product (page) as a float number
def get_product_depth_DatArt(soup_product_page):
    """Read the product depth from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        float: the first number found in the "Hloubka výrobku" table cell.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing or its cell holds no number.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    soup_product_depth = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Hloubka výrobku") + td').get_text()
    # Accept "0.76", "0,76" and plain "1": a pattern that insists on a dot
    # would find no match for whole numbers or comma decimals.
    number = re.search(r'(\d+(?:[.,]\d+)?)', soup_product_depth).group(1)
    return float(number.replace(',', '.'))

# Function to return the volume (in cubic cm) for a given product (page) as a float number
def get_product_volume_DatArt(soup_product_page):
    """Compute width * depth * height from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        float: product of the first numbers found in the "Šířka výrobku",
        "Hloubka výrobku" and "Výška výrobku" table cells.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if one of the rows is missing or its cell holds no number.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")

    def read_dimension(row_label):
        # Return the first number of the table cell that follows the header `row_label`.
        cell_text = soup_product_page.select_one(f'.table-borderless tbody th:-soup-contains("{row_label}") + td').get_text()
        # Accept "7.06", "7,06" and plain "7": a pattern that insists on a dot
        # would find no match for whole numbers or comma decimals.
        number = re.search(r'(\d+(?:[.,]\d+)?)', cell_text).group(1)
        return float(number.replace(',', '.'))

    product_width = read_dimension("Šířka výrobku")
    product_depth = read_dimension("Hloubka výrobku")
    product_height = read_dimension("Výška výrobku")
    return product_width*product_depth*product_height

# Function to return the weight (in kg) for a given product (page) as a float number
def get_product_weight_DatArt(soup_product_page):
    """Read the product weight from the specification table of a product page.

    Args:
        soup_product_page: BeautifulSoup object of a single product page.

    Returns:
        float: the first number found in the "Hmotnost výrobku" table cell.

    Raises:
        TypeError: if the argument is not a BeautifulSoup object.
        AttributeError: if the table row is missing or its cell holds no number.
    """
    # Check type of argument
    if not isinstance(soup_product_page, BeautifulSoup):
        raise TypeError(f"Input must be a BeautifulSoup object. Your input has the type: {type(soup_product_page)}.")
    soup_product_weight = soup_product_page.select_one('.table-borderless tbody th:-soup-contains("Hmotnost výrobku") + td').get_text()
    # Accept "0.168", "0,168" and plain "1": a pattern that insists on a dot
    # would find no match for whole numbers or comma decimals.
    number = re.search(r'(\d+(?:[.,]\d+)?)', soup_product_weight).group(1)
    return float(number.replace(',', '.'))

In [54]:
# Structure
    # 1) Write a function for getting a certain piece of product information, e.g. get_title_DatArt(). Done!
    # 2) Use the function and write the result into a dictionary under a clear key name. This key name has to be the same for the other two data sources. Done!
    # 3) Add a categorical variable for the online retailer. Done!
    # 4) Add conditions to change Czech into English category names and cores into a number: e.g. octa-core means 8 cores
    # 5) Add tests to check whether the function outputs have the right data type
    # 6) Optional: Age of the product?
# Dictionary for phone characteristics of a single test product.
# `soup` is not defined in this cell; it must already hold the parsed product page.
product_entry_DatArt = {} # placeholder for product characteristics
product_entry_DatArt["ID"] = 1
product_entry_DatArt["Online-Retailer"] = "DatArt"
product_entry_DatArt['title'] = get_product_title_DatArt(soup)
product_entry_DatArt['price'] = get_product_price_DatArt(soup)
product_entry_DatArt['rating'] = get_product_rating_DatArt(soup)
product_entry_DatArt['number of ratings'] = get_product_no_ratings_DatArt(soup)
product_entry_DatArt['display size'] = get_product_display_size_DatArt(soup)
product_entry_DatArt['resolution width'] = get_product_resolution_w_DatArt(soup)
product_entry_DatArt['resolution height'] = get_product_resolution_h_DatArt(soup)
product_entry_DatArt['resolution total'] = get_product_resolution_tot_DatArt(soup)
product_entry_DatArt['cutout shape'] = get_product_cutout_shape_DatArt(soup)
# FIXME: this is the same call as for 'resolution height' above, so the key stores the
# resolution height (the cell output shows 1080), not a refresh rate. Replace it with the
# refresh-rate getter.
product_entry_DatArt['display refresh rate'] = get_product_resolution_h_DatArt(soup)
product_entry_DatArt['processor manufacturer'] = get_product_processor_manufacturer_DatArt(soup)
product_entry_DatArt['processor model'] = get_product_processor_model_DatArt(soup)
product_entry_DatArt['number of cores'] = get_product_no_cores_DatArt(soup)
product_entry_DatArt['processor frequency'] = get_product_processor_freq_DatArt(soup)
product_entry_DatArt['SIM card type'] = get_product_SIM_card_type_DatArt(soup)
product_entry_DatArt['degree of protection'] = get_product_degree_of_protection_DatArt(soup)
product_entry_DatArt['operating system'] = get_product_OS_DatArt(soup)
product_entry_DatArt['internal memory'] = get_product_int_memory_DatArt(soup)
product_entry_DatArt['RAM'] = get_product_RAM_DatArt(soup)
product_entry_DatArt['maximum memory card size'] = get_product_max_memory_card_size_DatArt(soup)

# List-valued characteristics are stored as one indicator key (value 1) per element
for obj in get_product_wireless_tech_list_DatArt(soup):
    product_entry_DatArt[obj] = 1


product_entry_DatArt['number of rear camera lenses'] = get_product_no_rear_cam_lenses_DatArt(soup)
product_entry_DatArt['rear cam resolution'] = get_product_rear_cam_resolution_DatArt(soup)
product_entry_DatArt['front cam resolution'] = get_product_front_cam_resolution_DatArt(soup)

for obj in get_product_camera_features_DatArt(soup):
    product_entry_DatArt[obj] = 1

product_entry_DatArt['battery type'] = get_product_battery_type_DatArt(soup)
product_entry_DatArt['battery capacity'] = get_product_battery_capacity_DatArt(soup)

# NOTE(review): get_product_battery_features_DatArt returns the plain string
# "Other battery feature(s)" when no known feature is found; iterating that string
# here would create one key per character.
for obj in get_product_battery_features_DatArt(soup):
    product_entry_DatArt[obj] = 1

product_entry_DatArt['charging power'] = get_product_charging_power_DatArt(soup)

for obj in get_product_security_DatArt(soup):
    product_entry_DatArt[obj] = 1

product_entry_DatArt['connector'] = get_product_connector_DatArt(soup)
product_entry_DatArt['warranty'] = get_product_warranty_DatArt(soup)
product_entry_DatArt['colour'] = get_product_colour_DatArt(soup)
product_entry_DatArt['brand'] = get_product_brand_DatArt(soup)
product_entry_DatArt['width'] = get_product_width_DatArt(soup)
product_entry_DatArt['height'] = get_product_height_DatArt(soup)
product_entry_DatArt['depth'] = get_product_depth_DatArt(soup)
product_entry_DatArt['volume'] = get_product_volume_DatArt(soup)
product_entry_DatArt['weight'] = get_product_weight_DatArt(soup)

In [55]:
# Display the characteristics collected for the single test product
product_entry_DatArt

{'ID': 1,
 'Online-Retailer': 'DatArt',
 'title': 'Mobilní telefon Samsung Galaxy S23 5G 8 GB / 128 GB (SM-S911BZKDEUE) černý',
 'price': 19490,
 'rating': 4.9,
 'number of ratings': 13,
 'display size': 6.1,
 'resolution width': 2400,
 'resolution height': 1080,
 'resolution total': 2592000,
 'cutout shape': 'bullet hole',
 'display refresh rate': 1080,
 'processor manufacturer': 'Qualcomm',
 'processor model': 'Snapdragon 8 Gen 2',
 'number of cores': 'Octa-core',
 'processor frequency': 3.36,
 'SIM card type': 'nano SIM  + eSIM',
 'degree of protection': 'IP68',
 'operating system': 'Samsung One UI',
 'internal memory': 128,
 'RAM': 8,
 'maximum memory card size': "Doesn't support memory cards",
 '4G/LTE': 1,
 '5G': 1,
 'Beidou': 1,
 'Bluetooth': 1,
 'Galileo': 1,
 'GLONASS': 1,
 'GPS': 1,
 'NFC': 1,
 'Wi-Fi': 1,
 'number of rear camera lenses': 3,
 'rear cam resolution': 50.0,
 'front cam resolution': 12.0,
 'wide angle lens': 1,
 'night mode': 1,
 'auto focus': 1,
 'optical zoom':

In [None]:
# Scrape every DatArt product page and collect one dictionary of characteristics per product.
# `products` must be a list: the loop ends with products.append(...), which a dict does not support.
products = []
for product_number, product_url in enumerate(product_urls_DatArt, start = 1):

    page = requests.get(product_url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Placeholder for product characteristics
    # NOTE(review): compared with the single-product cell above, this loop does not fill
    # 'degree of protection', the camera features, 'battery type' or the security options;
    # confirm whether that is intentional.
    product_entry_DatArt = {} 
    product_entry_DatArt["ID"] = product_number
    product_entry_DatArt["Online-Retailer"] = "DatArt"
    product_entry_DatArt['title'] = get_product_title_DatArt(soup)
    product_entry_DatArt['price'] = get_product_price_DatArt(soup)
    product_entry_DatArt['rating'] = get_product_rating_DatArt(soup)
    product_entry_DatArt['number of ratings'] = get_product_no_ratings_DatArt(soup)
    product_entry_DatArt['display size'] = get_product_display_size_DatArt(soup)
    product_entry_DatArt['resolution width'] = get_product_resolution_w_DatArt(soup)
    product_entry_DatArt['resolution height'] = get_product_resolution_h_DatArt(soup)
    product_entry_DatArt['resolution total'] = get_product_resolution_tot_DatArt(soup)
    product_entry_DatArt['cutout shape'] = get_product_cutout_shape_DatArt(soup)
    # FIXME: this is the same call as for 'resolution height' above, so the key stores the
    # resolution height, not a refresh rate. Replace it with the refresh-rate getter.
    product_entry_DatArt['display refresh rate'] = get_product_resolution_h_DatArt(soup)
    product_entry_DatArt['processor manufacturer'] = get_product_processor_manufacturer_DatArt(soup)
    product_entry_DatArt['processor model'] = get_product_processor_model_DatArt(soup)
    product_entry_DatArt['number of cores'] = get_product_no_cores_DatArt(soup)
    product_entry_DatArt['processor frequency'] = get_product_processor_freq_DatArt(soup)
    product_entry_DatArt['SIM card type'] = get_product_SIM_card_type_DatArt(soup)
    product_entry_DatArt['operating system'] = get_product_OS_DatArt(soup)
    product_entry_DatArt['internal memory'] = get_product_int_memory_DatArt(soup)
    product_entry_DatArt['RAM'] = get_product_RAM_DatArt(soup)
    product_entry_DatArt['maximum memory card size'] = get_product_max_memory_card_size_DatArt(soup)

    # List-valued characteristics are stored as one indicator key (value 1) per element
    for obj in get_product_wireless_tech_list_DatArt(soup):
        product_entry_DatArt[obj] = 1

    product_entry_DatArt['number of rear camera lenses'] = get_product_no_rear_cam_lenses_DatArt(soup)
    product_entry_DatArt['rear cam resolution'] = get_product_rear_cam_resolution_DatArt(soup)
    product_entry_DatArt['front cam resolution'] = get_product_front_cam_resolution_DatArt(soup)
    product_entry_DatArt['battery capacity'] = get_product_battery_capacity_DatArt(soup)

    for obj in get_product_battery_features_DatArt(soup):
        product_entry_DatArt[obj] = 1

    product_entry_DatArt['charging power'] = get_product_charging_power_DatArt(soup)
    product_entry_DatArt['connector'] = get_product_connector_DatArt(soup)
    product_entry_DatArt['warranty'] = get_product_warranty_DatArt(soup)
    product_entry_DatArt['colour'] = get_product_colour_DatArt(soup)
    product_entry_DatArt['brand'] = get_product_brand_DatArt(soup)
    product_entry_DatArt['width'] = get_product_width_DatArt(soup)
    product_entry_DatArt['height'] = get_product_height_DatArt(soup)
    product_entry_DatArt['depth'] = get_product_depth_DatArt(soup)
    product_entry_DatArt['volume'] = get_product_volume_DatArt(soup)
    product_entry_DatArt['weight'] = get_product_weight_DatArt(soup)

    products.append(product_entry_DatArt)
    

In [None]:
# Convert the scraped products to JSON and save them as a JSON file
import json

with open('product_info.json', 'w') as file:
    # Dump `products`, the collection built by the scraping loop above
    # (the name `item` used here before is not defined at notebook level).
    json.dump(products, file)

#### Electroworld 

In [61]:
# Repeating the procedure for Electroworld: download one product page and parse its HTML
product_url2 = 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-natural-titanium-prirodni-titan'
# Alternative test URL (a DatArt bazaar listing), currently disabled:
#product_url = 'https://www.datart.cz/bazar/mobilni-telefon-motorola-edge-40-5g-8-gb-256-gb-eclipse-black-pay40006pl-rozbaleno-24-mesicu-zaruka.html'
page2 = requests.get(product_url2)
soup2 = BeautifulSoup(page2.text, 'html.parser')

In [62]:
# Display the parsed Electroworld page (this prints the full HTML document)
soup2

<!DOCTYPE html>

<html data-n-head="%7B%22lang%22:%7B%22ssr%22:%22cs%22%7D%7D" data-n-head-ssr="" lang="cs">
<head>
<title>Apple iPhone 15 Pro Max 512GB Natural Titanium chytrý telefon přírodní titan | Electroworld.cz</title><meta charset="utf-8" data-n-head="ssr"/><meta content="width=device-width, initial-scale=1" data-n-head="ssr" name="viewport"/><meta content="telephone=no" data-n-head="ssr" name="format-detection"/><meta content='Chytrý telefon Apple iPhone 15 Pro Max se vyznačuje konstrukcí z leteckého titanu v přírodním provedení, pohlcujícím 6,7" Super Retina XDR OLED displejem s ProMotion, 120Hz obnovovací frekvencí a pevnou ochranou Ceramic Shield, ultrarychlým čipem A17 Pro, trojitým zadním fotoaparátem s rozlišením 48 Mpx a USB-C portem.' data-n-head="ssr" name="description"/><meta content='Chytrý telefon Apple iPhone 15 Pro Max se vyznačuje konstrukcí z leteckého titanu v přírodním provedení, pohlcujícím 6,7" Super Retina XDR OLED displejem s ProMotion, 120Hz obnovovací f

#### 3. Ideas for Analysis

Steps:
0) Which data can we legally use without problems? eBay, DatArt, etc.

1) Creating a function to save all necessary data for one product simultaneously.
2) Create a function to let the function from 1) run over all product URLs.
2) (How to save the data of the product description -> we need to look at different products and see what variables we can define.)
3) How to work with github, commits, pull etc.?
4) How to create a nice table in github to document aims and checkpoints.
5) How to structure the repository -> documents for functions, loading packages, ReadMe etc.
6) Running the scripts again at Christmas to see whether there is a price drop because of special offers
7) Starting data analysis -> simple averages, counts etc. -> can we create a dashboard on github? (Share or products etc.)
8) Further Statistical Analysis - Ideas:
    1) Predicting prices for new products (price modelling) and finding out which variables (factors) drive prices the most -> Does it make sense to build a price model, given that prices consist largely of the purchase price? Could we find those purchase prices somewhere and join them with our data?
    2) Brand Analysis: Explore the average prices of smartphones for each brand. Analyze market share based on the number of products or total sales for each brand.
    3) Storage Impact: Investigate how storage capacity (GB) affects smartphone prices. Explore the most popular storage capacity among consumers.
    4) Segment the market based on different characteristics (e.g., high-end, mid-range, budget). One could then do 1) on all different segments to explore differences.
    5) Consumer Preferences: Survey or analyze customer reviews to understand what features users value the most. Identify common positive and negative sentiments related to smartphone characteristics.
    6) Profitability Analysis (given that we know production or purchase prices): Estimate profit margins for different smartphones based on their production costs and prices.
Identify the most profitable products in your dataset. One could use the number of ratings as a proxy for the number of sales.


8) Which data tests and checks can we include?

How we could build a price model and find out which variables drive smartphone prices the most:

Feature Selection:

1) Use techniques like correlation analysis, mutual information, or feature importance from tree-based models to identify the most relevant features.
Consider removing highly correlated features to avoid multicollinearity.
Model Selection:

2) Choose a regression model that suits your data and problem. Common choices include linear regression, decision tree regression, random forest regression, or gradient boosting algorithms.
Model Training:

3) Split your dataset into training and testing sets.
Train your chosen model on the training data.
Model Evaluation:

4) Evaluate the model's performance on the testing set using appropriate metrics (e.g., mean squared error, R-squared, etc.).
Analyze the residuals to check for any patterns or systematic errors.

5) Iterate and Refine hyperparameters etc.