We used Python version 3.11.5.

# 0. Loading Packages

In [113]:
!python -m venv ENV # creating virtual environment

In [208]:
import requests, re
from bs4 import BeautifulSoup

# 1. Scraping URLs of Phone Products

#### DatArt

In [119]:
# 1 Base URL of the shop; the relative product links found on the category page are appended to it
DatArt_basic_url = "https://www.datart.cz"

# 2 Category page with all products listed at once (the `limit` query parameter was raised manually in the browser)
# NOTE: this name is spelled "DataArt" while the other names in this cell use "DatArt";
# it is referenced by the scraper call further down, so the spelling is left unchanged here.
DataArt_category_url = "https://www.datart.cz/mobilni-telefony.html?limit=1000" # (17.01.2024: there are 946 phones listed)

# 3 Defining a function to scrape all product URLs from the category website
def get_product_urls_DatArt(basic_url: str, category_url: str):
    """Collect the URLs of all products listed on a DatArt category page.

    Args:
        basic_url: Base URL that is prepended to every relative product link.
        category_url: URL of the category page that lists the products.

    Returns:
        A list with one absolute product URL per ``.item-title`` element, in
        page order. An empty list is returned when the page cannot be fetched
        (any status code other than 200).

    Raises:
        TypeError: If ``basic_url`` or ``category_url`` is not a string.
    """
    # Check data types of arguments
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")

    if not isinstance(category_url, str):
        raise TypeError("Category URL must be a string.")

    response = requests.get(category_url)

    # Anything but 200 means there is no page to parse. Return early so the
    # summary below never touches an unassigned `product_urls`.
    if response.status_code != 200:
        print(f"Failed fetching. Response error - {response.status_code}: {response.reason}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each '.item-title' element wraps the anchor holding the relative product link
    product_urls = [basic_url + item.a.get('href') for item in soup.select('.item-title')]

    print(f"{len(product_urls)} products found.")
    return product_urls

# Scrape the DatArt category page and display the collected product URLs
product_urls_DatArt = get_product_urls_DatArt(DatArt_basic_url, DataArt_category_url)
product_urls_DatArt

946 products found.


['https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-natural-titanium-mtux3sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-black-titanium-mtuv3sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-12-64-gb-blue-mgj83cn-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-black-titanium-mtv13sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-pink-mtp13sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-black-mtp03sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-128gb-blue-mtp43sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-128gb-blue-titanium-mtv03sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-natural-titanium-mtv53sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-max-256gb-black-titanium-mu773sx-a.html',
 'https://www.datart.cz/mobilni-telefon-apple-iphone-15-pro-256gb-wh

#### CZC

In [118]:
# 1 Base URL of the shop; relative product links are appended to it
CZC_basic_url = "https://www.czc.cz"

# 2 Category listing (unlike DatArt, no manual limit for the number of products shown can be set)
CZC_category_url = "https://www.czc.cz/mobilni-telefony/produkty"

# 3 + 4 Every page of the category listing has to be scraped individually.
# Each page shows 27 products, so the `q-first` offsets advance in steps of 27.
# Currently (17.01.2024) there are 818 products; offsets are generated up to 1000.
page_limits = [*range(27, 1000, 27)]

# The plain category URL is the first page; the offset URLs follow it
CZC_category_urls = [CZC_category_url] + [
    f"{CZC_category_url}?q-first={page_limit}" for page_limit in page_limits
]

# 5 Defining a function to scrape all product URLs from each page of the category website
def get_product_urls_CZC(basic_url: str, category_urls: list):
    """Collect the product URLs from every page of the CZC category listing.

    Args:
        basic_url: Base URL that is prepended to every relative product link.
        category_urls: URLs of the individual category pages, in page order.

    Returns:
        A flat list of absolute product URLs gathered from all pages that
        could be fetched. Pages answering with a status code other than 200
        are reported and skipped.

    Raises:
        TypeError: If ``basic_url`` is not a string or ``category_urls`` is
            not a list.
    """
    # Check data types of arguments
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")

    if not isinstance(category_urls, list):
        raise TypeError("Category URLs must be a list.")

    product_urls = []

    # Scrape product URLs across all product pages
    for page_num, url in enumerate(category_urls, start=1):

        response = requests.get(url)

        # Anything but 200 means there is no page to parse; report and move on
        if response.status_code != 200:
            # f-string so that the page number and the error details are actually filled in
            print(f"Failed fetching page {page_num}. Response error - {response.status_code}: {response.reason}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each '.tile-title' element wraps the anchor holding the relative product link
        product_urls.extend(basic_url + item.a.get('href') for item in soup.select('.tile-title'))
        print(f"Page {page_num} complete")

    print(f"{len(product_urls)} products found.")
    return product_urls


# Scrape all CZC category pages and display the collected product URLs
product_urls_CZC = get_product_urls_CZC(CZC_basic_url, CZC_category_urls)
product_urls_CZC

Page 1 complete
Page 2 complete
Page 3 complete
Page 4 complete
Page 5 complete
Page 6 complete
Page 7 complete
Page 8 complete
Page 9 complete
Page 10 complete
Page 11 complete
Page 12 complete
Page 13 complete
Page 14 complete
Page 15 complete
Page 16 complete
Page 17 complete
Page 18 complete
Page 19 complete
Page 20 complete
Page 21 complete
Page 22 complete
Page 23 complete
Page 24 complete
Page 25 complete
Page 26 complete
Page 27 complete
Page 28 complete
Page 29 complete
Page 30 complete
Page 31 complete
Page 32 complete
Page 33 complete
Page 34 complete
Page 35 complete
Page 36 complete
Page 37 complete
Page 38 complete
818 products found.


['https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-lime/369599/produkt',
 'https://www.czc.cz/apple-iphone-15-128gb-black/383646/produkt',
 'https://www.czc.cz/apple-iphone-15-plus-128gb-black/383647/produkt',
 'https://www.czc.cz/apple-iphone-15-pro-128gb-black-titanium/383648/produkt',
 'https://www.czc.cz/apple-iphone-15-pro-max-256gb-black-titanium/383649/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-white/369606/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-graphite/369605/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesone-violet/369604/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-256gb-awesome-lime/369603/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-white/369602/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesome-graphite/369601/produkt',
 'https://www.czc.cz/samsung-galaxy-a54-5g-8gb-128gb-awesone-violet/369600/produkt',
 'https://www.czc.cz/go

#### Electroworld

In [121]:
# 1 Base URL of the shop; relative product links are appended to it
electroworld_basic_url = "https://www.electroworld.cz"

# 2 Category listing (no manual limit for the number of products shown can be set).
# Sorting by most expensive gives unique products for each page.
electroworld_category_url = "https://www.electroworld.cz/chytre-mobily/sort-by_mostExpensive"

# 3 + 4 Every page of the category listing has to be scraped individually.
# Currently (17.01.2024) there are 42 pages with 740 products; page 1 is the plain category URL.
pages = [*range(2, 43)]

# The plain category URL comes first, followed by one URL per further page
electroworld_category_urls = [electroworld_category_url] + [
    f"{electroworld_category_url}?page={page_no}" for page_no in pages
]

# 5 Defining a function to scrape all product URLs from each page of the category website
def get_product_urls_electroworld(basic_url, category_urls):
    """Collect the product URLs from every page of the Electroworld category listing.

    Args:
        basic_url: Base URL (string) prepended to every relative product link.
        category_urls: List with the URLs of the individual category pages.

    Returns:
        A flat list of absolute product URLs in page order. As soon as one
        page answers with a status code other than 200, the failure is
        printed and an empty list is returned.

    Raises:
        TypeError: If ``basic_url`` is not a string or ``category_urls`` is
            not a list.
    """
    # Validate the argument types up front
    if not isinstance(basic_url, str):
        raise TypeError("Basic URL must be a string.")
    if not isinstance(category_urls, list):
        raise TypeError("Category URLs must be a list.")

    collected_urls = []

    # Visit the category pages one by one
    for page_num, url in enumerate(category_urls, start=1):
        response = requests.get(url)

        # A single failed page aborts the whole run with an empty result
        if response.status_code != 200:
            print(f"Failed fetching page {page_num}. Response error - {response.status_code}: {response.reason}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # The '.product-box__link' elements carry the relative product link themselves
        page_urls = [basic_url + link.get('href') for link in soup.select('.product-box__link')]
        collected_urls += page_urls
        print(f"Page {page_num} complete")

    print(f"{len(collected_urls)} products found.")
    return collected_urls


# Scrape all Electroworld category pages and display the collected product URLs
product_urls_electroworld = get_product_urls_electroworld(electroworld_basic_url, electroworld_category_urls) 
product_urls_electroworld

Page 1 complete
Page 2 complete
Page 3 complete
Page 4 complete
Page 5 complete
Page 6 complete
Page 7 complete
Page 8 complete
Page 9 complete
Page 10 complete
Page 11 complete
Page 12 complete
Page 13 complete
Page 14 complete
Page 15 complete
Page 16 complete
Page 17 complete
Page 18 complete
Page 19 complete
Page 20 complete
Page 21 complete
Page 22 complete
Page 23 complete
Page 24 complete
Page 25 complete
Page 26 complete
Page 27 complete
Page 28 complete
Page 29 complete
Page 30 complete
Page 31 complete
Page 32 complete
Page 33 complete
Page 34 complete
Page 35 complete
Page 36 complete
Page 37 complete
Page 38 complete
Page 39 complete
Page 40 complete
Page 41 complete
Page 42 complete
739 products found.


['https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-natural-titanium-prirodni-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-white-titanium-bily-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-1-tb-blue-titanium-modry-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-natural-titanium-prirodni-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-blue-titanium-modry-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-1-tb-white-titanium-bily-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-black-titanium-cerny-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-blue-titanium-modry-titan',
 'https://www.electroworld.cz/apple-iphone-15-pro-max-512-gb-white-titanium-bily-titan',
 'https://www.electroworld.cz/apple-ip

Improvements:
- For the CZC code: handle the pages automatically instead of manually setting the product number limit of 1000 above. The page size of 27 can stay if necessary (or try out the numbers 1 to 50 until data is obtained).

# 2. Scraping Product Information 

#### DatArt

In [331]:
# Product page used to prototype the extraction of the product characteristics
product_url = 'https://www.datart.cz/mobilni-telefon-samsung-galaxy-a33-5g-6-gb-128-gb-sm-a336bzkgeee-cerny.html'
# Alternative product page (listing from the "bazar" section) to try the same code on:
#product_url = 'https://www.datart.cz/bazar/mobilni-telefon-motorola-edge-40-5g-8-gb-256-gb-eclipse-black-pay40006pl-rozbaleno-24-mesicu-zaruka.html'
page = requests.get(product_url)
soup = BeautifulSoup(page.text, 'html.parser')

# Dictionary that will hold the phone characteristics (filled later)
item = dict()

# Title
soup_title = soup.select('.product-detail-title')[0].text

# Price: remove line breaks, tabs, non-breaking spaces and the currency suffix, then convert
soup_price = soup.select('.actual')[0].text
for unwanted in ('\n', '\t', '\xa0', 'Kč'):
    soup_price = soup_price.replace(unwanted, '')
soup_price = int(soup_price.strip())

# Rating and number of ratings: after cleaning, the text is matched against "<rating>(<count>)"
soup_ratings = soup.select('.rating-wrap')[0].text
soup_ratings = soup_ratings.replace('\n', '').replace('\t', '').strip().replace(" ", "")
soup_ratings = re.match(r'(\d+\.\d+)\((\d+)\)', soup_ratings)

# Both values stay None when the rating text does not have the expected shape
soup_rating, soup_no_rating = None, None
if soup_ratings is not None:
    soup_rating = float(soup_ratings.group(1))
    soup_no_rating = int(soup_ratings.group(2))




In [396]:
def _spec_text(label, strip=False):
    """Return the text of the specification-table cell belonging to ``label``.

    The value is read from the ``<td>`` that directly follows the ``<th>``
    containing ``label`` inside the ``.table-borderless`` table of the
    module-level ``soup``.

    Args:
        label: Text of the table header to look for (substring match).
        strip: Passed on to ``get_text``; strips surrounding whitespace when True.

    Returns:
        The text content of the matching table cell.

    Raises:
        LookupError: If the page has no specification row for ``label``.
    """
    # ':-soup-contains' is the current name of the deprecated ':contains' pseudo-class
    cell = soup.select_one(f'.table-borderless tbody th:-soup-contains("{label}") + td')
    if cell is None:
        raise LookupError(f'Specification "{label}" not found on the product page.')
    return cell.get_text(strip=strip)


# Display size (in inches); the decimal comma is turned into a decimal point
soup_display_size = float(_spec_text("Úhlopříčka displeje", strip=True).replace(',', '.'))

# Display resolution (in pixels)
display_resolution = _spec_text("Rozlišení displeje")
soup_display_resolution = re.search(r'(\d+) × (\d+)', display_resolution)
soup_display_resolution_w = int(soup_display_resolution.group(1))
soup_display_resolution_h = int(soup_display_resolution.group(2))
soup_display_resolution_tot = soup_display_resolution_w * soup_display_resolution_h

# Display refresh rate (in hertz); converted to int like the other numeric values
display_refresh_rate = _spec_text("Obnovovací frekvence displeje")
soup_display_refresh_rate = int(re.search(r'(\d+)', display_refresh_rate).group(1))

# Processor manufacturer
soup_processor_manufacturer = _spec_text("Výrobce procesoru")

# Processor model
soup_processor_model = _spec_text("Model procesoru")

# Number of cores
# TODO: still the raw cell text; an if-else could be used here to turn it into a number
number_of_cores = _spec_text("Počet jader")

# Processor frequency (in GHz); the value is taken from the "(x,y GHz)" part of the text
processor_frequency = _spec_text("Frekvence procesoru")
soup_processor_frequency = float(re.search(r'\(([\d,]+) GHz\)', processor_frequency).group(1).replace(",", "."))

# SIM card type
soup_sim_card_type = _spec_text("Typ Sim karty")

# Operating system
soup_operating_system = _spec_text("Nadstavba systému")

# Internal memory (in GB)
internal_memory = _spec_text("Interní paměť")
soup_internal_memory = int(re.search(r'(\d+)', internal_memory).group(1))

# Amount of RAM (in GB)
ram_storage = _spec_text("Velikost paměti RAM")
soup_ram = int(re.search(r'(\d+)', ram_storage).group(1))

# Maximum memory card size (in TB)
max_memory_card_size = _spec_text("Maximální velikost paměťové karty")
soup_max_mem_card_size = int(re.search(r'(\d+)', max_memory_card_size).group(1))

# Wireless technologies: every listed technology becomes a key with the value 1
wireless_object_list = _spec_text("Bezdrátové technologie").split(', ')
soup_dict = dict.fromkeys(wireless_object_list, 1)

# Number of rear camera lenses
soup_num_rear_cam_lenses = int(_spec_text("Počet objektivů zadního fotoaparátu"))

# Rear camera resolution (in Mpx): the largest whole number found in the text
rear_cam_solution = _spec_text("Rozlišení zadního fotoaparátu")
soup_main_cam_resolution = max(float(match.group()) for match in re.finditer(r'\b\d+\b', rear_cam_solution))

# Front camera resolution (in Mpx)
# NOTE(review): unlike the other parsed values this one has no `soup_` prefix; name kept as is.
front_cam_resolution = _spec_text("Rozlišení předního fotoaparátu")
front_cam_resolution = float(re.search(r'(\d+)', front_cam_resolution).group(1))

# Battery capacity (in mAh)
battery_capacity = _spec_text("Kapacita akumulátoru")
soup_battery_capacity = int(re.search(r'(\d+)', battery_capacity).group(1))

# Battery features: added to the same flag dictionary as the wireless technologies
battery_features_list = _spec_text("Vlastnosti baterie").split(', ')
soup_dict.update(dict.fromkeys(battery_features_list, 1))
# soup_dict['rychlé nabíjení']

# Charging power (in Watt)
charging_power = _spec_text("Výkon nabíjení")
soup_charging_power = int(re.search(r'(\d+)', charging_power).group(1))

# Connector
connector = _spec_text("Konektor")

# Warranty (in months)
warranty = _spec_text("Záruka")
soup_warranty = int(re.search(r'(\d+)', warranty).group(1))

# Colour
phone_colour = _spec_text("Barva telefonu")

# Brand
brand = _spec_text("Značky")

# Product width (in cm)
product_width = _spec_text("Šířka výrobku")
soup_product_width = float(re.search(r'(\d+\.\d+)', product_width).group(1))

# Product height (in cm)
product_height = _spec_text("Výška výrobku")
soup_product_height = float(re.search(r'(\d+\.\d+)', product_height).group(1))

# Product depth (in cm)
product_depth = _spec_text("Hloubka výrobku")
soup_product_depth = float(re.search(r'(\d+\.\d+)', product_depth).group(1))

# Product weight (in kg)
product_weight = _spec_text("Hmotnost výrobku")
soup_product_weight = float(re.search(r'(\d+\.\d+)', product_weight).group(1))



TODO: Rebuild everything above so that the values are stored as dictionary entries!

In [None]:
# Saving the values scraped so far in the dictionary.
# The names `title`, `price`, `rating` and `description` used here before are
# not defined anywhere in this notebook; the scraped values carry the `soup_` prefix.
item['title'] = soup_title
item['price'] = soup_price
item['rating'] = soup_rating
item['number of ratings'] = soup_no_rating
# TODO: no product condition/description is scraped yet, so there is no
# 'condition' entry until such a value is extracted.

In [None]:
# Display the collected product record
item

In [None]:
# Convert the product record to a JSON document and save it as a JSON file
import json

with open('product_info.json', 'w') as out_file:
    out_file.write(json.dumps(item))

# 3. Ideas for Analysis

Steps:
0) Which data can we use legally without problems? eBay, DatArt etc.

1) Creating a function to save all necessary data for one product simultaneously.
2) Create a function to let the function from 1) run over all product URL's.
2) (How to save the data of the product description -> we need to look at different products and see what variables we can define.)
3) How to work with github, commits, pull etc.?
4) How to create a nice table in github to document aims and checkpoints.
5) How to structure the repository -> documents for functions, loading packages, ReadMe etc.
6) Running the scripts again at Christmas to see whether there is a price drop because of special offers
7) Starting data analysis -> simple averages, counts etc. -> can we create a dashboard on GitHub? (Share of products etc.)
8) Further Statistical Analysis - Ideas:
    1) Predicting prices for new products (price modelling) and finding out which variables (factors) drive prices the most -> Does it make sense to build a price model, as prices consist largely of the purchase price? Could we find those purchase prices somewhere and join them with our data?
    2) Brand Analysis: Explore the average prices of smartphones for each brand. Analyze market share based on the number of products or total sales for each brand.
    3) Storage Impact: Investigate how storage capacity (GB) affects smartphone prices. Explore the most popular storage capacity among consumers.
    4) Segment the market based on different characteristics (e.g., high-end, mid-range, budget). One could then do 1) on all different segments to explore differences.
    5) Consumer Preferences: Survey or analyze customer reviews to understand what features users value the most. Identify common positive and negative sentiments related to smartphone characteristics.
    6) Profitability Analysis (given that we know production or purchase prices): Estimate profit margins for different smartphones based on their production costs and prices.
Identify the most profitable products in your dataset. One could use the number of ratings as a proxy for the number of sales.


8) Which data tests and checks can we include?

How we could build a price model and find out which variables drive smartphone prices the most:

Feature Selection:

1) Use techniques like correlation analysis, mutual information, or feature importance from tree-based models to identify the most relevant features.
Consider removing highly correlated features to avoid multicollinearity.
Model Selection:

2) Choose a regression model that suits your data and problem. Common choices include linear regression, decision tree regression, random forest regression, or gradient boosting algorithms.
Model Training:

3) Split your dataset into training and testing sets.
Train your chosen model on the training data.
Model Evaluation:

4) Evaluate the model's performance on the testing set using appropriate metrics (e.g., mean squared error, R-squared, etc.).
Analyze the residuals to check for any patterns or systematic errors.

5) Iterate and Refine hyperparameters etc.