<a href="https://colab.research.google.com/github/Lewandowski-commits/eMILA_fuel_price_scraper/blob/main/emila_fuel_price_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from plotly import express as px  
from datetime import datetime
import re

In [None]:
#set up the address and variables for scraping
url = "https://emila.com.pl/mapa-stacji"

#requests waits forever by default, so give it a timeout; also fail loudly on an HTTP error
#instead of parsing an error page into an empty station list
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = bs(page.text, "html.parser")

#find all items/cities in the scrollable widget
list_items = soup.find_all("div", "emila-list__item")

#set up our dictionary that later will turn into the dataframe
data_dict = {"link": [], "city": [], "street": []}

#build the city class pattern once, outside the loop
city_class = re.compile("emila-list__city|emila-list__city_moya")

#populate the dictionary with the scraped data
for item in list_items:
    #the last anchor with an href inside the item is used as the station link
    link = item.find_all("a", href=True)[-1]["href"]
    city = item.find("p", city_class).text
    street = item.find("p", "emila-list__street").text

    data_dict["link"].append(link)
    data_dict["city"].append(city)
    data_dict["street"].append(street)

#construct the dataframe
df = pd.DataFrame.from_dict(data_dict)

#drop any duplicates just in case
df = df.drop_duplicates()

#select only entries for eMILA stations, as the ones starting with moyastacja.pl were sold off to a different company and don't display their fuel prices
#"/stacja/" is a plain substring, so match it literally rather than as a regular expression
df = df[df["link"].str.contains("/stacja/", regex=False)]

In [None]:
#create a function that takes in the station-specific URL and parses both fuel prices (diesel and gas) into floats, as they're contained in separate HTML tags
def get_fuel_price_emila(url: str) -> dict:
    """Download one eMILA station page and read the fuel prices shown on it.

    Args:
        url: station link relative to https://emila.com.pl/ (with or without
            a leading slash); an absolute URL is used as-is.

    Returns:
        A dict with the fuel name found on the page as the key and its
        price as a float value.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    #urljoin avoids the doubled slash that plain concatenation produces for links starting with "/"
    station_url = urljoin("https://emila.com.pl/", url)

    #requests has no default timeout; without one a single stalled station page blocks the whole run
    page = requests.get(station_url, timeout=30)
    page.raise_for_status()
    soup = bs(page.text, "html.parser")

    #create a dict that will store the fuel type as the key and its price as the value
    fuel = {}

    #run this for every fuel type widget on the site
    #NOTE(review): the last price row is skipped, presumably because it is not a fuel price widget - confirm against the page markup
    price_rows = soup.find_all("div", "price__row")[:-1]
    for row in price_rows:
        #read the fuel type
        name = row.find("span", "station__icon-title").get_text(strip=True)

        #read the numbers that make the price up: one span before the decimal point,
        #then the first and last gray spans after it
        whole = row.find("span", "station__icon station__icon--big station__icon--width").get_text(strip=True)
        gray_digits = row.find_all("span", "station__icon station__icon--big station__icon--width station__icon--gray")
        tenths = gray_digits[0].get_text(strip=True)
        hundredths = gray_digits[-1].get_text(strip=True)

        #write the price to the dict
        fuel[name] = float(f"{whole}.{tenths}{hundredths}")

    return fuel

#append columns with both fuel type prices to the dataframe using the above function
#every station page is fetched and parsed only once; both columns are then read from that
#single result instead of downloading each page a second time for the second fuel type
station_prices = df["link"].apply(get_fuel_price_emila)
df["diesel"] = station_prices.apply(lambda prices: prices["DIESEL"])
df["gas"] = station_prices.apply(lambda prices: prices["PB"])

In [None]:
#prepare the chart title, stamped with the moment the notebook was run
chart_title = f"Scatter plot of eMILA fuel prices across the country on {datetime.now()}"

#draw diesel against gas prices, one colored and labelled point per station,
#with the street shown on hover and the city label placed above each point
fig = px.scatter(
    df,
    x="diesel",
    y="gas",
    color="city",
    title=chart_title,
    hover_data=["street"],
    text="city",
).update_traces(textposition="top center")

#display the finished figure
fig.show()