In [ ]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep

In [ ]:
# Per-URL store of already scraped weather tables; scrape_weather() reads
# from it and writes to it so each page is downloaded at most once per run.
weather_cache = dict()

# Installation records loaded from disk; the loop further down walks these
# rows and attaches a weather reading to each one.
final_df = pd.read_csv('data/filtered_installations.csv')

# Function to scrape weather data or retrieve it from cache
def scrape_weather(url: str, timeout: float = 10.0) -> pd.DataFrame:
    """Return the weather table scraped from ``url``, using the cache if possible.

    Successful results are stored in the module-level ``weather_cache`` keyed
    by URL, so each page is downloaded at most once. Every call returns a
    copy, which keeps caller-side modifications out of the cache.

    Args:
        url: Address of the weather page to download and parse.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A DataFrame with the columns ``Time`` (hour and minutes text joined
        by ":"), ``Temperature`` (forecast text minus its last two
        characters), ``Cloudiness`` (int, 0 when the value is missing),
        ``Humidity`` (text minus its last character, "0" when missing) and
        ``Wind Speed`` (speed value, a space and the unit, minus the last
        four characters). The frame is empty when the server answers with an
        error status; such answers are not cached.

    Raises:
        requests.RequestException: If the request itself fails or times out.
        ValueError: If the page yields a different number of values per column.
    """
    columns = ['Time', 'Temperature', 'Cloudiness', 'Humidity', 'Wind Speed']

    # Check if the data for this URL is already in the cache
    if url in weather_cache:
        print(f"Using cached data for: {url}")
        return weather_cache[url].copy()

    print(f"Scraping weather data from: {url}")
    # A timeout keeps one unresponsive server from stalling the whole run.
    response = requests.get(url, timeout=timeout)

    # Pause after every real request; cache hits return before this point.
    sleep(1)  # To avoid hitting rate limits

    if not response.ok:
        # Do not parse or cache an error page: it would be remembered as
        # "no data" for this URL for the rest of the run.
        print(f"Request failed with HTTP status {response.status_code} for: {url}")
        return pd.DataFrame(columns=columns)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract time, temperature and wind data
    times, temperatures, wind_speeds = [], [], []
    for entry in soup.find_all('div', {'class': 'weather-entry'}):
        time_label = (
            entry.find('span', class_='hour').text
            + ":"
            + entry.find('span', class_='minutes').text
        )
        temperature = entry.find('span', class_='forecast-temp').text
        wind_speed = (
            entry.find('span', class_='speed-value').text
            + " "
            + entry.find('span', class_='speed-unit').text
        )

        times.append(time_label)
        # NOTE(review): the slices presumably drop the unit suffixes
        # (e.g. a degree sign plus letter) - confirm against the live page.
        temperatures.append(temperature[:-2])
        wind_speeds.append(wind_speed[:-4])

    # Extract cloudiness data
    cloudiness_list = []
    for cloud_entry in soup.find_all('div', {'class': 'entry-precipitation'}):
        cloud_tag = cloud_entry.find(
            'span', class_='entry-precipitation-value cloud-cover'
        )
        cloudiness_list.append(int(cloud_tag.text[:-1]) if cloud_tag else 0)

    # Extract humidity data
    humidities = []
    for humidity_entry in soup.find_all('div', {'class': 'entry-humidity'}):
        humidity_tag = humidity_entry.find('div', class_='entry-humidity-wrap')
        # Fall back to the text "0" so the column holds strings throughout.
        humidities.append(humidity_tag.text.strip()[:-1] if humidity_tag else '0')

    weather_data = {
        'Time': times,
        'Temperature': temperatures,
        'Cloudiness': cloudiness_list,
        'Humidity': humidities,
        'Wind Speed': wind_speeds,
    }

    # The three passes above walk the page independently, so verify that
    # they produced the same number of values before combining them.
    lengths = {name: len(values) for name, values in weather_data.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"Inconsistent number of weather values scraped from {url}: {lengths}"
        )

    # Create a DataFrame and store it in the cache
    df = pd.DataFrame(weather_data, columns=columns)
    weather_cache[url] = df

    return df.copy()


# Attach to every installation row the weather reading for the same date and
# hour, collecting the result in ``installations_with_weather_df``.
# If dataset does not have those fields, adjust the code accordingly
output_columns = ['id_licznika', 'moc', 'dlugosc', 'szerokosc', 'data', "dpv", "efekt", 'temperatura', 'zachmurzenie', 'wilgotnosc', 'wiatr']

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end, instead of enlarging a DataFrame one row at a time.
records = []

for index, row in final_df.iterrows():
    # Parse this row's own timestamp. Taking the whole final_df['data'] column
    # here would yield a Series, which has no .strftime()/.hour of its own.
    timestamp = pd.to_datetime(row['data'])
    date = timestamp.strftime('%d-%m-%Y')
    hour = timestamp.hour

    # Construct the URL for the weather archive page
    # NOTE(review): the 'index' column presumably holds the location id that
    # the site expects after "cId" - confirm against the input file.
    url = f"https://pogoda.interia.pl/archiwum-pogody-{date},cId,{int(row['index'])}"

    # Scrape the weather data
    weather_df = scrape_weather(url)
    print(f"Weather data for installation {row['id_licznika']} scraped successfully.")

    # Pick the weather entry whose time falls in the same hour as the row.
    # The hours are computed on the side so the DataFrame handed back by
    # scrape_weather is left unmodified.
    weather_hours = pd.to_datetime(weather_df['Time'], format='%H:%M').dt.hour
    weather_row = weather_df[weather_hours == hour]
    if weather_row.empty:
        print(f"No weather data found for installation {row['id_licznika']} at {hour}:00.")
        continue
    print(weather_row)
    print(row)

    # Several entries may share the hour; the first one is used.
    first_match = weather_row.iloc[0]

    # Add the weather data to the collected rows
    records.append({
        'id_licznika': row['id_licznika'],
        'moc': row['moc'],
        'data': timestamp,
        'dlugosc': row['dlugosc'],
        'szerokosc': row['szerokosc'],
        'dpv': row['dpv'],
        'efekt': row['efekt'],
        'temperatura': first_match['Temperature'],
        'zachmurzenie': first_match['Cloudiness'],
        'wilgotnosc': first_match['Humidity'],
        'wiatr': first_match['Wind Speed'],
    })

# Passing the column list keeps the layout fixed even when nothing matched.
installations_with_weather_df = pd.DataFrame(records, columns=output_columns)

print(installations_with_weather_df.head())