# Precipitation Data Downloader
The aim of the notebook is to download and extract precipitation data at a daily granularity

## Import Statements

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import zipfile

In [2]:
# DATA_URL = "https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_hydrologiczne/dobowe/"
DATA_URL = "https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/"

## Functions

In [3]:
def get_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all("a") 
    links = [link.get("href") for link in links]
    return links


def download(url, save_dir="."):
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if request was successful
    if response.status_code == 200:
        # Get the file name from the URL
        filename = os.path.basename(url)
        # Combine the directory and file name to get the full path
        filepath = os.path.join(save_dir, filename)
        # Write the content to a file
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"File '{filename}' downloaded successfully.")
        # Extract the contents of the ZIP file
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(save_dir)
        print("ZIP file contents extracted successfully.")
    else:
        print("Failed to download the file.")

def remove_zip_files(directory):
    # Get the list of all files in the directory
    files = os.listdir(directory)
    # Iterate through each file
    for file in files:
        # Check if the file is a .zip file
        if file.endswith('.zip'):
            # Construct the full path to the file
            file_path = os.path.join(directory, file)
            # Remove the file
            os.remove(file_path)
            print(f"File '{file}' removed successfully.")


## Downloading

In [4]:
res = get_links(DATA_URL)
data_links = []
output_data_path = "data/precipitation"

if not os.path.exists(output_data_path):
    os.makedirs(output_data_path, exist_ok=True)

for link in res:
    if link[:4].isdigit():
        temp = DATA_URL+link
        zip_files_by_year = get_links(temp)
        zip_files_by_year = [temp+link for link in zip_files_by_year if ".zip" in link]
        year = str(temp.split("/")[-2])

        year_path = os.path.join(output_data_path, year)
        if not os.path.exists(year_path):
            os.mkdir(year_path)

        for zip_url in zip_files_by_year:
            print(zip_url)
            download(zip_url,year_path)  
        remove_zip_files(year_path)

https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/1950_1955/1950_o.zip
File '1950_o.zip' downloaded successfully.
ZIP file contents extracted successfully.
https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/1950_1955/1951_o.zip
File '1951_o.zip' downloaded successfully.
ZIP file contents extracted successfully.
https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/1950_1955/1952_o.zip
File '1952_o.zip' downloaded successfully.
ZIP file contents extracted successfully.
https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/1950_1955/1953_o.zip
File '1953_o.zip' downloaded successfully.
ZIP file contents extracted successfully.
https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/opad/1950_1955/1954_o.zip
File '1954_o.zip' downloaded successfully.
ZIP file contents extracted succ

BadZipFile: Bad CRC-32 for file 'o_d_03_2023.csv'

In [26]:
os.listdir(output_data_path)

['1951',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1957',
 '1958',
 '1959',
 '1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022']