# Digging up more SMHI data

In [1]:
import requests
import json
import re
import pathlib
import pandas as pd
from tqdm.notebook import tqdm


In [2]:
base_url = 'https://sharkdata.smhi.se/datasets/'
dataset_list_url = base_url + 'list.json'
datasets = json.loads(requests.get(dataset_list_url).text)
print(f"found {len(datasets)} datasets")


found 3283 datasets


Let's break down the datasets by year using a regex to find integers in the filename. We create a dictionary `ds_years` with the number of datasets for each year

In [3]:
ds_years = {}
for ds in datasets:
    name = ds["dataset_name"]
    year = re.findall('\d+', name )[0]
    if year in ds_years.keys():
        ds_years[year]+=1
    else:
        ds_years[year] = 1

ds_years = dict(sorted(ds_years.items()))
ds_years

{'1893': 1,
 '1894': 1,
 '1896': 1,
 '1897': 1,
 '1898': 1,
 '1899': 1,
 '1900': 1,
 '1901': 1,
 '1902': 2,
 '1903': 1,
 '1904': 1,
 '1905': 1,
 '1906': 2,
 '1907': 2,
 '1908': 2,
 '1909': 3,
 '1910': 3,
 '1911': 3,
 '1912': 2,
 '1913': 2,
 '1914': 1,
 '1915': 1,
 '1916': 1,
 '1920': 1,
 '1921': 2,
 '1922': 2,
 '1923': 1,
 '1924': 2,
 '1925': 2,
 '1926': 1,
 '1927': 2,
 '1928': 3,
 '1929': 3,
 '1930': 3,
 '1931': 3,
 '1932': 3,
 '1933': 3,
 '1934': 3,
 '1935': 3,
 '1936': 3,
 '1937': 4,
 '1938': 3,
 '1939': 3,
 '1940': 2,
 '1941': 2,
 '1942': 2,
 '1943': 2,
 '1944': 2,
 '1945': 4,
 '1946': 5,
 '1947': 3,
 '1948': 2,
 '1949': 5,
 '1950': 3,
 '1951': 4,
 '1952': 3,
 '1953': 5,
 '1954': 4,
 '1955': 4,
 '1956': 3,
 '1957': 4,
 '1958': 4,
 '1959': 2,
 '1960': 2,
 '1961': 3,
 '1962': 3,
 '1963': 6,
 '1964': 5,
 '1965': 5,
 '1966': 6,
 '1967': 6,
 '1968': 8,
 '1969': 7,
 '1970': 5,
 '1971': 9,
 '1972': 8,
 '1973': 7,
 '1974': 7,
 '1975': 9,
 '1976': 8,
 '1977': 9,
 '1978': 5,
 '1979': 9,
 '19

Let's request only datasets from 2020

In [4]:
datasets_2020 = []
for ds in datasets:
    name = ds["dataset_name"]
    year = re.findall('\d+', name )[0]
    if int(year) == 2020:
        datasets_2020.append(ds)
print(f"Of these datasets, {len(datasets_2020)} are from 2020")

Of these datasets, 137 are from 2020


As usual, we make a cache dir if it does not already exists and check for existing data before downloading

In [5]:
smhi_dir = pathlib.Path('smhi_data_cache')
if not smhi_dir.exists():
    print(f"creating directory to cache SMHI datasets at {smhi_dir.absolute()}")
    smhi_dir.mkdir(parents=True, exist_ok=True)

In [None]:
for ds_dict in tqdm(datasets_2020):
    ds_name = ds_dict["dataset_name"]
    ds_name_and_date = ds_dict["dataset_file_name"]
    ds_file = smhi_dir / (ds_name_and_date[:-3] + "csv")
    if ds_file.exists():
        # If file exists, don't re-download it
        continue
    same_name_files = list(smhi_dir.glob(f"{ds_name}*"))
    if same_name_files:
        # File exists with the same name but different date. Replace with (presumably newer) file from server
        old_file = same_name_files[0]
        old_file.unlink()
        print(f"Deleting {old_file}")
    print(f"Download {ds_file}")
    download_url = base_url + ds_name + '/data.txt'
    df = pd.read_csv(download_url, encoding='cp1252', sep='\t')
    df.to_csv(ds_file, index=False)

  0%|          | 0/137 [00:00<?, ?it/s]

Download smhi_data_cache/SHARK_Zoobenthos_2020_PAG_NVSKK_version_2022-09-23.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_SLUA_FORS_version_2022-04-27.csv
Download smhi_data_cache/SHARK_Chlorophyll_2020_2021_UMSC_HAV_version_2022-05-26.csv
Download smhi_data_cache/SHARK_Zooplankton_2020_DEEP_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zooplankton_2020_SLCK_version_2022-09-14.csv


  df = pd.read_csv(download_url, encoding='cp1252', sep='\t')


Download smhi_data_cache/SHARK_Zooplankton_2020_UMSC_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_2021_MEDINS_version_2022-04-27.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_BILKBG_version_2022-04-27.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_DLST_version_2022-04-27.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_PAG_NLST_5_version_2022-04-27.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_HAV_Asko_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_PAG_NLST_1_version_2022-09-16.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_ABLST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_LNU_KLKK_BLK_version_2022-09-14.csv
Deleting smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_HAV_Asko_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_HAV_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Zoobenthos_2020_DEEP_ILST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Z

  df = pd.read_csv(download_url, encoding='cp1252', sep='\t')


Download smhi_data_cache/SHARK_Phytoplankton_2020_MEDINS_DVVF_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_MEDINS_LVVF_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_MEDINS_MSVVF_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_MEDINS_NORKOM_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_MEDINS_UVVVF_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_PELA_GVVF_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_PELA_SVAB_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_SMHI_OLST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_SMHI_SLV_biotox_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_SMHI_ColorFantasy_version_2022-09-14.csv
Deleting smhi_data_cache/SHARK_Phytoplankton_2020_SMHI_OLST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_SMHI_vers

  df = pd.read_csv(download_url, encoding='cp1252', sep='\t')


Download smhi_data_cache/SHARK_Phytoplankton_2020_UMSC_ACLST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_UMSC_YLST_version_2022-09-14.csv
Deleting smhi_data_cache/SHARK_Phytoplankton_2020_UMSC_ACLST_version_2022-09-14.csv
Download smhi_data_cache/SHARK_Phytoplankton_2020_UMSC_version_2022-09-14.csv


  df = pd.read_csv(download_url, encoding='cp1252', sep='\t')


Download smhi_data_cache/SHARK_Phytoplankton_2020_WEAQ_NLST_version_2022-09-16.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_LNU_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_BDLST_BDK_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_DEEP_Asko_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_DEEP_DLST_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_DEEP_Hoga_kusten_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_DEEP_Singo_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_LITORALIS_ELST_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_LNU_BKVF_version_2022-09-30.csv
Download smhi_data_cache/SHARK_Epibenthos_2020_SGU_dropvideo_version_2022-09-30.csv
