In [1]:
import glob
import json
import os
import re
import shutil
import tempfile
import zipfile
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def file_path(file_name, file_format, base_path=None, folder_name=None):
    '''Build the path "<base_path>/<folder_name>/<file_name>.<file_format>".

    args:
        - file_name: name of the file without its extension
        - file_format: extension of the file without the leading dot
        - base_path: directory the path starts from; when None a hard-coded
          default directory is used
        - folder_name: optional sub-folder below base_path; when None no
          sub-folder is inserted

    return:
        - the joined path as a string (nothing is created on disk)
    '''
    default_base = r"/Users/magnushovmand/Dropbox/UNI/Speciale"
    root = default_base if base_path is None else base_path
    # An empty folder name makes os.path.join place the file directly in root
    subfolder = "" if folder_name is None else folder_name
    return os.path.join(root, subfolder, f"{file_name}.{file_format}")

def open_file_lines(file_path):
    '''Read a text file and return all of its lines as a list.

    The file is opened in text mode with the platform default encoding and
    each returned line keeps its trailing newline.

    args:
        - file_path: path of the file to read (note: inside this function the
          parameter name hides the module-level file_path function)

    return:
        - list of strings, one per line; an empty list for an empty file
    '''
    with open(file_path, 'r') as handle:
        lines = list(handle)
    return lines

def get_zip_files(directory_path):
    '''List the paths of the zip files located directly in a directory.

    Only entries matching "*.zip" in directory_path itself are returned;
    sub-directories are not searched.

    args:
        - directory_path: directory to look in

    return:
        - list of paths (directory_path joined with each file name), empty
          when nothing matches
    '''
    zip_pattern = os.path.join(directory_path, '*.zip')
    return glob.glob(zip_pattern)

def _year_from_zip_path(zip_file):
    '''returns the first standalone four-digit number found for the zip file, used as its year'''
    year_pattern = r'\b\d{4}\b'
    # Check the file name first so a four-digit directory earlier in the path
    # cannot be mistaken for the year; fall back to the full path otherwise.
    match = re.search(year_pattern, os.path.basename(zip_file)) or re.search(year_pattern, zip_file)
    if match is None:
        raise ValueError(f"Could not find a four-digit year in {zip_file}")
    return match.group()


def _count_txt_files(root_dir):
    '''counts the .txt files below root_dir (the number of progress bar steps)'''
    return sum(
        1
        for _, _, files in os.walk(root_dir)
        for file_name in files
        if file_name.endswith('.txt')
    )


def _store_observation(result, item, parameters_wanted):
    '''adds one decoded observation to result if its parameter is one of parameters_wanted'''
    properties = item['properties']
    parameter_id = properties['parameterId']
    if parameter_id not in parameters_wanted:
        # Filter first so no time is spent parsing observations that are discarded
        return

    station_id = properties['stationId']
    geometry = item['geometry']
    # Observations without a geometry are grouped under their station id instead
    coordinates = geometry['coordinates'] if geometry is not None else station_id
    coordinates_key = str(coordinates)

    # Correcting the time: the observed timestamp is moved one hour back and
    # truncated to the hour (presumably so a "past1h" value is keyed by the
    # hour it covers -- confirm). Observations that end up in the same hour
    # share one entry and a later value for the same parameter overwrites an
    # earlier one.
    observed_time_dt = datetime.strptime(properties['observed'], '%Y-%m-%dT%H:%M:%SZ')
    observed_time_dt -= timedelta(hours=1)
    corrected_observed_time = observed_time_dt.strftime('%Y-%m-%d-%H')

    station_entry = result.setdefault(coordinates_key, {"stationId": station_id, "Data": {}})
    station_entry["Data"].setdefault(corrected_observed_time, {})[parameter_id] = properties['value']


def main(directory_path):
    '''extracts weather data from the bulk extraction zip files and saves the wanted variables to json files
    Each zip file (one per year) is written to its own json file

    The json file maps str(coordinates) (or the station id when an observation
    has no geometry) to {"stationId": ..., "Data": {"YYYY-MM-DD-HH": {parameterId: value}}}.

    args:
        - directory_path: path to the directory where the zip files can be found;
          the json files are written to the same directory

    return:
        - return none, but save the data into json files. One file for each year

    raises:
        - ValueError: if no four-digit year can be found for a zip file. This is
          checked before the archive is extracted so no work is wasted.
    '''
    parameters_wanted = ["temp_mean_past1h", "wind_dir_past1h", "wind_speed_past1h", "sun_last1h_glob"]

    for zip_file in tqdm(get_zip_files(directory_path), desc="Processing ZIP files"):
        year = _year_from_zip_path(zip_file)
        archive_stem = os.path.splitext(os.path.basename(zip_file))[0]
        result = {}

        # Step 1: Unzip the ZIP file into a fresh temporary directory next to it.
        # A unique directory cannot collide with an existing folder that happens
        # to share the archive's name, and it is removed again even if the
        # processing below fails. Each zip file contains about 10-15 GB data.
        with tempfile.TemporaryDirectory(prefix=f"{archive_stem}_", dir=directory_path) as extract_path:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(extract_path)

            # Step 2: Process each txt file
            with tqdm(total=_count_txt_files(extract_path), desc=f"Processing {zip_file}") as pbar:
                for root, _, files in os.walk(extract_path):
                    for file_name in files:
                        if not file_name.endswith('.txt'):
                            continue

                        # The files do not have proper json format though the individual
                        # lines do. We therefore only load the lines as json
                        for line in open_file_lines(os.path.join(root, file_name)):
                            try:
                                item = json.loads(line)
                            except json.JSONDecodeError:
                                print(f"Error decoding line: {line}") #line for debugging problematic lines in the txt files
                                continue
                            _store_observation(result, item, parameters_wanted)
                        pbar.update(1) #updates the progress bar

        # Step 3: Save to JSON (the extracted data has already been deleted above)
        file_saved_to = file_path(f"weather_data_{year}", "json", base_path=directory_path, folder_name=None)
        with open(file_saved_to, 'w') as json_file:
            json.dump(result, json_file, indent=4)

        print(f"The dataset for {year} is done. It contains data for {len(result)} weather station. The data is saved to {file_saved_to}")

In [3]:
# Run the extraction for every ZIP file found directly in the given directory;
# one weather_data_<year>.json file per archive is written to that same directory.
# NOTE(review): the path below is an absolute, machine-specific location -- adjust it before running elsewhere.
main("/Users/magnushovmand/Desktop")

Processing ZIP files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing /Users/magnushovmand/Desktop/2022.zip: 100%|██████████| 365/365 [1:04:23<00:00, 10.58s/it]
Processing ZIP files:  50%|█████     | 1/2 [1:08:26<1:08:26, 4106.05s/it]

Processed 68 files and saved to /Users/magnushovmand/Desktop/weather_data_2022.json


Processing /Users/magnushovmand/Desktop/2023.zip: 100%|██████████| 236/236 [34:49<00:00,  8.85s/it]
Processing ZIP files: 100%|██████████| 2/2 [1:45:18<00:00, 3159.12s/it]  

Processed 68 files and saved to /Users/magnushovmand/Desktop/weather_data_2023.json



