# Preprocessing
## 3. Extracting data per station

In [3]:
## Import required  packages
import os  # File and directory operations
from os.path import join as pjoin  # Joining file directories
import time  # To measure the execution time of a code block in Python

import subprocess  # Subprocess execution

import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
import matplotlib.pyplot as plt  # Plotting

import rasterio  # Raster data manipulation
from rasterio.warp import calculate_default_transform, reproject, Resampling

from osgeo import gdal  # Geospatial Data Abstraction Library

import pcraster as pcr  # PCRaster library

import xarray as xr  # Multidimensional array manipulation
from netCDF4 import Dataset  # NetCDF data manipulation

import random
import warnings
import math

from multiprocess import Pool
from concurrent import futures
import glob
import re
import tqdm

In [4]:
# All relative paths used in this notebook are resolved against this folder.
directory = 'raw_data'
# Only change directory when we are not already inside it; otherwise
# re-running this cell would try to enter raw_data/raw_data and fail.
if os.path.basename(os.getcwd()) != directory:
    os.chdir(directory)

In [3]:
def check_dir_or_make(path):
    """Ensure that the directory ``path`` exists, creating it if necessary.

    Missing parent directories are created as well. Calling the function for
    a directory that already exists is a no-op.

    Parameters
    ----------
    path : str
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the race between a separate "exists?" check and
    # the creation call when several workers create the same directory.
    os.makedirs(path, exist_ok=True)

In [25]:
def extract_variables_to_csv(model, model_var):
    """Write one CSV time series per station for ``model_var``.

    Every PCRaster map matching
    ``{model}/upstream/upstream_timesteps/{model_var}*.map`` is read once and
    sampled at the coordinates of each selected station. Stations are read
    from ``stationLatLon.csv`` (relative to the current working directory) and
    restricted to ``wmo_reg == 6``, latitude between 45 and 54.5 and longitude
    between 4 and 15.5. Each station is saved to
    ``{model}/upstream_station_all/{grdc_no}_{model_var}.csv`` with the columns
    ``datetime`` and ``model_var``.

    NOTE: this notebook defines ``extract_variables_to_csv`` again in a later
    cell; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    model : str
        Name of the model directory that contains ``upstream/upstream_timesteps``.
    model_var : str
        File-name prefix of the maps to read; also used as the name of the
        value column and as the suffix of the output file names.

    Returns
    -------
    None
        Results are written to disk.
    """
    filePath = f'{model}/upstream/upstream_timesteps'
    outputpath = f'{model}/upstream_station_all'

    # Restrict the search to the maps of the requested variable (a bare
    # '*.map' would mix the values of every variable stored in the folder).
    # glob returns paths in arbitrary order, so sort for reproducible rows.
    file_list = sorted(glob.glob(os.path.join(filePath, f'{model_var}*.map')))

    # Read the station latitude and longitude from a CSV file
    loc = pd.read_csv('stationLatLon.csv')
    loc = loc[(loc.wmo_reg == 6) & (loc.lat.between(45, 54.5)) & (loc.lon.between(4, 15.5))]

    # One (x, y, station id, rows) entry per station. iterrows() is kept on
    # purpose: it may upcast grdc_no (e.g. to float) and that value ends up in
    # the output file name, which must stay unchanged.
    stations = [
        (station_row['lon'], station_row['lat'], station_row['grdc_no'], [])
        for _, station_row in loc.iterrows()
    ]

    for file in file_list:
        # The last two "_"-separated tokens of the file stem are year and month
        base_name = os.path.splitext(os.path.basename(file))[0]
        date_part = base_name.split("_")[-2:]
        date = f'{date_part[0]}_{date_part[1]}'

        # Read each map once and sample it for all stations
        pcr_map = pcr.readmap(file)

        for x_coord, y_coord, _, rows in stations:
            # PCRaster expects (x, y), i.e. (longitude, latitude), and returns
            # a (value, is_valid) pair; is_valid is False for missing values.
            value, is_valid = pcr.cellvalue_by_coordinates(pcr_map, x_coord, y_coord)
            if is_valid and not pd.isna(value):
                rows.append({'datetime': date, model_var: value})

    # Make sure the destination exists before writing
    os.makedirs(outputpath, exist_ok=True)

    for _, _, station_no, rows in stations:
        station_csv = os.path.join(outputpath, f'{station_no}_{model_var}.csv')
        # Build the frame in one go instead of growing it row by row
        pd.DataFrame(rows, columns=['datetime', model_var]).to_csv(station_csv, index=False)

# Extract the 'meteo_rain' variable of the 'meteo' model into one CSV per station
model = 'meteo'
model_var = 'meteo_rain'
extract_variables_to_csv(model, model_var)

In [5]:
def near(array, value):
    """Return the index of the element of ``array`` closest to ``value``.

    The index refers to the flattened array; when several elements are
    equally close, the first one wins.
    """
    distance = np.abs(array - value)
    return distance.argmin()

def get_latlon(stations=None, dataset=None):
    """Find the grid indices nearest to each station coordinate.

    Parameters
    ----------
    stations : mapping or DataFrame, optional
        Object providing ``'lon'`` and ``'lat'`` sequences with the station
        coordinates. Defaults to the notebook-level variable ``loc``.
    dataset : object, optional
        Object whose ``variables['x']`` and ``variables['y']`` hold the grid
        coordinates (presumably an open netCDF dataset; confirm against the
        caller). Defaults to the notebook-level variable ``nc_sample``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ix, iy)``: for every station, the index of the nearest ``x`` and
        of the nearest ``y`` grid coordinate.

    Notes
    -----
    For backward compatibility the result is also stored on the function
    itself as ``get_latlon.ix`` and ``get_latlon.iy``.
    """
    # Fall back to the notebook globals so existing ``get_latlon()`` calls
    # keep working; passing the inputs explicitly avoids the hidden state.
    if stations is None:
        stations = loc
    if dataset is None:
        dataset = nc_sample

    xin, yin = np.array(stations['lon']), np.array(stations['lat'])  # station lon, lat

    lon = dataset.variables['x'][:]  # grid x coordinates
    lat = dataset.variables['y'][:]  # grid y coordinates

    # Nearest grid point for every station
    ix = np.array([near(lon, x_station) for x_station in xin])
    iy = np.array([near(lat, y_station) for y_station in yin])

    get_latlon.ix = ix
    get_latlon.iy = iy
    return ix, iy

In [4]:
def extract_variables_to_csv(model, model_var):
    """Extract per-station time series of ``model_var`` from PCRaster maps.

    All maps matching ``{model}/upstream/upstream_timesteps/{model_var}*.map``
    are read (each file exactly once) and sampled at every selected station.
    Stations come from ``stationLatLon.csv`` in the current working directory,
    filtered to ``wmo_reg == 6``, latitude between 45 and 54.5 and longitude
    between 4 and 15.5. For each station a file
    ``{model}/upstream_station_all/{grdc_no}_{model_var}.csv`` is written with
    the columns ``datetime`` (``"<year>_<month>"`` taken from the map file
    name) and ``model_var``.

    Parameters
    ----------
    model : str
        Name of the model directory that contains ``upstream/upstream_timesteps``.
    model_var : str
        File-name prefix of the maps to read; also the name of the value
        column and the suffix of the output file names.

    Returns
    -------
    None
        Results are written to disk.
    """
    filePath = f'{model}/upstream/upstream_timesteps'
    outputpath = f'{model}/upstream_station_all'

    # glob yields paths in arbitrary order; sort so that the rows of every
    # CSV are written in a reproducible order.
    file_list = sorted(glob.glob(os.path.join(filePath, f'{model_var}*.map')))

    # Read the station latitude and longitude from a CSV file
    loc = pd.read_csv('stationLatLon.csv')
    loc = loc[(loc.wmo_reg == 6) & (loc.lat.between(45, 54.5)) & (loc.lon.between(4, 15.5))]

    # One (x, y, station id, rows) entry per station. iterrows() is kept on
    # purpose: it may upcast grdc_no (e.g. to float) and that value ends up in
    # the output file name, which must stay unchanged.
    stations = [
        (station_row['lon'], station_row['lat'], station_row['grdc_no'], [])
        for _, station_row in loc.iterrows()
    ]

    for file in file_list:
        # The last two "_"-separated tokens of the file stem are year and month
        base_name = os.path.splitext(os.path.basename(file))[0]
        date_part = base_name.split("_")[-2:]
        date = f'{date_part[0]}_{date_part[1]}'

        # Reading a map is the expensive step: do it once per file rather
        # than once per station and file.
        pcr_map = pcr.readmap(file)

        for x_coord, y_coord, _, rows in stations:
            # PCRaster expects (x, y), i.e. (longitude, latitude), and returns
            # a (value, is_valid) pair; is_valid is False for missing values.
            value, is_valid = pcr.cellvalue_by_coordinates(pcr_map, x_coord, y_coord)
            if is_valid and not pd.isna(value):
                rows.append({'datetime': date, model_var: value})

    # to_csv does not create missing folders
    os.makedirs(outputpath, exist_ok=True)

    for _, _, station_no, rows in stations:
        station_csv = os.path.join(outputpath, f'{station_no}_{model_var}.csv')
        # Build the frame from the collected records in a single step; growing
        # a DataFrame row by row is quadratic in the number of rows.
        pd.DataFrame(rows, columns=['datetime', model_var]).to_csv(station_csv, index=False)

In [6]:
# Extract the 'meteo_rain' variable of the 'meteo' model into one CSV per station
model = 'meteo'
model_var = 'meteo_rain'
extract_variables_to_csv(model, model_var)

In [7]:
# Extract the 'meteo_tair' variable of the 'meteo' model into one CSV per station
model = 'meteo'
model_var = 'meteo_tair'
extract_variables_to_csv(model, model_var)

In [9]:
# Names of the model directories (relative to the working directory) to process
models = ["pcr", "wg3", "lis"]

In [10]:
# For every model: make sure its output folder exists, then run the station
# extraction once per NetCDF file found directly inside the model folder.
for model in models:

    filePath = f'{model}/upstream/upstream_timesteps'
    outputPath = f'{model}/upstream_station_all/'
    check_dir_or_make(outputPath)

    for file in os.listdir(model):
        # Only NetCDF files name a variable to extract
        if not file.endswith(".nc"):
            continue

        # The variable name is the part of the file name before the first dot
        var = file.split(".")[0]
        extract_variables_to_csv(model, var)

    print("Finished for variables set", model)

Finished for variables set pcr
Finished for variables set wg3
Finished for variables set lis
