In [1]:
import os 
import pandas as pd
from os.path import join as pjoin # Joining file directories
import glob
from sklearn.preprocessing import StandardScaler

In [2]:
# Set the working directory and switch into it; the relative "predictors/..."
# paths used in the following cells are resolved against this directory.
# NOTE(review): work_dir is a relative path, so re-running this cell in the
# same kernel resolves "data" against the already-changed directory. Restart
# the kernel before running it again.
work_dir = "data"
os.chdir(work_dir)

In [3]:
# Input folder with one combined predictor CSV per station, and the folder the
# normalised files are written to. Both are relative to the working directory.
folder = "predictors/combined/"
output_folder = "predictors/allpredictors/"

# `folder` already ends with "/", so the pattern contains a doubled slash;
# glob tolerates this and the string is kept unchanged on purpose.
file_pattern = f"{folder}/*.csv"

# glob returns matches in a file-system dependent order; sort so the stations
# are always processed in the same order from run to run.
file_list = sorted(glob.glob(file_pattern))

In [4]:
# Z-score normalise the predictor columns of every station file and write the
# cleaned result to `output_folder` under the same file name. Files without an
# 'obs' column are skipped and their station id is recorded instead.
stations_without_obs = []

# Settings that are identical for every file, defined once outside the loop.
start_date = pd.to_datetime('1979-01-01')
end_date = pd.to_datetime('2012-12-01')

# Columns to normalize
columns_to_normalize = ['meteo_rain', 'meteo_tair', 'wg3_RootMoist', 'wg3_SurfStor',
                        'wg3_SWE', 'lis_SurfMoist', 'lis_SWE', 'pcr_SurfMoist',
                        'pcr_SurfStor', 'pcr_SWE']

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

for file_name in file_list:
    # os.path.basename strips the directory on every platform; splitting on
    # "\\" only did so for Windows-style paths.
    file = os.path.basename(file_name)
    output_path = os.path.join(output_folder, file)

    # The station id is the second "_"-separated token of the file stem.
    station_id = int(os.path.splitext(file)[0].split("_")[1])
    df = pd.read_csv(file_name)
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Keep only the rows inside the study period (both bounds inclusive).
    in_period = (df['datetime'] >= start_date) & (df['datetime'] <= end_date)
    df_subset = df[in_period]

    if 'obs' not in df_subset.columns:
        stations_without_obs.append(station_id)
        continue

    # Drop rows with NaN values. This builds a new frame instead of mutating
    # a filtered slice of `df` in place, which pandas warns about.
    df_subset = df_subset.dropna(axis=0)

    # Perform z-score normalization; the scaler is fitted per station, so each
    # file is standardised with its own mean and standard deviation.
    # NOTE(review): if no rows survive the filtering above, fit_transform is
    # expected to raise -- confirm whether such files can occur.
    scaler = StandardScaler()
    normalized_data = df_subset.copy()
    normalized_data[columns_to_normalize] = scaler.fit_transform(df_subset[columns_to_normalize])

    # Save cleaned dataframe to file
    normalized_data.to_csv(output_path, index=False)

print(len(stations_without_obs)) #These stations are not within the project study area

39


In [27]:
import glob
import os
import pandas as pd

# Build a table of missing-value counts: one row per column name, one column
# per station, restricted to the study period.
folder = "predictors/combined/"
file_pattern = f"{folder}/*.csv"
# Sorted so the station columns of the table come out in a reproducible order
# (glob's own ordering depends on the file system).
file_list = sorted(glob.glob(file_pattern))

# The study period is the same for every file, so parse the bounds once.
start_date = pd.to_datetime('1979-01-01')
end_date = pd.to_datetime('2012-12-01')

station_ids = []
dataframes = []

for file_name in file_list:
    # The station id is the second "_"-separated token of the file stem; it is
    # kept as a string here and becomes the column label in `nan_sum`.
    station_id = os.path.splitext(os.path.basename(file_name))[0].split("_")[1]
    df = pd.read_csv(file_name)
    df['datetime'] = pd.to_datetime(df['datetime'])

    df_subset = df[(df['datetime'] >= start_date) & (df['datetime'] <= end_date)]
    station_ids.append(station_id)
    dataframes.append(df_subset)

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dataframes, keys=station_ids)

# Get the unique columns across all dataframes
columns = combined_df.columns.unique()

# Calculate the sum of NaN values for each variable/column and each station.
# The table is built in one step rather than by inserting one column per
# station into an existing frame. Passing `index=columns` aligns every
# station's counts to the full column list, so a column a station's file does
# not have shows up as NaN (not 0) for that station.
nan_sum = pd.DataFrame(
    {
        station_id: df_subset.isna().sum()
        for station_id, df_subset in zip(station_ids, dataframes)
    },
    index=columns,
)

print(nan_sum)


               6139391  6139400  6140400  6140401  6221100  6221101  6221102   
Unnamed: 0         0.0      0.0        0        0        0        0        0  \
datetime           0.0      0.0        0        0        0        0        0   
meteo_rain         0.0      0.0        0        0        0        0        0   
meteo_tair         0.0      0.0        0        0        0        0        0   
wg3_dis            0.0      0.0        0        0        0        0        0   
wg3_RootMoist      0.0      0.0        0        0        0        0        0   
wg3_SurfStor       0.0      0.0        0        0        0        0        0   
wg3_SWE            0.0      0.0        0        0        0        0        0   
lis_dis            0.0      0.0        0        0        0        0        0   
lis_SurfMoist      0.0      0.0        0        0        0        0        0   
lis_SWE            0.0      0.0        0        0        0        0        0   
pcr_dis            0.0      0.0        0

In [28]:
# Collect every station that has at least one column made up entirely of NaN.
# NOTE(review): `results` is not defined in any cell visible here; presumably
# it maps station id -> DataFrame and comes from a cell that was removed or
# run earlier. Confirm before relying on a fresh-kernel run.
stations_with_missing_equal_total = []

for station_id, df_subset in results.items():
    # A column is completely missing when its NaN count equals the row count.
    fully_missing = df_subset.isna().sum().eq(len(df_subset))

    if fully_missing.any():
        stations_with_missing_equal_total.append(station_id)

print(stations_with_missing_equal_total)


[]


In [30]:
# Print the NaN counts station by station; each item of `nan_sum.items()` is a
# (column label, Series of per-column counts) pair.
for station_key, nan_counts in nan_sum.items():
    print(nan_counts)

Unnamed: 0       0.0
datetime         0.0
meteo_rain       0.0
meteo_tair       0.0
wg3_dis          0.0
wg3_RootMoist    0.0
wg3_SurfStor     0.0
wg3_SWE          0.0
lis_dis          0.0
lis_SurfMoist    0.0
lis_SWE          0.0
pcr_dis          0.0
pcr_SurfMoist    0.0
pcr_SurfStor     0.0
pcr_SWE          0.0
obs              NaN
Name: 6139391, dtype: float64
Unnamed: 0       0.0
datetime         0.0
meteo_rain       0.0
meteo_tair       0.0
wg3_dis          0.0
wg3_RootMoist    0.0
wg3_SurfStor     0.0
wg3_SWE          0.0
lis_dis          0.0
lis_SurfMoist    0.0
lis_SWE          0.0
pcr_dis          0.0
pcr_SurfMoist    0.0
pcr_SurfStor     0.0
pcr_SWE          0.0
obs              NaN
Name: 6139400, dtype: float64
Unnamed: 0       0
datetime         0
meteo_rain       0
meteo_tair       0
wg3_dis          0
wg3_RootMoist    0
wg3_SurfStor     0
wg3_SWE          0
lis_dis          0
lis_SurfMoist    0
lis_SWE          0
pcr_dis          0
pcr_SurfMoist    0
pcr_SurfStor     0
pc