In [1]:
import os
import pandas as pd

# Define root directories for `o_d_` and `k_d_t_` files
opad_root_directory = "/lakehouse/default/Files/dobowe/opad"
klimat_root_directory = "/lakehouse/default/Files/dobowe/klimat"
output_directory = "/lakehouse/default/Files/Data"

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Output file paths
o_d_output_file = os.path.join(output_directory, "combined_o_d_data.csv")
k_d_t_output_file = os.path.join(output_directory, "combined_k_d_t_data.csv")

# Headers for `o_d_` and `k_d_t_` files based on your description
o_d_columns = [
    "Kod_stacji",
    "Nazwa_stacji",
    "Rok",
    "Miesiac",
    "Dzien",
    "Suma_dobowa_opadow",
    "Status_pomiaru_SMDB",
    "Rodzaj_opadu_S_W",
    "Wysokosc_pokrywy_snieznej_cm",
    "Status_pomiaru_PKSN",
    "Wysokosc_swiezospalego_sniegu_cm",
    "Status_pomiaru_HSS",
    "Gatunek_sniegu_kod",
    "Status_pomiaru_GATS",
    "Rodzaj_pokrywy_snieznej_kod",
    "Status_pomiaru_RPSN",
]

k_d_t_columns = [
    "Kod_stacji",
    "Nazwa_stacji",
    "Rok",
    "Miesiac",
    "Dzien",
    "Srednia_dobowa_temperatura_C",
    "Status_pomiaru_TEMP",
    "Srednia_dobowa_wilgotnosc_wzgledna_proc",
    "Status_pomiaru_WLGS",
    "Srednia_dobowa_predkosc_wiatru_ms",
    "Status_pomiaru_FWS",
    "Srednie_dobowe_zachmurzenie_ogolne_oktanty",
    "Status_pomiaru_NOS",
]

# Helper function to load a CSV file with fallback encodings
def load_csv_with_fallback(file_path, columns):
    encodings = ["utf-8", "windows-1250"]
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding, header=None, sep=",")
            df.columns = columns  # Assign custom headers
            return df
        except Exception:
            continue
    raise ValueError(f"Failed to load file: {file_path} with all tested encodings.")

# Function to process files and write to a single output file
def process_and_append_files(root_directory, file_prefix, output_file, columns):
    # Write headers explicitly at the beginning
    with open(output_file, mode="w", encoding="utf-8", newline="") as f:
        pd.DataFrame(columns=columns).to_csv(f, index=False, sep=",")
    
    for subdir, _, files in os.walk(root_directory):
        for file_name in files:
            if file_name.startswith(file_prefix) and file_name.endswith(".csv"):
                file_path = os.path.join(subdir, file_name)
                try:
                    df = load_csv_with_fallback(file_path, columns)
                    # Append to the output file
                    df.to_csv(output_file, mode="a", index=False, header=False, sep=",")
                    print(f"Processed and appended: {file_name}")
                except Exception as e:
                    print(f"Error processing {file_name}: {e}")

# Process and append `o_d_` files
print("Processing o_d_ files...")
process_and_append_files(opad_root_directory, "o_d_", o_d_output_file, o_d_columns)

# Process and append `k_d_t_` files
print("Processing k_d_t_ files...")
process_and_append_files(klimat_root_directory, "k_d_t_", k_d_t_output_file, k_d_t_columns)

print("Data processing completed.")

StatementMeta(, e622cbff-ed2b-4ad4-9548-f7cd749c88f0, 3, Finished, Available, Finished)

Processing o_d_ files...
Processed and appended: o_d_01_2001.csv
Processed and appended: o_d_02_2001.csv
Processed and appended: o_d_03_2001.csv
Processed and appended: o_d_04_2001.csv
Processed and appended: o_d_05_2001.csv
Processed and appended: o_d_06_2001.csv
Processed and appended: o_d_07_2001.csv
Processed and appended: o_d_08_2001.csv
Processed and appended: o_d_09_2001.csv
Processed and appended: o_d_10_2001.csv
Processed and appended: o_d_11_2001.csv
Processed and appended: o_d_12_2001.csv
Processed and appended: o_d_01_2002.csv
Processed and appended: o_d_02_2002.csv
Processed and appended: o_d_03_2002.csv
Processed and appended: o_d_04_2002.csv
Processed and appended: o_d_05_2002.csv
Processed and appended: o_d_06_2002.csv
Processed and appended: o_d_07_2002.csv
Processed and appended: o_d_08_2002.csv
Processed and appended: o_d_09_2002.csv
Processed and appended: o_d_10_2002.csv
Processed and appended: o_d_11_2002.csv
Processed and appended: o_d_12_2002.csv
Processed and a

In [3]:
import pandas as pd
# Load data into pandas DataFrame from "/lakehouse/default/Files/Data/combined_k_d_t_data.csv"
df = pd.read_csv("/lakehouse/default/Files/Data/combined_k_d_t_data.csv")
display(df)


StatementMeta(, 670add35-abc3-4f89-a008-b2cdbea81c63, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 687451b4-5b6f-43ee-9ef4-038afa34576f)

In [2]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("/lakehouse/default/Files/Data/combined_o_d_data.csv")

# Combine the columns into a single Date column
df['Date'] = pd.to_datetime(df['Rok'].astype(str) + '-' + df['Miesiac'].astype(str) + '-' + df['Dzien'].astype(str), errors='coerce')

# Drop the original columns
#df = df.drop(columns=['Rok', 'Miesiac', 'Dzien'])

# Save the updated dataframe to a new CSV file
df.to_csv('/lakehouse/default/Files/Data/combined_o_d_data_year.csv', index=False)


StatementMeta(, e622cbff-ed2b-4ad4-9548-f7cd749c88f0, 4, Finished, Available, Finished)

  df = pd.read_csv("/lakehouse/default/Files/Data/combined_o_d_data.csv")


In [5]:
import pandas as pd
# Load data into pandas DataFrame from "/lakehouse/default/Files/Data/combined_o_d_data_year.csv"
df = pd.read_csv("/lakehouse/default/Files/Data/combined_o_d_data_year.csv")
display(df)


StatementMeta(, 670add35-abc3-4f89-a008-b2cdbea81c63, 7, Finished, Available, Finished)

  df = pd.read_csv("/lakehouse/default/Files/Data/combined_o_d_data_year.csv")


SynapseWidget(Synapse.DataFrame, a288c849-bc5a-431f-a388-5a2d135c5731)

In [3]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("/lakehouse/default/Files/Data/combined_k_d_t_data.csv")

# Combine the columns into a single Date column
df['Date'] = pd.to_datetime(df['Rok'].astype(str) + '-' + df['Miesiac'].astype(str) + '-' + df['Dzien'].astype(str), errors='coerce')

# Drop the original columns
#df = df.drop(columns=['Rok', 'Miesiac', 'Dzien'])

# Save the updated dataframe to a new CSV file
df.to_csv('/lakehouse/default/Files/Data/combined_k_d_t_data_year.csv', index=False)


StatementMeta(, e622cbff-ed2b-4ad4-9548-f7cd749c88f0, 5, Finished, Available, Finished)