## Prepare dataset

In [31]:
import pandas as pd
from pathlib import Path

# Maps the raw CSV headers (they appear to be Azerbaijani) to the snake_case
# column names used after loading. Several raw headers contain an embedded
# line break ("\n"), so the keys must match the file headers exactly.
# NOTE(review): "min m3" in the two flow headers presumably means
# "thousand m3"; the target names only say "m3" -- confirm the units.
COLUMN_RENAME_MAP: dict[str, str] = {
    "TARİX": "timestamp",
    "XÜSUSİ ÇƏKİ\n(kq/m3)": "density_kg_m3",
    "TƏZYİQLƏR\nFƏRQİ (kPa)": "pressure_diff_kpa",
    "TƏZYİQ (kPa)": "pressure_kpa",
    "TEMPERATUR\n(C)": "temperature_c",
    "SAATLIQ\nSƏRF(min m3)": "hourly_flow_m3",
    "SƏRF (min m3)": "total_flow_m3",
}


def load_and_tag_csv(file_path: Path) -> pd.DataFrame:
    """Read one location CSV, normalise its headers and tag its origin.

    Columns listed in ``COLUMN_RENAME_MAP`` are renamed; any other column is
    left as it is. A ``location`` column holding the file's stem (the file
    name without its extension) is added to every row, and a one-line row
    count is printed.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The renamed DataFrame with the additional ``location`` column.
    """
    # Renaming happens here, per file, so that every frame already carries
    # the normalised names by the time frames are concatenated.
    tagged = (
        pd.read_csv(file_path)
        .rename(columns=COLUMN_RENAME_MAP)
        .assign(location=file_path.stem)
    )

    print(f"Loaded {file_path.name}: {len(tagged):,} rows")

    return tagged


def combine_location_files(file_paths: list[Path]) -> pd.DataFrame:
    """Load every location CSV and stack them into one DataFrame.

    Each file is loaded with ``load_and_tag_csv`` and checked against the
    expected schema *before* concatenation. Checking only the concatenated
    frame is not enough: ``pd.concat`` keeps the union of all columns, so a
    file whose headers were not renamed would be hidden by the other files
    and its rows silently filled with NaN.

    Args:
        file_paths: Paths of the per-location CSV files to combine.

    Returns:
        The rows of all files stacked vertically, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``file_paths`` contains no paths.
        RuntimeError: If any file lacks one or more of the expected columns
            (the renamed columns plus ``location``).
    """
    loaded = [(path, load_and_tag_csv(path)) for path in file_paths]

    # Fail with a clear message instead of pandas' "No objects to concatenate".
    if not loaded:
        raise ValueError("file_paths is empty: no CSV files to combine")

    # Validate per file so a mismatch is reported with the offending file name.
    required_columns = set(COLUMN_RENAME_MAP.values()) | {"location"}
    schema_problems = []
    for path, df in loaded:
        missing = required_columns - set(df.columns)
        if missing:
            schema_problems.append(f"{path.name}: {sorted(missing)}")

    if schema_problems:
        raise RuntimeError(
            "Missing expected columns: " + "; ".join(schema_problems)
        )

    dataframes = [df for _, df in loaded]
    total_expected_rows = sum(len(df) for df in dataframes)

    combined_df = pd.concat(
        dataframes,
        axis=0,
        ignore_index=True,
    )

    print("-" * 50)
    print(f"Expected total rows : {total_expected_rows:,}")
    print(f"Combined DF rows   : {len(combined_df):,}")
    print(
        "Row loss detected  :",
        "YES" if total_expected_rows != len(combined_df) else "NO"
    )

    return combined_df


if __name__ == "__main__":
    # One CSV per location; each file's stem is used as its location tag.
    files = [
        Path("../data/Mardakan.csv"),
        Path("../data/Sumqayit.csv"),
        Path("../data/Turkan.csv"),
    ]

    combined_df = combine_location_files(files)

    # Summary of the combined frame: a separator, then the final schema.
    print("-" * 50, "Final columns:", combined_df.columns.tolist(), sep="\n")

    print("Unique locations:", combined_df["location"].unique())

    # To persist the combined data, uncomment:
    # combined_df.to_csv("../data/combined_locations_clean.csv", index=False)

Loaded Mardakan.csv: 57,978 rows
Loaded Sumqayit.csv: 58,011 rows
Loaded Turkan.csv: 57,978 rows
--------------------------------------------------
Expected total rows : 173,967
Combined DF rows   : 173,967
Row loss detected  : NO
--------------------------------------------------
Final columns:
['timestamp', 'density_kg_m3', 'pressure_diff_kpa', 'pressure_kpa', 'temperature_c', 'hourly_flow_m3', 'total_flow_m3', 'location']
Unique locations: ['Mardakan' 'Sumqayit' 'Turkan']


In [33]:
# Preview the first rows of the combined frame (head() defaults to five rows).
combined_df.head()

Unnamed: 0,timestamp,density_kg_m3,pressure_diff_kpa,pressure_kpa,temperature_c,hourly_flow_m3,total_flow_m3,location
0,01-01-2018 01:00,0.73703,6.1342,593.74,10.513,4.494,107.86,Mardakan
1,01-01-2018 02:00,0.73703,5.829,606.206,10.33,4.43,106.316,Mardakan
2,01-01-2018 03:00,0.73703,5.9273,581.2,9.987,4.374,104.987,Mardakan
3,01-01-2018 04:00,0.73703,6.0022,561.711,10.138,4.325,103.795,Mardakan
4,01-01-2018 05:00,0.73703,5.6887,586.612,10.315,4.304,103.287,Mardakan
