In [6]:
import pandas as pd
import importlib

# Reload configuration module
import config
importlib.reload(config)

# Data I/O
from data_loading import load_data, save_data

# Configuration constants
from config import (
    CATEGORIES_TO_BE_RENAMED,
    CATEGORIES_TO_BE_RENAMED2,
    CATEGORIES_TO_BE_RENAMED3,
    CATEGORIES_TO_BE_RENAMED5,
    CATEGORIES_TO_BE_MERGED,
    CATEGORIES_TO_BE_MERGED2,
    DESIRED_ORDER1,
    DESIRED_ORDER2,
    REQUIRED,
    DISALLOWED,
    REQUIRED2,
    DISALLOWED2,
)

# Cleaning functions
from cleaning_steps import (
    clean_ts,
    extract_husnummer,
    standardize_abbreviations,
    batch_standardize_location_descriptions,
    split_vejnavn_beskrivelse,
    add_tællestedstype,
    update_coordinates,
    format_date_column,
    filter_negative_counts,
    reorder_columns,
    rename_categories,
    remove_categories,
    merge_categories,
    ÅDT_and_HDT,
    concat_df,
    total,
    rename_cykler_categories,
)

In [7]:
def Lag0():
    """Run pipeline layer 0: load the raw data, clean it, and checkpoint every step.

    Reads ``config.INPUT_FILE`` and ``config.MANUELLE_TILFØJELSER``, passes the
    combined frame through the cleaning helpers in the order listed below, and
    writes the intermediate frame to its own ``config`` output path after each
    step. The finished frame is written to ``config.OUTPUT_FILE_LAG0`` and a
    completion message is printed. Nothing is returned.
    """
    # Step 0: read the main dataset and the manually maintained additions.
    counts = load_data(config.INPUT_FILE)
    manual_additions = load_data(config.MANUELLE_TILFØJELSER)

    # Step 1: combine the two sources into a single frame.
    counts = concat_df(counts, manual_additions)
    save_data(counts, config.OUTPUT_FILE_CONCAT_DF)

    # Step 2: clean the TS column.
    counts = clean_ts(counts)
    save_data(counts, config.OUTPUT_FILE_CLEANED_TS)

    # Step 3: process VEJNAVN and extract HUSNUMMER.
    counts = extract_husnummer(counts)
    save_data(counts, config.OUTPUT_FILE_VEJNAVN_HUSNUMMER)

    # Step 4: standardize abbreviations in VEJNAVN.
    counts = standardize_abbreviations(counts)
    save_data(counts, config.OUTPUT_FILE_STANDARDIZED_ABBREVIATIONS)

    # Step 5: batch-standardize the location descriptions.
    counts = batch_standardize_location_descriptions(counts)
    save_data(counts, config.OUTPUT_FILE_CLEANED_VEJNAVNE)

    # Step 6: split VEJNAVN into VEJNAVN and BESKRIVELSE.
    counts = split_vejnavn_beskrivelse(counts)
    save_data(counts, config.OUTPUT_FILE_SPLIT_VEJNAVN)

    # Step 7: load the station information and use it to add the
    # tællestedstype data to the main frame.
    stations = load_data(config.STATION_INFO_FILE)
    counts = add_tællestedstype(counts, stations)
    save_data(counts, config.OUTPUT_FILE_TÆLLESTEDTYPE)

    # Step 8: update the GPS coordinates.
    counts = update_coordinates(counts)
    save_data(counts, config.OUTPUT_FILE_COORDINATES)

    # Step 9: format the DATO column.
    counts = format_date_column(counts)
    save_data(counts, config.OUTPUT_FILE_FIXED_DATO)

    # Step 10: calculate the total column.
    counts = total(counts)
    save_data(counts, config.OUTPUT_FILE_TOTAL_COLUMN)

    # Step 11: apply filter_negative_counts (no separate category filter is
    # called at this point).
    counts = filter_negative_counts(counts)
    save_data(counts, config.OUTPUT_FILE_FILTERED_TOTALS)

    # Step 12: reorder the columns according to DESIRED_ORDER1.
    counts = reorder_columns(counts, DESIRED_ORDER1)
    save_data(counts, config.OUTPUT_FILE_REORDER_COLUMNS)

    # Step 13: rename categories, part 1.
    counts = rename_categories(counts, CATEGORIES_TO_BE_RENAMED)
    save_data(counts, config.OUTPUT_FILE_RENAMED_CATEGORIES)

    # Step 14: rename "Cykler i alt" to "Cykler", driven by REQUIRED/DISALLOWED.
    # NOTE(review): "OUTPPUT" (double P) presumably mirrors the attribute name
    # in config, since the recorded run succeeded; rename both together if fixed.
    counts = rename_cykler_categories(counts, REQUIRED, DISALLOWED)
    save_data(counts, config.OUTPPUT_FILE_CONTROL_OF_CYKLER_I_ALT)

    # Step 15: remove the 'I alt' categories.
    counts = remove_categories(counts)
    save_data(counts, config.OUTPUT_FILE_REMOVE_CATEGORIES)

    # Step 16: write the layer result that Lag1 reads back in.
    save_data(counts, config.OUTPUT_FILE_LAG0)
    print("Lag0 completed successfully.")


# Execute the layer when the cell/script is run directly.
if __name__ == "__main__":
    Lag0()



Lag0 completed successfully.


In [8]:
def Lag1():
    """Run pipeline layer 1: merge and rename categories on top of the Lag0 result.

    Loads ``config.OUTPUT_FILE_LAG0``, applies the merge/rename steps below in
    order, and writes the intermediate frame to its own ``config`` output path
    after each step. The finished frame is written to
    ``config.OUTPUT_FILE_LAG1`` and a completion message is printed. Nothing is
    returned.
    """
    categorized = load_data(config.OUTPUT_FILE_LAG0)

    # Step 17: merge categories, part 1.
    categorized = merge_categories(categorized, CATEGORIES_TO_BE_MERGED)
    save_data(categorized, config.OUTPUT_FILE_MERGED_CATEGORIES)

    # Step 18: rename categories, part 2.
    categorized = rename_categories(categorized, CATEGORIES_TO_BE_RENAMED2)
    save_data(categorized, config.OUTPUT_FILE_RENAMED_CATEGORIES_PART2)

    # Step 19: merge categories, part 2.
    categorized = merge_categories(categorized, CATEGORIES_TO_BE_MERGED2)
    save_data(categorized, config.OUTPUT_FILE_MERGED_CATEGORIES_PART2)

    # Step 20: second pass of rename_cykler_categories, this time driven by
    # REQUIRED2/DISALLOWED2.
    categorized = rename_cykler_categories(categorized, REQUIRED2, DISALLOWED2)
    save_data(categorized, config.OUTPUT_FILE_CYKLER_CATEGORIES)

    # Step 21: rename categories, part 3.
    categorized = rename_categories(categorized, CATEGORIES_TO_BE_RENAMED3)
    save_data(categorized, config.OUTPUT_FILE_RENAMED_CATEGORIES_PART3)

    # Step 22: write the layer result that Lag2 reads back in.
    save_data(categorized, config.OUTPUT_FILE_LAG1)
    print("Lag1 completed successfully.")


# Execute the layer when the cell/script is run directly.
if __name__ == "__main__":
    Lag1()


  df = pd.read_csv(filepath, **kwargs)


Lag1 completed successfully.


In [9]:
def Lag2():
    """Run pipeline layer 2: further category renaming/merging on the Lag1 result.

    Loads ``config.OUTPUT_FILE_LAG1``, applies the steps below in order, and
    writes the intermediate frame to its own ``config`` output path after each
    step. The finished frame is written to ``config.OUTPUT_LAG2``; only after
    that write has returned is the completion message printed, so a failing
    save can no longer be reported as success. Nothing is returned.
    """
    # Load the dataset saved by Lag1.
    df = load_data(config.OUTPUT_FILE_LAG1)

    # 23. Rename categories part 4
    df = rename_categories(df, config.CATEGORIES_TO_BE_RENAMED4)
    save_data(df, config.OUTPUT_FILE_RENAMED_CATEGORIES_PART4)

    # 24. Merge categories part 3
    # NOTE(review): called without an explicit mapping, unlike the calls in
    # Lag1 — presumably merge_categories has a default; confirm in cleaning_steps.
    df = merge_categories(df)
    save_data(df, config.OUTPUT_FILE_MERGED_CATEGORIES_PART3)

    # 25. Save Lag2 (announce success only once the file has been written)
    save_data(df, config.OUTPUT_LAG2)
    print("Lag2 completed successfully.")


# Execute the layer when the cell/script is run directly.
if __name__ == "__main__":
    Lag2()

  df["KATEGORI"] = df["KATEGORI"].replace(rename_dict)


Lag2 completed successfully.


In [10]:
def Lag3():
    """Run pipeline layer 3: final renames, column order, extra rows and ÅDT/HDT.

    Loads ``config.OUTPUT_LAG2`` and ``config.AMGAGERMOTERVEJEN``, applies the
    steps below in order, and writes the intermediate frame to its own
    ``config`` output path after each step. The finished frame is written to
    ``config.OUTPUT_LAG3`` and a completion message is printed. Nothing is
    returned.
    """
    # Both inputs are read up front, before any processing starts.
    traffic = load_data(config.OUTPUT_LAG2)
    motorvej_rows = load_data(config.AMGAGERMOTERVEJEN)

    # Step 26: rename categories, part 5.
    traffic = rename_categories(traffic, CATEGORIES_TO_BE_RENAMED5)
    save_data(traffic, config.OUTPUT_FILE_RENAMED_CATEGORIES_PART5)

    # Step 27: merge categories, part 4.
    # NOTE(review): no explicit mapping is passed here — presumably
    # merge_categories has a default; confirm in cleaning_steps.
    traffic = merge_categories(traffic)
    save_data(traffic, config.OUTPUT_FILE_MERGED_CATEGORIES_PART4)

    # Step 28: rename categories, part 6 (progress is echoed to stdout).
    print("Renaming categories part 6...")
    traffic = rename_categories(traffic, config.CATEGORIES_TO_BE_RENAMED6)
    save_data(traffic, config.OUTPUT_FILE_RENAMED_CATEGORIES_PART6)

    # Step 29: reorder the columns a second time, now per DESIRED_ORDER2.
    traffic = reorder_columns(traffic, DESIRED_ORDER2)
    save_data(traffic, config.OUTPUT_FILE_REORDER_COLUMNS2)

    # Step 30: append the frame loaded from config.AMGAGERMOTERVEJEN.
    traffic = concat_df(traffic, motorvej_rows)
    save_data(traffic, config.OUTPUT_FILE_AMAGEREMOTORVEJEN)

    # Step 31: add ÅDT and HDT.
    traffic = ÅDT_and_HDT(traffic)
    save_data(traffic, config.OUTPUT_FILE_ÅDT_AND_HDT)

    # Step 32: write the final output of the whole pipeline.
    save_data(traffic, config.OUTPUT_LAG3)

    print("Pipeline completed successfully.")


# Execute the layer when the cell/script is run directly.
if __name__ == "__main__":
    Lag3()

  df["KATEGORI"] = df["KATEGORI"].replace(rename_dict)


Renaming categories part 6...
Pipeline completed successfully.
