In [19]:
import pandas as pd
import os
import csv

In [20]:
# Names of the raw input files found in the "dataset" directory.
# - sorted: os.listdir returns entries in arbitrary order, so sorting makes
#   every run process the files in the same sequence;
# - regular files only: sub-directories (for example a ".ipynb_checkpoints"
#   folder) would make the pd.read_csv calls in the later cells fail.
files = sorted(
    name
    for name in os.listdir("dataset")
    if os.path.isfile(os.path.join("dataset", name))
)

# Directory that receives the re-encoded copy of every input file.
compress_files = "compress_dataset"

# exist_ok=True creates the directory when needed and is a no-op when it is
# already there, so no separate existence check is required.
os.makedirs(compress_files, exist_ok=True)

In [21]:
# Only these columns are inspected while building the lookup tables, so the
# other columns of each input file are not loaded at all.
LOOKUP_SOURCE_COLUMNS = ["RELATION", "RELATION_DIRECTION", "PTCAR_LG_NM_NL", "TRAIN_SERV"]


def _direction_endpoints(direction):
    """Split RELATION_DIRECTION values into their two endpoint names.

    Missing values are treated as empty strings, everything up to and
    including the first ":" (plus following whitespace) is removed, and the
    remainder is split once on " -> ".

    Parameters
    ----------
    direction : pd.Series
        The raw RELATION_DIRECTION column.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        Start and end names; a missing side is the empty string.
    """
    cleaned = direction.fillna("").str.replace(r"^[^:]+:\s*", "", regex=True)
    parts = cleaned.str.split(" -> ", expand=True, n=1)
    start = parts[0].fillna("")
    # Column 1 only exists when at least one value contained " -> ".
    if 1 in parts:
        end = parts[1].fillna("")
    else:
        end = pd.Series("", index=direction.index)
    return start, end


def _build_lookup(values, csv_path, value_header):
    """Number the distinct strings in *values* and save the table as CSV.

    Non-string entries (such as the NaN produced by missing cells) are
    discarded. The remaining strings are sorted and numbered from 0 in that
    order, then written to *csv_path* with the header ``ID,<value_header>``.

    Parameters
    ----------
    values : iterable
        Collected raw values, possibly containing non-strings.
    csv_path : str
        Destination of the lookup CSV (overwritten if present).
    value_header : str
        Header used for the name column.

    Returns
    -------
    tuple of (list, dict)
        The sorted names and the ``name -> id`` mapping.
    """
    names = sorted(item for item in values if isinstance(item, str))
    index = {name: idx for idx, name in enumerate(names)}
    # Explicit UTF-8 so names with non-ASCII characters are written the same
    # way on every platform and can be read back with pandas' default encoding.
    with open(csv_path, mode="w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["ID", value_header])
        writer.writerows([idx, name] for name, idx in index.items())
    return names, index


RELATION = set()
PTCAR_LG_NM_NL = set()
TRAIN_SERV_TYPES = set()

# First pass over the dataset: collect every distinct value of the columns
# that will be replaced by integer ids.
for file_name in files:
    df = pd.read_csv(
        os.path.join("dataset", file_name),
        usecols=LOOKUP_SOURCE_COLUMNS,
        low_memory=False,
    )

    RELATION.update(df["RELATION"].unique())

    # Station names come from PTCAR_LG_NM_NL and from both ends of
    # RELATION_DIRECTION. The empty string used for a missing endpoint is a
    # string too, so it becomes an entry of the station table.
    start, end = _direction_endpoints(df["RELATION_DIRECTION"])
    for station_names in (df["PTCAR_LG_NM_NL"], start, end):
        PTCAR_LG_NM_NL.update(station_names.unique())

    TRAIN_SERV_TYPES.update(df["TRAIN_SERV"].unique())

# From here on each name is rebound from the raw set to its sorted list, and
# the matching INDEXED_* dict maps every name to its position in that list.
RELATION, INDEXED_RELATION = _build_lookup(RELATION, "RELATION.csv", "RELATION")
PTCAR_LG_NM_NL, INDEXED_PTCAR_LG_NM_NL = _build_lookup(
    PTCAR_LG_NM_NL, "UNIQUE_STATION.csv", "STATION"
)
TRAIN_SERV_TYPES, INDEXED_TRAIN_SERV_TYPES = _build_lookup(
    TRAIN_SERV_TYPES, "TRAIN_SERV.csv", "TRAIN_SERV"
)


In [23]:
# Running totals (in MiB) of the in-memory size of every file before and
# after re-encoding.
Data_used = 0
Data_after = 0

# Each column appears once ("PLANNED_DATE_ARR" used to be listed twice and was
# therefore converted twice).
DATE_COLUMNS = ["DATDEP", "PLANNED_DATE_ARR", "PLANNED_DATE_DEP", "REAL_DATE_ARR", "REAL_DATE_DEP"]
HOURLY_COLUMNS = ["REAL_TIME_ARR", "REAL_TIME_DEP", "PLANNED_TIME_ARR", "PLANNED_TIME_DEP"]

# Columns that are not written to the compressed files.
DROPPED_COLUMNS = [
    "RELATION_DIRECTION",
    "PLANNED_DATE_DEP",
    "PLANNED_DATE_ARR",
    "REAL_DATE_ARR",
    "REAL_DATE_DEP",
]
# Parsing a column that is dropped right afterwards is wasted work, so only
# the date columns that survive are converted.
KEPT_DATE_COLUMNS = [name for name in DATE_COLUMNS if name not in DROPPED_COLUMNS]


def _clock_text(value):
    """Render one timedelta cell as the clock part of its string form.

    ``str(pd.Timedelta)`` looks like ``"0 days 12:34:56"``; the third token is
    returned, so any day component is discarded. Missing values return
    ``None`` so that ``to_csv`` writes an empty field instead of the literal
    text ``"NaT"``, which ``read_csv`` would not recognise as missing.
    """
    if isinstance(value, pd.Timedelta):
        return str(value).split()[2]
    if pd.isna(value):
        return None
    return str(value)


# Second pass over the dataset: replace names by their ids, tighten dtypes,
# drop redundant columns and write the result to `compress_files`.
for file in files:

    df = pd.read_csv(os.path.join("dataset", file), low_memory=False)
    Data_used += df.memory_usage(deep=True).sum() / 1024**2

    # Replace the repeated text values by their integer ids.
    df["RELATION"] = df["RELATION"].map(INDEXED_RELATION)
    df["TRAIN_SERV"] = df["TRAIN_SERV"].map(INDEXED_TRAIN_SERV_TYPES)
    df["PTCAR_LG_NM_NL"] = df["PTCAR_LG_NM_NL"].map(INDEXED_PTCAR_LG_NM_NL)

    # Strip everything up to the first ":" from RELATION_DIRECTION, then split
    # the remainder once on " -> " into the two endpoint names.
    direction = df["RELATION_DIRECTION"].fillna("")
    direction = direction.str.replace(r"^[^:]+:\s*", "", regex=True)
    endpoints = direction.str.split(" -> ", expand=True, n=1)
    df["START"] = endpoints[0].fillna("")
    # Column 1 only exists when at least one row contained " -> ".
    df["END"] = endpoints[1].fillna("") if 1 in endpoints else ""
    for endpoint_column in ("START", "END"):
        df[endpoint_column] = df[endpoint_column].map(INDEXED_PTCAR_LG_NM_NL)

    df = df.drop(columns=DROPPED_COLUMNS)

    # Values that do not match the "%d%b%Y" layout become NaT.
    for d in KEPT_DATE_COLUMNS:
        df[d] = pd.to_datetime(df[d], format="%d%b%Y", errors="coerce")

    for h in HOURLY_COLUMNS:
        df[h] = pd.to_timedelta(df[h])

    # Measured while the time columns are still timedelta64, before they are
    # turned back into text for the CSV.
    Data_after += df.memory_usage(deep=True).sum() / 1024**2

    for h in HOURLY_COLUMNS:
        df[h] = df[h].map(_clock_text)

    df.to_csv(os.path.join(compress_files, file), index=False)

print(f"Memory used : {Data_used} MB")

print(f"Memory after : {Data_after} MB")











 

Memory used : 242362.30205631256 MB
Memory after : 63740.648359298706 MB
