In [1]:
import os
from pathlib import Path

import pandas as pd
import polars as pl



# Directory the notebook reads its input CSV files from.
# The default is the original machine-specific location; set the
# NCDB_BRONZE_DIR environment variable to run the notebook against a copy of
# the data that lives somewhere else, without editing this cell.
BRONZE_DIR = Path(
    os.environ.get(
        "NCDB_BRONZE_DIR",
        "C:\\Users\\rajpu\\OneDrive\\Documents\\Projects_git\\blood-supply-risk-monitor\\data\\bronze\\ncdb",
    )
)



In [None]:
# Collect the input CSVs in sorted order: Path.glob() makes no ordering
# guarantee, and both positional lookups such as files[5] and the row order of
# the merged frame depend on this list.
# The merged export that the last cell of this notebook writes lands in this
# same directory, so it is excluded to keep a re-run from ingesting its own
# output as if it were a raw input file.
files = sorted(
    path
    for path in BRONZE_DIR.glob("*.csv")
    if path.name != "ncdb_merged_filtered.csv"
)
print(f"Loading all CSV files from: {BRONZE_DIR}")
print(f"Found {len(files)} files to merge.")
file_strings = [str(f) for f in files]

In [None]:
# Preview the header of one representative file (no data rows are read).
# The sample is taken at index 5 when that many files exist; the index is
# clamped so the cell also works with fewer files, and an empty input
# directory is reported explicitly instead of surfacing as an IndexError.
if not files:
    raise FileNotFoundError(f"No CSV files found in {BRONZE_DIR}")

sample_df = pl.read_csv(
    files[min(5, len(files) - 1)],
    encoding='latin1',
    n_rows=0,  # Only read headers, no data
    ignore_errors=True
)
expected_columns = sample_df.columns
print(f"Expected columns: {expected_columns}")

In [None]:
# Check all files for column consistency by reading only each header row.
all_columns = {}
for file in files:
    df_header = pl.read_csv(
        file,
        encoding='latin1',
        n_rows=0,  # headers only
        ignore_errors=True
    )
    all_columns[file.name] = df_header.columns
    print(f"{file.name}: {len(df_header.columns)} columns")

# Determine expected columns.
column_sets = [set(cols) for cols in all_columns.values()]
if not column_sets:
    # Without this guard the comparison below fails with an opaque IndexError.
    raise FileNotFoundError("No CSV headers were read: `files` is empty.")

# Use the sorted union in both branches. When every file agrees the union is
# simply that shared set, and sorting keeps the column order deterministic
# (iterating a set of strings can yield a different order on every run).
expected_columns = sorted(set.union(*column_sets))
if all(cols == column_sets[0] for cols in column_sets):
    print(" All files have identical columns")
else:
    print(" Warning: Inconsistent columns")


In [None]:
# Every column name seen in any scanned header, in sorted order.
expected_columns = sorted(set.union(*column_sets))


In [None]:
# Accumulator for the per-file frames appended by the loading loop.
dfs = list()


In [None]:
# Dtypes passed to pl.read_csv as schema_overrides for these columns.
# P_AGE is read as a string here and cast to Int64 (non-strict) by the
# loading loop afterwards.
schema_overrides = {
    **dict.fromkeys(("C_YEAR", "C_MNTH", "C_SEV"), pl.Int64),
    **dict.fromkeys(("P_AGE", "P_SEX"), pl.String),
}
# Columns whose absence from a file is reported with a warning while loading.
critical_cols = [
    "C_YEAR",
    "C_MNTH",
    "C_SEV",
]

# Raw cell values that are parsed as null when the CSVs are read.
NULL_CODES = [
    "", "NA", "nan",
    "N", "NN",
    "U", "UU",
    "X", "XX",
    "Q", "QQ",
]
# Columns retained from each file, in output order. A column that a given file
# does not contain is simply skipped when the file is loaded.
columns_to_keep = (
    # Collision features (C_SEV is the target variable for severity prediction)
    [
        "C_YEAR", "C_MNTH", "C_WDAY", "C_HOUR", "C_SEV", "C_WTHR",
        "C_RSUR", "C_CONF", "C_TRAF", "C_RCFG", "C_VEHS", "C_RALN",
    ]
    # Person features
    + ["P_SEX", "P_AGE", "P_ISEV", "P_PSN", "P_SAFE", "P_USER"]
    # Vehicle features
    + ["V_TYPE", "V_YEAR"]
)
# Alternate header spellings (left) mapped to the canonical column names
# (right) used by the rest of the notebook. Only the keys actually present in
# a file are applied when that file is loaded.
# NOTE(review): the keys look like French-language header variants - confirm
# against the source files.
rename_map = dict(
    [
        ("C_ANNÉE", "C_YEAR"),
        ("C_MOIS", "C_MNTH"),
        ("C_JSEM", "C_WDAY"),
        ("C_HEURE", "C_HOUR"),
        ("C_GRAVITÉ", "C_SEV"),
        ("C_MÉTEO", "C_WTHR"),
        ("C_VÉH", "C_VEHS"),
        ("P_SEXE", "P_SEX"),
        ("P_ÂGE", "P_AGE"),
        ("P_SÉCURITÉ", "P_SAFE"),
        ("P_USAG", "P_USER"),
        ("V_ANNÉE", "V_YEAR"),
    ]
)

In [None]:
# Start from an empty list every time this cell runs. If the accumulator is
# only initialised in another cell, re-running this one appends every file a
# second time and silently duplicates rows in merged_df.
dfs = []

for f in files:
    # Read with explicit schema overrides
    df = pl.read_csv(
        f,
        encoding="latin1",
        null_values=NULL_CODES,
        schema_overrides=schema_overrides,
        ignore_errors=True,
    )

    # Normalise alternate header names to the canonical ones.
    df = df.rename({k: v for k, v in rename_map.items() if k in df.columns})

    # schema_overrides is keyed by canonical names, so at read time it cannot
    # match a column that only received its canonical name in the rename above.
    # Re-apply the dtypes now (non-strict: unparseable values become null) so
    # these columns have the same dtype in every file.
    df = df.with_columns(
        [
            pl.col(name).cast(dtype, strict=False)
            for name, dtype in schema_overrides.items()
            if name in df.columns
        ]
    )

    # Warn (but keep going) when a file lacks a column the later steps rely on.
    missing_critical = [c for c in critical_cols if c not in df.columns]
    if missing_critical:
        print(f" WARNING: File {f.name} is missing critical columns: {missing_critical}")

    # Keep only the wanted columns that this particular file provides.
    available_cols = [c for c in columns_to_keep if c in df.columns]
    df = df.select(available_cols)

    # P_AGE is read as a string; convert it to an integer, nulling bad values.
    if "P_AGE" in df.columns:
        df = df.with_columns(
            pl.col("P_AGE").cast(pl.Int64, strict=False)
        )

    dfs.append(df)

# "diagonal" fills columns missing from a file with nulls but raises if the same
# column was inferred with different dtypes in different files;
# "diagonal_relaxed" additionally coerces such columns to a common supertype.
merged_df = pl.concat(dfs, how="diagonal_relaxed")

In [None]:
# Summary statistics for every column of the merged frame.
# The frame is left as the cell's last expression so the notebook renders the
# whole summary itself; this does not depend on DataFrame.show(), which is only
# available in recent Polars releases.
merged_df.describe()

In [None]:
# Frequency of each C_SEV code in the merged frame.
print(merged_df.get_column("C_SEV").value_counts())

In [None]:
# Cleaned frame: restrict to severity codes 1 and 2, blank out ages >= 98,
# derive a month-level date and add readable labels.
# The labels are built with when/then/otherwise rather than
# Expr.replace(..., default=...), whose `default` argument is deprecated in
# Polars 1.x; the resulting values are the same.
df_clean = (
    merged_df
    .lazy()

    # Make sure the columns compared below are integers
    # (non-strict: values that cannot be converted become null).
    .with_columns([
        pl.col("C_SEV").cast(pl.Int64, strict=False),
        pl.col("P_AGE").cast(pl.Int64, strict=False)
    ])

    # Keep only rows whose severity code is 1 or 2.
    .filter(pl.col("C_SEV").is_in([1, 2]))

    # Ages of 98 and above are replaced with null.
    .with_columns(
        pl.when(pl.col("P_AGE") >= 98).then(None).otherwise(pl.col("P_AGE")).alias("P_AGE")
    )

    # First day of the collision's year/month.
    .with_columns(
        pl.date(pl.col("C_YEAR"), pl.col("C_MNTH"), 1).alias("Report_Date")
    )

    .with_columns([
        # 1 -> "Fatal", 2 -> "Injury", anything else -> null.
        pl.when(pl.col("C_SEV") == 1).then(pl.lit("Fatal"))
        .when(pl.col("C_SEV") == 2).then(pl.lit("Injury"))
        .otherwise(None)
        .alias("Severity_Label"),
        # "M" -> "Male", "F" -> "Female"; every other value, including "U"
        # and null, -> "Unknown".
        pl.when(pl.col("P_SEX") == "M").then(pl.lit("Male"))
        .when(pl.col("P_SEX") == "F").then(pl.lit("Female"))
        .otherwise(pl.lit("Unknown"))
        .alias("Sex_Label")
    ])
    .collect()
)

In [None]:
# Quick look at the first five cleaned rows.
print(df_clean.head(5))

In [None]:
# Number of nulls per column after cleaning.
print(df_clean.select(pl.all().null_count()))

In [None]:
# Columns treated as categorical features.
categorical_cols = [
    "C_WTHR",
    "C_RSUR",
    "C_CONF",
    "C_RCFG",
    "C_RALN",
    "V_TYPE",
    "P_USER",
    "P_PSN",
]

In [None]:
# Final frame: keep only rows that have both a month and a report date, then
# mark missing hour / weekday values with the sentinel -1.
df_final = (
    df_clean
    .filter(
        pl.col("C_MNTH").is_not_null() & pl.col("Report_Date").is_not_null()
    )
    .with_columns(pl.col("C_HOUR", "C_WDAY").fill_null(-1))
)

In [None]:
# Number of nulls per column in the final frame.
print(df_final.select(pl.all().null_count()))

In [None]:
# Persist the final frame as CSV.
# NOTE: the file lands in BRONZE_DIR, the same directory the input CSVs are
# read from.
df_final.write_csv(BRONZE_DIR.joinpath("ncdb_merged_filtered.csv"))