# Silver – DIM_PHARMACY

**Goal:** Produce a clean pharmacy dimension for the Inventory model.

- Input: `capstone.bronze.pharmacies_raw`
- Output: `capstone.silver.dim_pharmacy`
- Key steps: select → clean/standardize → generate key → deduplicate → write → validate (SQL checks)

In [0]:
%python
from pyspark.sql import functions as F

bronze_tbl = "capstone.bronze.pharmacies_raw"
silver_tbl = "capstone.silver.dim_pharmacy"

df = spark.table(bronze_tbl)

# -------------------------
# 1) Select only what we need
# -------------------------
keep = [
    "ID", "NAME", "TELEPHONE",
    "ADDRESS", "ADDRESS2",
    "CITY", "STATE", "ZIP", "COUNTY", "FIPS",
    "WEBSITE", "NPI",
    "X", "Y",
    "ingestion_timestamp", "source_system"
]

# Every column in `keep` is referenced by a later step, so a missing one
# cannot simply be skipped: fail here with a clear message instead of letting
# a later step fail on an unresolved column. Names are compared
# case-insensitively, which is how Spark resolves them by default.
available = {c.lower() for c in df.columns}
missing = [c for c in keep if c.lower() not in available]
if missing:
    raise ValueError(f"{bronze_tbl} is missing required column(s): {missing}")

df = df.select(*keep)

# -------------------------
# 2) Basic cleaning / standardization
# -------------------------
def clean_str(col):
    """Build a Column expression that trims column `col` and maps blanks to NULL.

    Args:
        col: Name of the column to clean.

    Returns:
        A Column holding the trimmed value, or NULL when the trimmed value
        is the empty string.
    """
    trimmed = F.trim(F.col(col))
    return F.when(trimmed == "", None).otherwise(trimmed)

# Target column -> cleaned expression, applied in this order.
# State is additionally upper-cased.
standardized = {
    "pharmacy_source_id": clean_str("ID"),
    "pharmacy_name": clean_str("NAME"),
    "phone": clean_str("TELEPHONE"),
    "address_line1": clean_str("ADDRESS"),
    "address_line2": clean_str("ADDRESS2"),
    "city": clean_str("CITY"),
    "state": F.upper(clean_str("STATE")),
    "county": clean_str("COUNTY"),
    "fips": clean_str("FIPS"),
    "website": clean_str("WEBSITE"),
    "npi": clean_str("NPI"),
}

for target_name, cleaned_expr in standardized.items():
    df = df.withColumn(target_name, cleaned_expr)

# ZIP clean: keep the first run of 5 digits (this cuts the +4 part of ZIP+4).
# regexp_extract returns "" (not NULL) when nothing matches, which would hide
# unparseable ZIPs from `zip5 IS NULL` checks, so "" is mapped to NULL here.
# A `when` without `otherwise` yields NULL for every non-matching row,
# including rows where ZIP itself is NULL.
zip_digits = F.regexp_extract(clean_str("ZIP"), r"(\d{5})", 1)
df = df.withColumn("zip5", F.when(zip_digits != "", zip_digits))

# Coordinates: X becomes longitude and Y becomes latitude, both cast to
# double (covers the case where they arrive as strings).
for target_name, source_name in (("longitude", "X"), ("latitude", "Y")):
    df = df.withColumn(target_name, F.col(source_name).cast("double"))

# -------------------------
# 3) Generate stable pharmacy_id (hash)
# -------------------------
# Use the source ID when it exists; otherwise fall back to a combination of
# name + address + city + state + zip.
#
# concat_ws skips NULL inputs, so without the inner coalesce two rows missing
# *different* fields (e.g. NULL address vs. NULL city) could produce the same
# string and therefore the same hash. Substituting "" for NULL keeps every
# field in its own position. Keys for rows that have a source ID, or that have
# all fallback fields populated, are unchanged by this.
# NOTE(review): rows with no ID and no fallback attributes at all still share
# a single key and will collapse to one row in the dedup step.
fallback_fields = ["pharmacy_name", "address_line1", "city", "state", "zip5"]

natural_key = F.coalesce(
    F.col("pharmacy_source_id"),
    F.concat_ws(
        "|",
        *[F.coalesce(F.col(c), F.lit("")) for c in fallback_fields]
    )
)

df = df.withColumn("pharmacy_id", F.sha2(natural_key, 256))

# -------------------------
# 4) Deduplicate (keep the latest row per pharmacy_id by ingestion_timestamp)
# -------------------------
# Import Window directly rather than reaching it through
# __import__("pyspark").sql.window, which only works when that submodule
# happens to have been imported already.
from pyspark.sql import Window

# Newest ingestion first; rows with a NULL timestamp sort last.
# NOTE(review): rows tied on ingestion_timestamp have no further ordering,
# so which of them is kept is not specified by this window.
latest_first = (
    Window
    .partitionBy("pharmacy_id")
    .orderBy(F.col("ingestion_timestamp").desc_nulls_last())
)

# Rank of each row within its pharmacy_id (1 = the row to keep).
w = F.row_number().over(latest_first)

df = (df
    .withColumn("rn", w)
    .filter(F.col("rn") == 1)
    .drop("rn")
)

# -------------------------
# 5) Final columns for silver DIM
# -------------------------
# Column order of the silver table: key, descriptive attributes, location,
# then the two lineage columns carried over from bronze.
final_columns = [
    "pharmacy_id",
    "pharmacy_source_id",
    "pharmacy_name",
    "phone",
    "address_line1",
    "address_line2",
    "city",
    "state",
    "zip5",
    "county",
    "fips",
    "website",
    "npi",
    "latitude",
    "longitude",
    "ingestion_timestamp",
    "source_system",
]

silver_df = df.select(*final_columns)

# -------------------------
# 6) Write to Silver (Delta)
# -------------------------
# Make sure the target schema exists before the table is saved.
spark.sql("CREATE SCHEMA IF NOT EXISTS capstone.silver")

# Replace the table contents with this run's result.
silver_writer = silver_df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable(silver_tbl)

# Preview the first 20 rows of what was written.
written_preview = spark.table(silver_tbl).limit(20)
display(written_preview)

In [0]:
-- Row count
SELECT COUNT(*) FROM capstone.silver.dim_pharmacy;

-- Missing key fields.
-- zip5 is produced with regexp_extract, which returns '' (not NULL) when no
-- 5-digit run is found, so an empty string is counted as missing as well.
SELECT
  SUM(CASE WHEN pharmacy_name IS NULL THEN 1 ELSE 0 END) AS missing_name,
  SUM(CASE WHEN state IS NULL THEN 1 ELSE 0 END) AS missing_state,
  SUM(CASE WHEN zip5 IS NULL OR zip5 = '' THEN 1 ELSE 0 END) AS missing_zip
FROM capstone.silver.dim_pharmacy;

-- Duplicate check: any pharmacy_id that appears more than once
SELECT pharmacy_id, COUNT(*) c
FROM capstone.silver.dim_pharmacy
GROUP BY pharmacy_id
HAVING c > 1;