In [0]:
# Set up the lahman schema and the DBFS folders the rest of the notebook uses
raw_dir = "/FileStore/lahman/raw"      # uploaded / extracted CSV files
delta_dir = "/FileStore/lahman/delta"  # Delta table output

# Dedicated schema for the tables; also make it the session's current database
spark.sql("CREATE DATABASE IF NOT EXISTS lahman")
spark.sql("USE lahman")

# Make sure both DBFS folders exist (raw first, then delta)
for folder in (raw_dir, delta_dir):
    dbutils.fs.mkdirs(folder)

# Confirm the active database and show what is under the lahman root
print(f"Database set to: {spark.catalog.currentDatabase()}")
display(dbutils.fs.ls("/FileStore/lahman"))

In [0]:
# Show everything currently sitting in the upload folder
display(dbutils.fs.ls("/FileStore/lahman"))

# (optional) remember the DBFS path of every uploaded .zip (case-insensitive match)
zip_paths = [
    entry.path
    for entry in dbutils.fs.ls("/FileStore/lahman")
    if entry.name.lower().endswith(".zip")
]
print("ZIPs found:", zip_paths)

In [0]:
import zipfile, os

import shutil  # local to this cell: used to reset the scratch extraction folder

zip_dir   = "/tmp/lahman_zip"        # driver-local scratch folder for the extracted archive
raw_dir   = "/FileStore/lahman/raw"  # DBFS destination for the CSV files

# Uploaded archives, rewritten from "dbfs:/..." to "/dbfs/..." so that zipfile
# can open them as ordinary local files. Sorted so that "first ZIP" is the
# same archive on every run instead of depending on listing order.
zip_paths = sorted(
    f.path.replace("dbfs:", "/dbfs", 1)
    for f in dbutils.fs.ls("/FileStore/lahman")
    if f.name.lower().endswith(".zip")
)

# Prep folders. The scratch folder is emptied first: without this, CSVs left
# behind by an earlier run (possibly of a different archive) would be walked
# and copied again below.
dbutils.fs.mkdirs(raw_dir)
shutil.rmtree(zip_dir, ignore_errors=True)
os.makedirs(zip_dir, exist_ok=True)

# Extract first ZIP we find
assert zip_paths, "No ZIP found in /FileStore/lahman"
zip_path_local = zip_paths[0]
print("Extracting:", zip_path_local)

with zipfile.ZipFile(zip_path_local, "r") as zf:
    zf.extractall(zip_dir)

# Copy only .csv files into DBFS raw folder. The archive's folder structure is
# flattened: each file lands directly under raw_dir under its base name.
count = 0
for root, dirs, files in os.walk(zip_dir):
    # Archives created on macOS contain a __MACOSX tree full of "._<name>.csv"
    # resource-fork files. They end in .csv but are not CSV data, so prune the
    # tree in place (os.walk honours in-place edits of `dirs` when top-down).
    dirs[:] = [d for d in dirs if d != "__MACOSX"]
    for fn in sorted(files):
        # Skip hidden/metadata files (".DS_Store", "._foo.csv") and non-CSVs.
        if fn.startswith(".") or not fn.lower().endswith(".csv"):
            continue
        src = os.path.join(root, fn)
        dst = f"{raw_dir}/{fn}"
        # "file:" marks the source as driver-local; third argument is `recurse`.
        dbutils.fs.cp(f"file:{src}", dst, True)
        count += 1

print(f"Copied {count} CSV files to {raw_dir}")
display(dbutils.fs.ls(raw_dir))

In [0]:
import os, re
from pyspark.sql import functions as F

# Input (raw CSV) and output (Delta) folders on DBFS
raw_dir, delta_dir = "/FileStore/lahman/raw", "/FileStore/lahman/delta"

# Make sure the lahman database exists, then switch the session to it
for statement in ("CREATE DATABASE IF NOT EXISTS lahman", "USE lahman"):
    spark.sql(statement)

# Derive a table name from a CSV file name
def clean_name(s: str) -> str:
    """Turn a file name into a lowercase table name.

    The extension (if any) is stripped, then every character other than an
    ASCII letter, digit, or underscore is replaced by an underscore, and the
    result is lowercased.

    Args:
        s: File name such as "Batting.csv".

    Returns:
        The derived name, e.g. "batting".
    """
    stem, _ext = os.path.splitext(s)
    return re.sub(r'[^A-Za-z0-9_]', '_', stem).lower()

# CSV reader settings: header row, inferred column types, quoted fields that
# may span several lines (with '"' as the escape character), permissive parsing
read_opts = dict(
    header="true",
    inferSchema="true",
    multiLine="true",
    escape="\"",
    mode="PERMISSIVE",
)

# Every .csv in the raw folder becomes one Delta table; stop if there are none
files = list(
    filter(lambda entry: entry.name.lower().endswith(".csv"), dbutils.fs.ls(raw_dir))
)
assert files, f"No CSVs found in {raw_dir}"

for f in files:
    tbl = clean_name(f.name)
    src = "/".join((raw_dir, f.name))
    dst = "/".join((delta_dir, tbl))

    df = spark.read.options(**read_opts).csv(src)

    # Overwrite both data and schema so re-running the cell replaces the
    # previous load instead of failing or appending
    writer = (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    writer.save(dst)

    # Register a table over that Delta location (no-op if it already exists)
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS lahman.{tbl}
        USING DELTA
        LOCATION '{dst}'
    """)

print("Loaded tables:")
display(spark.sql("SHOW TABLES IN lahman"))

In [0]:
%sql
USE lahman;

-- The 20 batting rows with the most home runs; ties go to the later year
SELECT
    playerID,
    yearID,
    teamID,
    G,
    AB,
    H,
    HR,
    RBI
FROM batting
ORDER BY
    HR DESC,
    yearID DESC
LIMIT 20;