In [1]:
# This cell imports dependencies and verifies project paths.

from pathlib import Path
import pandas as pd

# Sample CSV location, expressed relative to the notebook's working directory.
RAW_PATH = Path("../data/raw/driving_sample.csv")

print("Project root assumed at one level up from notebooks/")
print(f"Expecting sample file at: {RAW_PATH}")

# Stop the notebook here if the sample file is absent, so later cells
# never run against a missing dataset.
if RAW_PATH.exists():
    print("OK: sample dataset exists.")
else:
    raise FileNotFoundError("Sample dataset not found. Please run 'make download' at project root first.")


Project root assumed at one level up from notebooks/
Expecting sample file at: ../data/raw/driving_sample.csv
OK: sample dataset exists.


In [2]:
# Read the sample CSV into `df`, then report its shape, its first rows,
# and summary statistics.

df = pd.read_csv(RAW_PATH)

print("Shape (rows, cols):", df.shape)

# Each entry pairs a printed heading with the table shown beneath it.
# describe(include='all') summarises non-numeric columns as well as numeric ones.
for heading, table in (
    ("\nHead(5):", df.head()),
    ("\nDescribe():", df.describe(include='all')),
):
    print(heading)
    display(table)


Shape (rows, cols): (398, 9)

Head(5):


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino



Describe():


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398,398
unique,,,,,,,,3,305
top,,,,,,,,usa,ford pinto
freq,,,,,,,,249,6
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,,
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,,
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,,
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,,
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,,
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,,


In [3]:
# Quick data-quality checks: per-column NaN counts, then the 99th percentile
# of every numeric column.

# NaN count for each column, largest first.
missing = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:")
display(missing.to_frame("missing_count"))

# Percentiles only make sense for numeric dtypes, so select those columns first.
numeric_cols = list(df.select_dtypes(include="number").columns)
if not numeric_cols:
    print("\nNo numeric columns detected.")
else:
    print("\nNumeric columns detected:")
    print(numeric_cols)
    # The 99th percentile is a rough per-column upper bound; this cell only
    # computes the thresholds and does not flag the rows that exceed them.
    q99 = df[numeric_cols].quantile(0.99)
    print("\n99th percentile (rough upper bounds):")
    display(q99.to_frame("p99"))


Missing values per column:


Unnamed: 0,missing_count
horsepower,6
mpg,0
cylinders,0
displacement,0
weight,0
acceleration,0
model_year,0
origin,0
name,0



Numeric columns detected:
['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']

99th percentile (rough upper bounds):


Unnamed: 0,p99
mpg,43.418
cylinders,8.0
displacement,440.42
horsepower,220.45
weight,4951.03
acceleration,22.239
model_year,82.0


In [4]:
# Persist a minimal quality report to docs/.
#
# Builds a short Markdown summary of `df` (shape, total missing cells and the
# overall missing rate, and the 99th percentile of each numeric column) and
# writes it to DOCS_REPORT as UTF-8 text.

DOCS_REPORT = Path("../docs/data_quality_report.md")

lines = []
lines.append("# Data Quality Report — Phase 1\n")
lines.append("- File: data/raw/driving_sample.csv\n")
lines.append(f"- Shape: {df.shape[0]} rows × {df.shape[1]} cols\n")

# Missing summary
missing_summary = df.isna().sum()
total_cells = df.shape[0] * df.shape[1]
missing_total = int(missing_summary.sum())
# An empty frame has zero cells; report a 0.0 rate instead of dividing by zero.
missing_rate = missing_total / total_cells if total_cells else 0.0
lines.append(f"- Missing cells (total): {missing_total}\n")
lines.append(f"- Missing rate (overall): {missing_rate:.4%}\n")

# Numeric p99 snapshot
numeric_cols = df.select_dtypes(include="number").columns.tolist()
if numeric_cols:
    p99 = df[numeric_cols].quantile(0.99)
    lines.append("- 99th percentile snapshot (numeric columns):\n")
    # One nested bullet per numeric column.
    for c, v in p99.items():
        lines.append(f"  - {c}: {v}\n")
else:
    lines.append("- No numeric columns detected.\n")

# write_text() does not create missing parent directories, so make sure the
# target directory exists first; otherwise the cell fails when docs/ is absent.
DOCS_REPORT.parent.mkdir(parents=True, exist_ok=True)
DOCS_REPORT.write_text("".join(lines), encoding="utf-8")
print(f"Phase 1 quality report saved to: {DOCS_REPORT.resolve()}")


Phase 1 quality report saved to: /home/lwinds/accident-risk/docs/data_quality_report.md
