In [35]:
from pathlib import Path

root = Path("/content/project")
raw_dir = root / "data" / "raw"
proc_dir = root / "data" / "processed"
nb_dir = root / "notebooks"

for p in [raw_dir, proc_dir, nb_dir]:
    p.mkdir(parents=True, exist_ok=True)

print("created:", root)

created: /content/project


In [36]:
url = "http://davidcard.berkeley.edu/data_sets/njmin.zip"
zip_path = raw_dir / "njmin.zip"

!curl -L "{url}" -o "{zip_path}"
print("zip exists:", zip_path.exists(), "size_kb:", round(zip_path.stat().st_size/1024, 1))

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   178  100   178    0     0    883      0 --:--:-- --:--:-- --:--:--   885
100 23017  100 23017    0     0  43873      0 --:--:-- --:--:-- --:--:--  458k
zip exists: True size_kb: 22.5


In [37]:
!unzip -o "{zip_path}" -d "{raw_dir}"
!ls -lh "{raw_dir}"

Archive:  /content/project/data/raw/njmin.zip
  inflating: /content/project/data/raw/check.sas  
  inflating: /content/project/data/raw/codebook  
  inflating: /content/project/data/raw/public.dat  
  inflating: /content/project/data/raw/read.me  
  inflating: /content/project/data/raw/survey1.nj  
  inflating: /content/project/data/raw/survey2.nj  
total 148K
-rw-r--r-- 1 root root  14K Oct 19  1997 check.sas
-rw-r--r-- 1 root root 3.8K Oct 19  1997 codebook
-rw-r--r-- 1 root root  23K Feb 28 03:21 njmin.zip
-rw-r--r-- 1 root root  81K Oct 19  1997 public.dat
-rw-r--r-- 1 root root 1.3K Oct 19  1997 read.me
-rw-r--r-- 1 root root 5.7K Feb 12  1996 survey1.nj
-rw-r--r-- 1 root root 5.4K Feb 12  1996 survey2.nj


In [38]:
import pandas as pd

public_path = raw_dir / "public.dat"
print("public.dat exists:", public_path.exists())

# just a minimal fixed-width read to prove ingestion works
colspecs = [(0, 3), (4, 6), (7, 10)]
names = ["store_id", "state", "emp_wave1"]

df = pd.read_fwf(public_path, colspecs=colspecs, names=names)
df.head()

public.dat exists: True


Unnamed: 0,store_id,state,emp_wave1
0,46,1,0
1,49,2,0
2,506,2,0
3,56,4,0
4,61,4,0


In [39]:
out_csv = proc_dir / "public_minimal_processed.csv"
df.to_csv(out_csv, index=False)

print("saved:", out_csv)
!ls -lh "{proc_dir}"

saved: /content/project/data/processed/public_minimal_processed.csv
total 4.0K
-rw-r--r-- 1 root root 3.2K Feb 28 03:21 public_minimal_processed.csv


In [40]:
readme = f"""\
# Card & Krueger (1994) Replication â€” Phase 1

## Track
Causal Policy Track (Difference-in-Differences)

## Paper
Card, D., & Krueger, A. B. (1994). Minimum Wages and Employment: A Case Study of the Fast-Food Industry in New Jersey and Pennsylvania.

## What this repo is (Phase 1)
For Phase 1 I set up the repo structure, downloaded the original replication dataset, and verified I can load the raw fixed-width file (`public.dat`) in Python.

## Data source
{url}

## Repo structure
- data/raw: downloaded + unzipped raw files (do not overwrite)
- data/processed: small processed outputs from the code
- notebooks: analysis notebooks

## Phase 1 check
- successfully loaded `public.dat` using `pandas.read_fwf()` and displayed `head()`
- saved `data/processed/public_minimal_processed.csv`
"""

(root / "README.md").write_text(readme, encoding="utf-8")
print("README written:", root / "README.md")

README written: /content/project/README.md


In [41]:
gitignore = """\
__pycache__/
*.pyc
.ipynb_checkpoints/
.DS_Store
Thumbs.db
"""
(root / ".gitignore").write_text(gitignore, encoding="utf-8")
print(".gitignore written:", root / ".gitignore")

.gitignore written: /content/project/.gitignore


In [42]:
# This only works if your notebook file is accessible in the runtime.
# If it doesn't find it, just use the manual method (A).

import glob, shutil

candidates = glob.glob("/content/*.ipynb")
print("found:", candidates)

if candidates:
    src = candidates[0]
    dst = nb_dir / "01_Data_Cleaning.ipynb"
    shutil.copy(src, dst)
    print("copied to:", dst)
else:
    print("no ipynb found in /content. use File -> Download -> .ipynb instead.")

found: []
no ipynb found in /content. use File -> Download -> .ipynb instead.


In [43]:
from google.colab import files
zip_out = Path("/content/project_phase1_ready.zip")

!rm -f "{zip_out}"
!cd /content && zip -r project_phase1_ready.zip project > /dev/null

print("zip ready:", zip_out)
files.download(str(zip_out))

zip ready: /content/project_phase1_ready.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>