In [None]:
!pip install geopandas pyarrow shapely pyproj rtree matplotlib pandas numpy openpyxl jupyterlab seaborn plotly scipy scikit-learn pulp


In [3]:
import pandas as pd
from pathlib import Path
import numpy as np

# Creating Synthetic Fields

In [None]:
# --- CONFIG ---
N_FIELDS = 20

# Approximate bounding box for central Illinois (decimal degrees)
MIN_LAT, MAX_LAT = 39.0, 41.0
MIN_LON, MAX_LON = -90.5, -87.5

# Fixed seed: every run generates the same synthetic fields
rng = np.random.default_rng(42)

# 1) Field sizes (acres)
# Draw lognormal values, then rescale them linearly (min-max) so that the smallest
# draw becomes 40 acres and the largest becomes 120 acres. Nothing is clipped.
raw_acres = rng.lognormal(mean=4.0, sigma=0.35, size=N_FIELDS)
acre_range_in = (raw_acres.min(), raw_acres.max())
acre_range_out = (40, 120)
scaled_acres = np.interp(raw_acres, acre_range_in, acre_range_out)

# 2) Fake centroids inside the bounding box
# Latitudes are drawn before longitudes; keep this order so the seed reproduces
# the same coordinates.
lats = rng.uniform(MIN_LAT, MAX_LAT, size=N_FIELDS)
lons = rng.uniform(MIN_LON, MAX_LON, size=N_FIELDS)

# 3) Assemble the table (scalar values are broadcast to every row)
field_ids = [f"F{number:03d}" for number in range(1, N_FIELDS + 1)]
fields = pd.DataFrame(
    {
        "field_id": field_ids,
        "crop_code": 1,        # 1 = corn, consistent with CDL/NASS
        "crop_name": "CORN",
        "acres": scaled_acres,
        "centroid_lat": lats,
        "centroid_lon": lons,
        "state_fips": "17",    # Illinois
        "state_name": "Illinois",
    }
)

print(fields.head())
print("\nSummary of acres:")
print(fields["acres"].describe())

# 4) Persist for the modeling pipeline (written to the current working directory)
fields.to_csv("illinois_corn_fields_synthetic.csv", index=False)
print("\nSaved illinois_corn_fields_synthetic.csv")


In [6]:
# Relative paths to the raw input files.
# NOTE(review): despite the `df_` prefix these names hold plain path strings, not
# DataFrames. `df_harvested` is rebound to a DataFrame by a later `pd.read_csv` cell,
# which loads from an absolute path rather than from this string.
df_harvested = 'src/CORN, GRAIN ‚Äì PROGRESS, MEASURED IN PCT HARVESTED.csv'
df_fields  = 'src/illinois_corn_fields_synthetic.csv'
df_noaa_d  = 'src/noaa_il_daily_raw.csv'
df_noaa_w  = 'src/noaa_il_weekly_agg.csv'
df_labor   = 'src/no_of_worker2.csv'

In [7]:
# Load the raw NASS "pct planted" progress export.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
df_planting = pd.read_csv('/home/mak/Documents/Optimization/Project/data/raw/CORNPROGRESSMEASURED IN PCT PLANTED.csv')

In [None]:
# Call the method: the bare attribute `df_planting.info` only echoes the bound-method
# repr instead of printing the column/dtype/non-null summary.
df_planting.info()

In [None]:
# Show the dtype pandas inferred for each column of the raw planting table.
df_planting.dtypes


In [None]:
# Row/column count of the unfiltered planting table, then its first rows.
print("Raw shape:", df_planting.shape)
df_planting.head()

## Filter only Illinois & planting rows

In [8]:
# Keep only Illinois rows whose Data Item is the planting-progress series.
#  - na=False: a missing Data Item counts as "no match" instead of putting NaN into
#    the boolean mask (which boolean indexing rejects).
#  - .copy(): the cells below add columns to this frame; copying makes it an
#    independent DataFrame rather than a slice of the raw table.
df_planting = df_planting[
    (df_planting["State"] == "ILLINOIS") &
    (df_planting["Data Item"].str.contains("PCT PLANTED", na=False))
].copy()

In [None]:
# Peek at the first three rows that survived the Illinois / planting filter.
df_planting.head(3)

## 🔧 Step 2: Extract the numeric week from Period
### Your Period looks like:
### "WEEK #13"
### "WEEK #17"
### We extract the number:

In [9]:
# "WEEK #13" -> 13. expand=False makes str.extract return a Series (one capture group),
# so a plain Series is assigned to the new column.
df_planting["week"] = df_planting["Period"].str.extract(r"(\d+)", expand=False).astype(int)

# Inspect the element type. Use positional access: `[0]` is a *label* lookup and
# raises KeyError whenever the row labelled 0 has been filtered out.
type(df_planting["week"].iloc[0])

numpy.int64

## 🔧 Step 3: Convert Week Ending → datetime

In [10]:
# Parse the "Week Ending" strings into pandas timestamps.
df_planting["week_ending"] = pd.to_datetime(df_planting["Week Ending"])

# Positional access: `[0]` would be a label lookup that fails if label 0 was filtered out.
type(df_planting["week_ending"].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [11]:
# Confirm the new `week` and `week_ending` columns were appended.
df_planting.head(2)

Unnamed: 0,Program,Year,Period,Week Ending,Geo Level,State,State ANSI,Ag District,Ag District Code,County,...,watershed_code,Watershed,Commodity,Data Item,Domain,Domain Category,Value,CV (%),week,week_ending
0,SURVEY,2025,WEEK #13,2025-03-30,STATE,ILLINOIS,17,,,,...,0,,CORN,"CORN - PROGRESS, MEASURED IN PCT PLANTED",TOTAL,NOT SPECIFIED,0,,13,2025-03-30
1,SURVEY,2025,WEEK #14,2025-04-06,STATE,ILLINOIS,17,,,,...,0,,CORN,"CORN - PROGRESS, MEASURED IN PCT PLANTED",TOTAL,NOT SPECIFIED,0,,14,2025-04-06


## 🔧 Step 4: Rename Value → pct_planted

In [12]:
# Numeric copy of "Value". to_numeric(errors="coerce") mirrors how the harvest table is
# parsed further down: non-numeric placeholders become NaN instead of raising.
# The trailing astype(float) keeps the column float64 even when every value is whole.
df_planting["pct_planted"] = pd.to_numeric(df_planting["Value"], errors="coerce").astype(float)

# Positional access: `[0]` would be a label lookup that fails if label 0 was filtered out.
type(df_planting["pct_planted"].iloc[0])

numpy.float64

## 🔧 Step 5: Keep only the needed columns

In [13]:
# Trim to the four modeling columns and order chronologically (year, then week).
planting_columns = ["Year", "week", "week_ending", "pct_planted"]
df_planting_clean = df_planting.loc[:, planting_columns].sort_values(by=["Year", "week"])
df_planting_clean.head()


Unnamed: 0,Year,week,week_ending,pct_planted
194,2005,15,2005-04-17,35.0
195,2005,16,2005-04-24,64.0
196,2005,17,2005-05-01,82.0
197,2005,18,2005-05-08,94.0
198,2005,19,2005-05-15,98.0


## 🌾 5. Compute planting windows (per-year)

### Planting window logic:
### Start: first week where pct_planted > 1
### End: first week where pct_planted ≥ 90

In [None]:
# One record per year: the first week planting exceeds 1 % and the first week it
# reaches 90 %. `.min()` on an empty selection yields NaN, so a year that never
# crosses a threshold keeps a missing value for that bound.
windows = []
for year, year_rows in df_planting_clean.groupby("Year"):
    by_week = year_rows.sort_values("week")
    weeks_started = by_week.loc[by_week["pct_planted"] > 1, "week"]
    weeks_done = by_week.loc[by_week["pct_planted"] >= 90, "week"]
    windows.append(
        {
            "Year": year,
            "plant_start_week": weeks_started.min(),
            "plant_end_week": weeks_done.min(),
        }
    )

plantingWindows_df = pd.DataFrame(windows).sort_values("Year")
plantingWindows_df


## 🌽 6. Compute typical (median) planting window
## This is what the optimization model will use:

In [15]:
# Median start/end week across all years, truncated to whole week numbers.
# NOTE(review): `typical_start` / `typical_end` are reassigned by the harvest section
# further down, so these planting values are lost once that cell runs.
planting_medians = plantingWindows_df[["plant_start_week", "plant_end_week"]].median()
typical_start = int(planting_medians["plant_start_week"])
typical_end = int(planting_medians["plant_end_week"])

print("Typical window:", typical_start, "to", typical_end)


Typical window: 16 to 21


In [None]:
# Load the raw NASS "pct harvested" progress export; this rebinds `df_harvested`,
# which an earlier cell had set to a path string.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
df_harvested = pd.read_csv('/home/mak/Documents/Optimization/Project/data/raw/CORN, GRAIN ‚Äì PROGRESS, MEASURED IN PCT HARVESTED.csv')
print(df_harvested.shape)
df_harvested.head(3)

## Filter to only Illinois & “PCT HARVESTED”

In [None]:
# Keep only Illinois rows whose Data Item is the harvest-progress series.
#  - na=False: a missing Data Item counts as "no match" instead of putting NaN into
#    the boolean mask (which boolean indexing rejects).
#  - .copy(): the cells below add columns to this frame; copying makes it an
#    independent DataFrame rather than a slice of the raw table.
df_harvested = df_harvested[
    (df_harvested["State"] == "ILLINOIS") &
    (df_harvested["Data Item"].str.contains("PCT HARVESTED", na=False))
].copy()

print("After filter:", df_harvested.shape)
df_harvested.head(3)

## ✅ 3. Extract week number from “Period”
### The Period column looks like "WEEK #41".
### We extract the number:

In [18]:
# ---- Extract week number from Period "WEEK #xx" ----
# expand=False makes str.extract return a Series for the single capture group.
df_harvested["week"] = df_harvested["Period"].str.extract(r'(\d+)', expand=False).astype(int)

# Positional access: `[0]` is a label lookup and fails if label 0 was filtered out.
type(df_harvested["week"].iloc[0])


numpy.int64

## ✅ 4. Convert week-ending date

In [19]:
# ---- Convert week ending to datetime ----
# errors='coerce': unparseable dates become NaT instead of raising.
df_harvested["week_ending"] = pd.to_datetime(df_harvested["Week Ending"], errors='coerce')

# Positional access: `[0]` is a label lookup and fails if label 0 was filtered out.
type(df_harvested["week_ending"].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

## ✅ 5. Rename Value → pct_harvested

In [20]:
# ---- Clean Value column ----
# errors='coerce': non-numeric placeholders become NaN instead of raising.
df_harvested["pct_harvested"] = pd.to_numeric(df_harvested["Value"], errors='coerce')

# Positional access: `[0]` is a label lookup and fails if label 0 was filtered out.
type(df_harvested["pct_harvested"].iloc[0])

numpy.int64

## ✅ 6. Keep only the needed columns

In [None]:
# ---- Select only useful columns ----
# Same shape as the planting table: four columns, chronological order, fresh 0..n-1 index.
harvest_columns = ["Year", "week", "week_ending", "pct_harvested"]
df_harvested_clean = (
    df_harvested.loc[:, harvest_columns]
    .sort_values(by=["Year", "week"])
    .reset_index(drop=True)
)

print(df_harvested_clean.head())

## 🌽 8. Compute harvest windows (per-year)
### We define the harvest window using typical agricultural logic:
### Start = first week with >1% harvested
### End = first week with ≥90% harvested

In [None]:
# ---- Compute harvest windows ----
# One record per year: the first week harvest exceeds 1 % and the first week it
# reaches 90 %. `.min()` on an empty selection yields NaN, so a year that never
# reaches 90 % keeps a missing end week (which also makes that column float).
# The per-year debug print of (start, end) was dropped; the table below shows the same values.
harvest_windows = []

for year, g in df_harvested_clean.groupby("Year"):
    g = g.sort_values("week")

    start = g.loc[g["pct_harvested"] > 1, "week"].min()
    end = g.loc[g["pct_harvested"] >= 90, "week"].min()

    harvest_windows.append({
        "Year": year,
        "harvest_start_week": start,
        "harvest_end_week": end
    })

harvest_windows_df = pd.DataFrame(harvest_windows).sort_values("Year")
print(harvest_windows_df.head())

In [None]:
# Sanity check: each year should appear exactly once in the windows table.
harvest_windows_df['Year'].value_counts()

## 🌽 9. Compute the typical (median) harvest window

In [None]:
# ---- Compute typical window ----
# Median start/end week across years (NaN end weeks are skipped by median),
# truncated to whole week numbers.
# NOTE(review): this rebinds `typical_start` / `typical_end`, overwriting the planting
# values computed earlier under the same names.
harvest_medians = harvest_windows_df[["harvest_start_week", "harvest_end_week"]].median()
typical_start = int(harvest_medians["harvest_start_week"])
typical_end = int(harvest_medians["harvest_end_week"])

print("Typical harvest window:")
print(f"Start week ‚âà {typical_start}")
print(f"End week   ‚âà {typical_end}")

In [25]:
# Clean planting NASS: drop rows without a week number, then force `week` to int
df_planting_clean = (
    df_planting_clean
    .dropna(subset=["week"])
    .assign(week=lambda frame: frame["week"].astype(int))
)

# Clean harvest NASS: same treatment
df_harvested_clean = (
    df_harvested_clean
    .dropna(subset=["week"])
    .assign(week=lambda frame: frame["week"].astype(int))
)


### Loading Corn fields Synthetic

In [26]:
# Reload the synthetic fields from disk; this rebinds `fields`.
# NOTE(review): absolute, machine-specific path. The generation cell at the top writes
# `illinois_corn_fields_synthetic.csv` to the working directory, not to this location —
# confirm the file here is the same one.
fields  = pd.read_csv('/home/mak/Documents/Optimization/Project/data/raw/illinois_corn_fields_synthetic.csv')

In [None]:
# First three rows of the loaded fields table.
fields.head(3)

### 🎯 What you get from the harvest cleaning above
####  1. Cleaned weekly harvest table

#### Columns:
#### Year | week | week_ending | pct_harvested

### 2. Harvest window table
#### Columns:
#### Year | harvest_start_week | harvest_end_week

### 3. Typical harvest window (median across years)
#### Something like:

#### Start ≈ week 36
#### End   ≈ week 44
#### This will drive the harvest constraints in your MILP model.

In [None]:
print("Raw shape:", fields.shape)
print(fields.head())

# 1. Restrict to corn (case-insensitive match on crop_name). Currently every row is
#    CORN, so this is a guard for future inputs. Rows with a missing crop_name are dropped.
is_corn = fields["crop_name"].str.upper().eq("CORN")
fields = fields.loc[is_corn].copy()

In [None]:
# First three rows after the corn-only filter.
fields.head(3)
# (alternative check) fields["acres"].value_counts()

In [30]:
# 2. Sanity filter: keep only fields with a strictly positive acreage
#    (rows with missing acres fail the comparison and are dropped too).
has_area = fields["acres"].gt(0)
fields = fields.loc[has_area].copy()

# 3. Latitude series used in a later cell to bucket fields into
#    South / Central / North Illinois. Thresholds live in `assign_region`.
lat = fields["centroid_lat"]

In [31]:
def assign_region(lat_val):
    """Bucket a latitude (decimal degrees) into an Illinois region label.

    Returns "South" below 39.7, "Central" from 39.7 up to (not including) 40.3,
    and "North" otherwise. A NaN latitude fails both comparisons and therefore
    also lands in "North".
    """
    if lat_val < 39.7:
        return "South"
    return "Central" if lat_val < 40.3 else "North"

In [None]:
# Label every field with its region, based on the centroid latitude
fields["region"] = lat.map(assign_region)

# 4. Keep just the modeling columns, ordered by field_id with a fresh index
modeling_columns = ["field_id", "acres", "centroid_lat", "centroid_lon", "region"]
fields_clean = (
    fields.loc[:, modeling_columns]
    .sort_values(by="field_id")
    .reset_index(drop=True)
)

print(fields_clean.head())

### 3️⃣ How this feeds each process

### Optimization (Gurobi MILP)

### field_id → set of decision variables Plant[f,w], Harvest[f,w]

### acres → used in capacity and labor constraints:
### sum_f Plant[f,w] * acres[f] ≤ capacity[w]

### Weather & regional extensions (optional later)
### region + (centroid_lat, centroid_lon) let you:
### join to regional weather series,
### model different planting windows by region, etc.

### Visualization
### You can color Gantt bars or maps by region.

### 1️⃣ Understand what’s in the NOAA files
### From your files:
### noaa_il_daily_raw.csv
### Columns:
### station – station ID (e.g. GHCND:US1ILBN0014)

### date – "YYYY-MM-DD"
### PRCP – daily precipitation (inches)
### AWND – average daily wind speed (mph)
### TAVG – avg temperature (°F)
### TMAX – max temp (°F)
### TMIN – min temp (°F)
### noaa_il_weekly_agg.csv

### Columns:

### year – int
### week – ISO week number (1–53; some years have a week 53)
### PRCP – average daily precip over that week (inches/day)
### TMAX, TMIN, TAVG – average of daily values
### AWND – average wind
### We’ll turn the weekly file into a capacity table for the MILP.

## 2️⃣ Clean daily file (mostly for completeness / possible EDA)

In [35]:
# NOTE(review): this cell held a bare 32-character string that looks like an API token
# (possibly for the NOAA data service). It has been removed from the notebook; if it was
# a real credential, revoke it and load the replacement from an environment variable.

In [33]:
# Load the raw daily NOAA station observations.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
daily_path = Path("/home/mak/Documents/Optimization/Project/data/raw/noaa_il_daily_raw.csv")
daily = pd.read_csv(daily_path)

In [None]:
# Eyeball the first 30 daily rows.
daily.head(30)
# (alternative checks) daily.shape
# (alternative checks) daily['PRCP'].value_counts()

In [None]:
# Fill missing TAVG with the midpoint of that row's TMAX and TMIN.
# Only NaN counts as "bad". Zeros are left alone because 0 can be a real temperature;
# if zeros turn out to be placeholders, widen the mask with `| (daily["TAVG"] == 0)`.
# NOTE(review): this runs before the cell that coerces the weather columns to numeric,
# so it assumes TMAX/TMIN were already parsed as numbers by read_csv.
mask_bad_tavg = daily["TAVG"].isna()

# The row-wise mean skips NaN: if only one of TMAX/TMIN is present, that value is used.
tavg_fallback = daily.loc[mask_bad_tavg, ["TMAX", "TMIN"]].mean(axis=1)
daily.loc[mask_bad_tavg, "TAVG"] = tavg_fallback

daily['TAVG'].value_counts()

In [35]:
# Count the TAVG values still missing after the fill (the recorded output is 0).
daily['TAVG'].isna().sum()

np.int64(0)

In [36]:
# Parse the "date" strings into pandas timestamps (needed for the .dt accessors used later)
daily["date"] = pd.to_datetime(daily["date"])

In [37]:
# Force the weather measurements to numeric dtype; anything unparseable becomes NaN
weather_columns = ["PRCP", "AWND", "TAVG", "TMAX", "TMIN"]
daily[weather_columns] = daily[weather_columns].apply(pd.to_numeric, errors="coerce")

In [38]:
# Optional: drop rows that are completely missing weather.
# how="all": a row is removed only when PRCP, TMAX and TMIN are ALL NaN.
daily = daily.dropna(subset=["PRCP", "TMAX", "TMIN"], how="all").reset_index(drop=True)

In [39]:
# Only the last expression of a cell is displayed, so the head(3) result is discarded
# and the cell shows just the shape.
daily.head(3)
daily.shape

(14608, 7)

In [40]:
# Idempotent re-parse (the column is already datetime if the earlier cell ran).
daily['date'] = pd.to_datetime(daily['date'])

# Take BOTH year and week from the ISO calendar. ISO week 1 can start in late December
# and ISO week 52/53 can run into early January, so pairing the ISO week with the plain
# calendar year (`dt.year`) mislabels those boundary days — e.g. 2016-01-01 would become
# (2016, 53) although it belongs to ISO week 53 of 2015.
iso_calendar = daily['date'].dt.isocalendar()
daily['year'] = iso_calendar['year'].astype(int)
daily['week'] = iso_calendar['week'].astype(int)


## 3️⃣ Clean + enhance weekly file (this is the important one)
## We’ll:
## Make sure types are right
## Compute weekly precipitation (inches/week) from the average daily PRCP
## Add a capacity factor (0–1) that we’ll use as a multiplier on planter/harvester capacity
## Optionally add a simple rain category for plotting / analysis

In [41]:
# Collapse the daily station rows to one row per (year, week).
#
# PRCP is averaged, not summed: `daily` holds one row per station per day, so a plain
# sum over the group would add the same day's rain once per reporting station.
# The weekly total is then approximated as mean daily precipitation * 7. This is the
# same definition the cells below apply to the weekly file (`PRCP * 7`), and it keeps
# the `PRCP` column those cells expect.
weekly = (
    daily
    .groupby(["year", "week"], as_index=False)
    .agg(
        PRCP=("PRCP", "mean"),
        TMAX=("TMAX", "mean"),
        TMIN=("TMIN", "mean"),
        TAVG=("TAVG", "mean"),
        AWND=("AWND", "mean"),
    )
    .assign(prcp_week_in=lambda frame: frame["PRCP"] * 7)
)

# NOTE(review): this writes to data/processed/, while the next cell reads a file of the
# same name from data/raw/ — confirm which of the two is meant to feed the pipeline.
weekly.to_csv('/home/mak/Documents/Optimization/Project/data/processed/noaa_il_weekly_agg.csv', index=False)

In [42]:
# Load the weekly aggregate from data/raw/; this rebinds `weekly`, discarding the frame
# aggregated from `daily` in the previous cell (which was saved under data/processed/).
# NOTE(review): absolute, machine-specific path.
weekly_path = Path("/home/mak/Documents/Optimization/Project/data/raw/noaa_il_weekly_agg.csv")
weekly = pd.read_csv(weekly_path)

In [None]:
# Shape and first 30 rows of the weekly table as loaded.
print("Raw weekly shape:", weekly.shape)
print(weekly.head(30))

In [44]:
# ---- Basic type cleanup ----
# Join keys as integers; measurement columns as numbers (unparseable -> NaN).
weekly = weekly.astype({"year": int, "week": int})

measurement_columns = ["PRCP", "TMAX", "TMIN", "TAVG", "AWND"]
weekly[measurement_columns] = weekly[measurement_columns].apply(pd.to_numeric, errors="coerce")

In [None]:
# Approximate the weekly rain total (inches/week) as average daily precipitation * 7.
weekly["prcp_week_in"] = weekly["PRCP"] * 7
weekly["prcp_week_in"].head(3)

In [None]:
# Only the last expression is displayed: the head is printed explicitly, the `columns`
# result is discarded, and the cell output is the shape.
print(weekly.head(3))
weekly.columns
weekly.shape

In [47]:
# ---- Capacity factor derived from total weekly rain ----
# Input is the weekly precipitation total in inches (`prcp_week_in`), which the
# notebook approximates as average daily PRCP * 7.
def capacity_from_rain(prcp_week):
    """Map a weekly rain total (inches) to a field-work capacity multiplier.

    Heuristic steps:
      - missing value  -> 0.8 (neutral default)
      - exactly 0.0 in -> 1.0 (dry week, full capacity)
      - below 0.5 in   -> 0.9 (light rain)
      - below 1.5 in   -> 0.7 (moderate rain)
      - below 3.0 in   -> 0.4 (heavy rain)
      - 3.0 in or more -> 0.2 (very wet)

    Each upper bound is exclusive, so a total of exactly 0.5 falls in the 0.7 step.
    """
    if pd.isna(prcp_week):
        return 0.8  # neutral if missing
    if prcp_week == 0:
        return 1.0

    # (exclusive upper bound in inches, capacity factor), checked in ascending order
    steps = ((0.5, 0.9), (1.5, 0.7), (3.0, 0.4))
    for upper_bound, factor in steps:
        if prcp_week < upper_bound:
            return factor
    return 0.2

In [48]:
# One capacity multiplier per week, derived from that week's rain total.
weekly["capacity_factor"] = weekly["prcp_week_in"].apply(capacity_from_rain)

# ---- Optional: rain category for plotting ----
def rain_bucket(prcp_week):
    """Label a weekly rain total (inches) with a coarse category.

    Uses the same break points as `capacity_from_rain`: missing -> "missing",
    exactly 0 -> "dry", below 0.5 -> "light", below 1.5 -> "moderate",
    below 3.0 -> "heavy", otherwise "very_heavy". Upper bounds are exclusive.
    """
    if pd.isna(prcp_week):
        return "missing"
    if prcp_week == 0:
        return "dry"

    # (exclusive upper bound in inches, label), checked in ascending order
    labelled_limits = ((0.5, "light"), (1.5, "moderate"), (3.0, "heavy"))
    return next(
        (label for limit, label in labelled_limits if prcp_week < limit),
        "very_heavy",
    )

# One rain label per week, for plots and summaries.
weekly["rain_category"] = weekly["prcp_week_in"].apply(rain_bucket)

In [None]:
# How many weeks fall into each rain category.
weekly["rain_category"].value_counts()

In [50]:
# ---- Keep only the columns we actually need for modeling ----
# Chronological order, fresh 0..n-1 index. The raw `PRCP` column is dropped here;
# `prcp_week_in` carries the rain signal from this point on.
weekly_model_columns = [
    "year", "week",
    "prcp_week_in", "TMAX", "TMIN", "TAVG", "AWND",
    "capacity_factor", "rain_category",
]
weekly_clean = (
    weekly.loc[:, weekly_model_columns]
    .sort_values(by=["year", "week"])
    .reset_index(drop=True)
)

In [None]:
# Display the cleaned weekly table (pandas truncates long frames in the output).
weekly_clean

### 4️⃣ How this NOAA weekly table will be used later
### In the MILP (Gurobi)
### When we define weekly capacity constraints, we’ll do something like:
### Example parameters (you’ll define them when we build the model)
### base_planter_capacity = 600  # acres/week
### base_harvester_capacity = 800  # acres/week
### For each week w:
### eff_planter_cap[w]  = base_planter_capacity  * capacity_factor[w]
### eff_harvester_cap[w] = base_harvester_capacity * capacity_factor[w]
### Then constraints:
### sum_f Plant[f,w]   * area[f] <= eff_planter_cap[w]
### sum_f Harvest[f,w] * area[f] <= eff_harvester_cap[w]
### So rainier weeks literally shrink capacity in the optimization model.
### In Monte Carlo
### We’ll fit distributions to prcp_week_in (or to capacity_factor) over all years.
### For each simulation, we’ll draw a random capacity_factor[w] trajectory for the season.
### Then re-run the MILP under that scenario.
### In Forecasting
### Weather features for the ML model:
### prcp_week_in
### TAVG, TMAX, TMIN
### maybe lagged versions (last week’s rain, temp, etc.)
### Targets:
### % planted / % harvested for that week.
### If this looks good, NOAA is done ✅
### Next we can clean the labor dataset (no_of_worker2.csv), which will let us add labor-hour constraints to the MILP.

## ✅ 1. What’s inside your labor dataset
### From the preview, the important columns are:
### Year → 2017, 2022 (USDA Census dataset gives 5-year snapshots)

### Data Item
### Examples:
### "LABOR, HIRED - NUMBER OF WORKERS"
### "LABOR, HIRED, GE 150 DAYS - NUMBER OF WORKERS"
### "LABOR, HIRED, LT 150 DAYS - NUMBER OF WORKERS"

### Domain Category
### Grouping:

### (1 TO 4 HIRED WORKERS)
### (5 TO 9)
### (10 OR MORE)
### etc.

### Value
### A STRING with commas: "29,321"

### The rest of the columns are irrelevant (Ag District, Week Ending, County, etc. → all NaN).
## 🎯 2. What we want from this dataset
### We need a single number per year:
### 🔹 total hired workers
### = sum of all "Value" rows for that year.
### Then for operations modeling:
### 🔹 Estimate weekly labor-hours
### Assume:
### full-time equivalent = 40 hours/week

### seasonal workers available mostly during planting + harvest
### We will generate:
### year | total_workers | avg_weekly_labor_hours

### If total workers = 29,321 (example), then:
### avg_weekly_labor_hours = total_workers * 40  = 1,172,840 hours/week
### We can later reshape this into week-level labor availability for the MILP and Monte Carlo.

## 🔧 3. Clean the dataset (copy & paste code)
### We transform Value, group by Year, compute totals.

In [52]:
# Load the raw USDA hired-labor export.
# NOTE(review): absolute, machine-specific path; `df` is a very generic name that the
# save loop near the end of the notebook reuses as its loop variable.
df = pd.read_csv('/home/mak/Documents/Optimization/Project/data/raw/no_of_worker2.csv')

In [None]:
# Keep the rows whose Data Item mentions "LABOR" (case-insensitive).
#  - na=False: a missing Data Item counts as "no match" instead of putting NaN into
#    the boolean mask (which boolean indexing rejects).
#  - .copy(): the next cell rewrites the "Value" column; copying makes this an
#    independent DataFrame rather than a slice of the raw table.
df = df[df["Data Item"].str.contains("LABOR", case=False, na=False)].copy()
df.head(3)

In [54]:
# ---- Clean Value: strip thousands separators, then parse as a number ----
# "29,321" -> 29321; anything that still fails to parse becomes NaN.
value_without_commas = df["Value"].astype(str).str.replace(",", "")
df["Value"] = pd.to_numeric(value_without_commas, errors="coerce")

In [55]:
# ---- Group per year ----
# Sum every remaining "Value" within a year and call the result total_workers.
# NOTE(review): the upstream filter keeps every Data Item containing "LABOR". If those
# rows include both an all-workers series and its subsets (e.g. the GE/LT 150 days
# items), this sum counts the same workers more than once — confirm against the raw file.
labor_by_year = (
    df.groupby("Year", as_index=False)["Value"]
    .sum()
    .rename(columns={"Value": "total_workers"})
)
print(labor_by_year)

   Year  total_workers
0  2017         168597
1  2022         148014


## 🔧 4. Convert workers → weekly labor hours

## We assume:
## 40 hours/week per worker
## Opportunity to allocate more hours in peak weeks

In [56]:
# Weekly labor hours if every counted worker puts in 40 hours per week.
labor_by_year["weekly_labor_hours"] = labor_by_year["total_workers"] * 40
labor_by_year

Unnamed: 0,Year,total_workers,weekly_labor_hours
0,2017,168597,6743880
1,2022,148014,5920560


## 🔥 6. Convert yearly → weekly labor availability

## Since NASS/NOAA data is week-based, we create a weekly labor table.
## We need a simple distribution strategy:

## Option A (simple):
## Labor available evenly across all weeks.
## Option B (better for realism):

## Planting-heavy (weeks 16–22): 120% of average
## Harvest-heavy (weeks 35–44): 150% of average
## Other weeks: 75% of average

## Let’s implement Option B.

In [57]:
def build_weekly_labor(year, total_workers, hours_per_worker=40, n_weeks=52):
    """Spread one year's workforce over the weeks of that year as labor hours.

    Parameters
    ----------
    year : int
        Year label copied into every output row.
    total_workers : int or float
        Number of workers counted for that year.
    hours_per_worker : int or float, default 40
        Hours each worker is assumed to supply in an average week.
    n_weeks : int, default 52
        Number of weeks to generate (weeks 1..n_weeks). The default never emits
        week 53, so 53-week ISO years need ``n_weeks=53`` to be fully covered.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``week``, ``labor_hours``; one row per week. With
        ``n_weeks=0`` the frame is empty but still has these three columns.

    Seasonal multipliers applied to the weekly base (workers * hours):
    weeks 16-22 (planting) 1.2, weeks 35-44 (harvest) 1.5, all other weeks 0.75.
    """
    base = total_workers * hours_per_worker  # average weekly hours before seasonality

    def _season_multiplier(week):
        """Multiplier for a week number (bounds inclusive)."""
        if 16 <= week <= 22:       # planting season
            return 1.2
        if 35 <= week <= 44:       # harvest season
            return 1.5
        return 0.75

    records = [
        {
            "year": year,
            "week": week,
            "labor_hours": base * _season_multiplier(week),
        }
        for week in range(1, n_weeks + 1)
    ]

    # Explicit columns keep the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=["year", "week", "labor_hours"])

In [58]:
# One weekly table per census year, stacked on top of each other.
# The index is not reset, so each year's rows keep their own 0..51 labels.
yearly_labor_frames = [
    build_weekly_labor(census_year, worker_count)
    for census_year, worker_count in zip(
        labor_by_year["Year"], labor_by_year["total_workers"]
    )
]
weekly_labor = pd.concat(yearly_labor_frames)

In [59]:
# First three weeks of the stacked labor table.
weekly_labor.head(3)

Unnamed: 0,year,week,labor_hours
0,2017,1,5057910.0
1,2017,2,5057910.0
2,2017,3,5057910.0


## 📌 7. How labor dataset integrates into your pipeline
## ✔ Optimization (Gurobi)

## This provides the weekly constraint:

## sum_f Plant[f,w] * area[f] * labor_per_acre ≤ labor_hours[w]


## and similar for Harvest.
## ✔ Monte Carlo Simulation
## Labor availability is perturbed:
## labor_hours_sim[w] = labor_hours[w] * uniform(0.85, 1.15)
## ✔ Forecasting
## Use labor as a predictive feature:
## total labor available
## planting-week labor intensity
## ✔ Visualization
## Labor heatmap across weeks.

## 🧩 The Master Table Builder

In [60]:
# ---- 1. Merge weather + labor on year/week ----
# Left join: every weather week is kept; weeks without a labor row get NaN labor_hours.
weekly_master = pd.merge(
    weekly_clean,
    weekly_labor,
    how="left",
    on=["year", "week"],
)

In [61]:
# ---- 2. Attach planting windows (per year) ----
# The window table is keyed by "Year"; rename it to match the master table's "year".
# Re-running this cell on its own adds suffixed duplicate columns, so rebuild
# weekly_master from step 1 first.
planting_windows_by_year = plantingWindows_df.rename(columns={"Year": "year"})
weekly_master = pd.merge(
    weekly_master,
    planting_windows_by_year,
    how="left",
    on="year",
)

In [62]:
# ---- 3. Attach harvest windows (per year) ----
# The window table is keyed by "Year"; rename it to match the master table's "year".
# Re-running this cell on its own adds suffixed duplicate columns, so rebuild
# weekly_master from step 1 first.
harvest_windows_by_year = harvest_windows_df.rename(columns={"Year": "year"})
weekly_master = pd.merge(
    weekly_master,
    harvest_windows_by_year,
    how="left",
    on="year",
)

In [63]:
# ---- 4. Create window flags ----
# True when the week lies inside that year's planting window (both ends inclusive).
# A missing start or end week makes the flag False.
weekly_master["is_plant_window"] = weekly_master["week"].between(
    weekly_master["plant_start_week"],
    weekly_master["plant_end_week"],
)

In [64]:
# True when the week lies inside that year's harvest window (both ends inclusive).
# A missing start or end week makes the flag False.
weekly_master["is_harvest_window"] = weekly_master["week"].between(
    weekly_master["harvest_start_week"],
    weekly_master["harvest_end_week"],
)

In [65]:
# Optional: fill labor_hours for weeks with no census year (if any) with 0 or a default.
# Here 0 is a placeholder for "no labor data", not a real value: a later cell turns the
# zeros back into NaN and fills them from the census years.
weekly_master["labor_hours"] = weekly_master["labor_hours"].fillna(0)
# ---- 5. Sort & inspect ----
# Chronological order matters: the later ffill/bfill step depends on it.
weekly_master = weekly_master.sort_values(["year", "week"]).reset_index(drop=True)
weekly_master.head()

Unnamed: 0,year,week,prcp_week_in,TMAX,TMIN,TAVG,AWND,capacity_factor,rain_category,labor_hours,plant_start_week,plant_end_week,harvest_start_week,harvest_end_week,is_plant_window,is_harvest_window
0,2015,1,1.139524,34.697368,17.697368,27.45,9.190909,0.7,moderate,0.0,16,20,36,43.0,False,False
1,2015,2,0.281357,18.744361,0.556391,10.314286,11.903896,0.9,light,0.0,16,20,36,43.0,False,False
2,2015,3,0.147626,33.541353,13.80303,25.2,8.815584,0.9,light,0.0,16,20,36,43.0,False,False
3,2015,4,0.130556,39.533835,26.900763,34.142857,6.849351,0.9,light,0.0,16,20,36,43.0,False,False
4,2015,5,0.802372,35.639098,21.293233,30.571429,9.894805,0.7,moderate,0.0,16,20,36,43.0,False,False


## This is correct because:
## ✔ Weeks 1–15 → not planting
## You set:
plant_start_week = 16
plant_end_week = 20

So week 1 ‚Üí is_plant_window = False.

✔ Weeks 1–35 → not harvest

You set:

harvest_start_week = 36

harvest_end_week = 43

So week 1 ‚Üí is_harvest_window = False.

✔ Rain category + capacity factor work

Week 1 has ~1.14 inches of rain ‚Üí moderate rain ‚Üí capacity factor = 0.7.
Correct.

✔ Labor hours = 0

Because your labor dataset likely only has 2017 & 2022 data.
Other years defaulted to 0 (we can fix this if needed, see below).

Everything is working.

⚠️ 2. Important fix: labor_hours = 0 for all years except census years

Right now:

USDA labor census only exists for 2017 and 2022

All other years show 0 labor-hours

For the MILP, you want non-zero labor for all weeks.

You have 2 good options:

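Option A — Fill from the census years (forward-fill first, then back-fill)

This is realistic.

Example:

Years 2015, 2016 → use 2017 labor numbers (back-fill, because no earlier census year exists)

Years 2018, 2019, 2020, 2021 → carry the 2017 numbers forward (the code below forward-fills before it back-fills, so 2022 is not used for these years)

Years after 2022 → keep 2022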

In [66]:
# Make sure labor_hours is float so it can hold NaN
# (the next cell swaps the 0 placeholders for NaN before filling them).
weekly_master["labor_hours"] = weekly_master["labor_hours"].astype(float)

In [67]:
# Step 1 - for each week number, borrow the census-year hours for the other years.
# The 0 placeholders become NaN, then are forward-filled and finally back-filled along
# the rows of that week (rows are in year order because weekly_master was sorted above).
labor_filled_by_week = (
    weekly_master
    .groupby("week")["labor_hours"]
    .transform(lambda s: s.replace(0, np.nan).ffill().bfill())
)
weekly_master["labor_hours"] = labor_filled_by_week

# Step 2 - week 53 never appears in the labor table (it is built for weeks 1..52 only),
# so step 1 leaves it all-NaN. Carry the preceding week of the same year forward; both
# are off-season weeks with the same multiplier.
weekly_master["labor_hours"] = weekly_master.groupby("year")["labor_hours"].ffill()

In [None]:
# weekly_master.head(60)

In [68]:
# Rows (weeks) per year in the master table; the recorded output shows 52 or 53 for
# complete years and 44 for 2025.
weekly_master['year'].value_counts()

year
2015    53
2016    53
2021    53
2020    53
2017    52
2019    52
2018    52
2022    52
2023    52
2024    52
2025    44
Name: count, dtype: int64

In [81]:
from pathlib import Path

# Create directories if not exist
processed_dir = Path("data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
Path("data/raw").mkdir(parents=True, exist_ok=True)

# Save processed datasets: output file name -> DataFrame to write
save_map = {
    "nass_corn_planting_weekly_clean.csv": df_planting_clean,
    "nass_corn_planting_windows.csv": plantingWindows_df,
    "nass_corn_harvest_weekly_clean.csv": df_harvested_clean,
    "nass_corn_harvest_windows.csv": harvest_windows_df,
    "illinois_corn_fields_clean.csv": fields_clean,
    "noaa_il_daily_clean.csv": daily,
    "noaa_il_weekly_clean.csv": weekly_clean,
    "labor_illinois_yearly_clean.csv": labor_by_year,
    "labor_weekly_capacity_clean.csv": weekly_labor,

    # "master_weekly_table.csv": weekly_master  (written by a later cell instead)
}

# The loop variable is deliberately not called `df`: that name holds the labor table
# loaded earlier, and rebinding it here would silently replace it with the last frame
# saved, breaking any re-run of the labor cells.
for filename, frame in save_map.items():
    out_path = processed_dir / filename
    frame.to_csv(out_path, index=False)
    print(f"Saved: {out_path.as_posix()}")

Saved: data/processed/nass_corn_planting_weekly_clean.csv
Saved: data/processed/nass_corn_planting_windows.csv
Saved: data/processed/nass_corn_harvest_weekly_clean.csv
Saved: data/processed/nass_corn_harvest_windows.csv
Saved: data/processed/illinois_corn_fields_clean.csv
Saved: data/processed/noaa_il_daily_clean.csv
Saved: data/processed/noaa_il_weekly_clean.csv
Saved: data/processed/labor_illinois_yearly_clean.csv
Saved: data/processed/labor_weekly_capacity_clean.csv


In [71]:
# Spot-check the cleaned harvest table that was just saved.
df_harvested_clean.head(3)

Unnamed: 0,Year,week,week_ending,pct_harvested
0,2005,36,2005-09-11,6
1,2005,37,2005-09-18,13
2,2005,38,2005-09-25,30


In [72]:
# Spot-check the cleaned planting table that was just saved.
df_planting_clean.head(3)

Unnamed: 0,Year,week,week_ending,pct_planted
194,2005,15,2005-04-17,35.0
195,2005,16,2005-04-24,64.0
196,2005,17,2005-05-01,82.0


# 1️⃣ Make labor realistic (seasonal, not 5M hours every week)
Right now, after you build weekly_master, every off-season row has labor_hours ≈ 5,057,910 (planting and harvest weeks are higher).
We’ll keep those weekly hours as the base pool (stored in labor_hours_base), but scale them by a per-week seasonal factor.

In [82]:
import numpy as np

# Assume weekly_master is already built and contains `year`, `week`, `labor_hours`.
# Work on a copy so the existing table is not mutated in place.
wm = weekly_master.copy()

# Snapshot the un-scaled hours exactly once. The end of this cell rebinds
# `weekly_master` to the scaled table, so on a re-run `labor_hours` already holds
# scaled values; copying it into `labor_hours_base` again would apply the seasonal
# factor on top of itself (e.g. 0.1 * 0.1) every time the cell is executed.
if "labor_hours_base" not in wm.columns:
    wm["labor_hours_base"] = wm["labor_hours"]

def labor_season_factor(week: int) -> float:
    """Share of the base labor pool treated as active in field work for a given week.

    Weeks 16-20 (planting peak) return 0.30, weeks 36-45 (harvest peak) return 0.40,
    and every other week returns 0.10. Bounds are inclusive. These are tunable
    modelling assumptions, not measured values.
    """
    in_planting_peak = 16 <= week <= 20
    in_harvest_peak = 36 <= week <= 45

    if in_planting_peak:
        return 0.30      # 30% of the pool active in the field
    if in_harvest_peak:
        return 0.40      # 40% of the pool
    return 0.10          # shoulder / off-season: 10% in field operations

# Apply seasonal factor: scaled hours = base hours * factor for that week number
season_share = wm["week"].astype(int).map(labor_season_factor)
wm["labor_hours"] = wm["labor_hours_base"] * season_share

# Overwrite weekly_master and resave
weekly_master = wm
weekly_master.to_csv("data/processed/master_weekly_table.csv", index=False)

weekly_master.head(15)


Unnamed: 0,year,week,prcp_week_in,TMAX,TMIN,TAVG,AWND,capacity_factor,rain_category,labor_hours,plant_start_week,plant_end_week,harvest_start_week,harvest_end_week,is_plant_window,is_harvest_window,labor_hours_base
0,2015,1,1.139524,34.697368,17.697368,27.45,9.190909,0.7,moderate,50579.1,16,20,36,43.0,False,False,505791.0
1,2015,2,0.281357,18.744361,0.556391,10.314286,11.903896,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
2,2015,3,0.147626,33.541353,13.80303,25.2,8.815584,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
3,2015,4,0.130556,39.533835,26.900763,34.142857,6.849351,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
4,2015,5,0.802372,35.639098,21.293233,30.571429,9.894805,0.7,moderate,50579.1,16,20,36,43.0,False,False,505791.0
5,2015,6,0.279729,31.984962,11.167939,22.942857,9.623377,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
6,2015,7,0.008411,29.112782,11.353383,21.114286,11.131169,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
7,2015,8,0.174054,21.0,4.037594,14.771429,9.672727,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
8,2015,9,0.317838,22.481203,0.052632,12.914286,8.05974,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0
9,2015,10,0.187216,33.219697,11.878788,25.228571,9.261842,0.9,light,50579.1,16,20,36,43.0,False,False,505791.0


In [86]:
# Reload the saved master table and keep the years 2017 through 2024 (inclusive),
# then write that subset to its own file.
wm = pd.read_csv("data/processed/master_weekly_table.csv")
in_study_period = wm["year"].between(2017, 2024)
wm = wm.loc[in_study_period].copy()
wm.to_csv("data/processed/master_weekly_table_2017_2024.csv", index=False)

In [85]:
# Build the "tightened" scenario: same 2017-2024 table with labor_hours cut to one sixth.
# NOTE(review): the 1/6 factor is an unexplained magic number — document where it comes from.
wm_df = pd.read_csv("data/processed/master_weekly_table_2017_2024.csv")
wm_df["labor_hours"] = wm_df["labor_hours"] * (1/6)
wm_df.to_csv("data/processed/master_weekly_table_labor_2017_2024_tightened.csv", index=False) 