In [33]:
import numpy as np

In [34]:
import pandas as pd

# Load the cleaned weekly planting data and NOAA weather aggregation
weather_path = "../../data/processed/noaa_il_weekly_agg.csv"
planting_path = "../../data/processed/nass_corn_planting_weekly_clean.csv"

# Load data
weather_df = pd.read_csv(weather_path)
planting_df = pd.read_csv(planting_path)

# Preview
weather_df.head(), planting_df.head()


(   year  week  prcp_week_in       TMAX       TMIN       TAVG       AWND
 0  2017     1          0.77  22.285714   7.742857  15.414286  10.685714
 1  2017     2          3.14  34.857143  18.685714  26.528571  10.742857
 2  2017     3          7.84  45.628571  34.285714  39.914286   8.432143
 3  2017     4          0.38  35.257143  29.000000  32.428571  11.775000
 4  2017     5          0.19  35.057143  19.342857  27.300000  11.800000,
    Year  week week_ending  pct_planted
 0  2005    15  2005-04-17         35.0
 1  2005    16  2005-04-24         64.0
 2  2005    17  2005-05-01         82.0
 3  2005    18  2005-05-08         94.0
 4  2005    19  2005-05-15         98.0)

In [35]:
weather_df = pd.read_csv(weather_path)
planting_df = pd.read_csv(planting_path)

# Standardize column names to match on 'year' and 'week'
planting_df = planting_df.rename(columns={"Year": "year"})

# Merge on 'year' and 'week'
merged_df = pd.merge(planting_df, weather_df, on=["year", "week"], how="inner")

# Display merged dataset structure
merged_df.head()

Unnamed: 0,year,week,week_ending,pct_planted,prcp_week_in,TMAX,TMIN,TAVG,AWND
0,2017,14,2017-04-09,1.0,6.92,59.542857,42.657143,50.814286,13.278571
1,2017,15,2017-04-16,6.0,6.22,67.485714,46.6,57.157143,10.903571
2,2017,16,2017-04-23,34.0,1.28,67.171429,44.514286,55.885714,10.325
3,2017,17,2017-04-30,63.0,15.96,62.8,45.714286,54.457143,12.653571
4,2017,18,2017-05-07,65.0,3.4,58.228571,42.142857,50.242857,12.178571


In [36]:
# Drop redundant year column
merged_df.drop(columns=["year"], inplace=True)

# Drop rows with missing values (if any)
merged_df.dropna(inplace=True)

In [37]:
# merged_path = "/../../data/processed/merged_planting_weather.csv"
# merged_df = pd.to_csv(merged_path)

In [38]:
# Fix column name for consistency
planting_df = planting_df.rename(columns={"Year": "year"})

# Merge again
merged_df = pd.merge(planting_df, weather_df, on=["year", "week"], how="inner")

# Drop rows with missing planting data
merged_df = merged_df.dropna(subset=["pct_planted"])

# Sort for lag features
merged_df = merged_df.sort_values(by=["year", "week"]).reset_index(drop=True)

merged_df.head()


Unnamed: 0,year,week,week_ending,pct_planted,prcp_week_in,TMAX,TMIN,TAVG,AWND
0,2017,14,2017-04-09,1.0,6.92,59.542857,42.657143,50.814286,13.278571
1,2017,15,2017-04-16,6.0,6.22,67.485714,46.6,57.157143,10.903571
2,2017,16,2017-04-23,34.0,1.28,67.171429,44.514286,55.885714,10.325
3,2017,17,2017-04-30,63.0,15.96,62.8,45.714286,54.457143,12.653571
4,2017,18,2017-05-07,65.0,3.4,58.228571,42.142857,50.242857,12.178571


# ✅ 1. Temporal Features (Most Important)
Planting is a cumulative seasonal process. Your model must understand time.

In [39]:
merged_df["week_number"] = merged_df["week"]
merged_df["cos_week"] = np.cos(2 * np.pi * merged_df["week"] / 52)
merged_df["sin_week"] = np.sin(2 * np.pi * merged_df["week"] / 52)

# ✅ 2. Lag Features (Huge performance boost)
Planting progress this week depends on last week’s progress.

In [40]:
merged_df["pct_lag_1"] = merged_df.groupby("year")["pct_planted"].shift(1)
merged_df["pct_lag_2"] = merged_df.groupby("year")["pct_planted"].shift(2)
merged_df["pct_weekly_change"] = merged_df.groupby("year")["pct_planted"].diff()
merged_df["pct_weekly_change_pct"] = merged_df.groupby("year")["pct_planted"].pct_change()


# ✅ 3. Add Growing Degree Days (GDD)
Corn growth and planting conditions correlate strongly with heat accumulation.

In [41]:
merged_df["gdd"] = np.maximum(merged_df["TAVG"] - 10, 0)
merged_df["gdd_cum"] = merged_df.groupby("year")["gdd"].cumsum()

# ✅ 4. Add Soil Moisture Proxy (Rainfall Memory)
This is extremely predictive of planting delays.

In [42]:
merged_df["rain_last_week"] = merged_df.groupby("year")["prcp_week_in"].shift(1)
merged_df["rain_2wk"] = merged_df.groupby("year")["prcp_week_in"].rolling(2).sum().reset_index(0,drop=True)
merged_df["rain_3wk"] = merged_df.groupby("year")["prcp_week_in"].rolling(3).sum().reset_index(0,drop=True)


# ✅ 5. Add Drying Index
Warmer + windier weather dries the soil faster, allowing machinery to enter fields.

In [43]:
merged_df["drying_index"] = merged_df["TAVG"] * merged_df["AWND"]

# ✅ 6. Add Anomalies (Deviation from Normal)
Weekly normals matter in agronomy.

In [44]:
merged_df["tavg_anomaly"] = merged_df["TAVG"] - merged_df.groupby("week")["TAVG"].transform("mean")
merged_df["prcp_anomaly"] = merged_df["prcp_week_in"] - merged_df.groupby("week")["prcp_week_in"].transform("mean")


In [45]:
merged_df.head()
merged_df.columns

Index(['year', 'week', 'week_ending', 'pct_planted', 'prcp_week_in', 'TMAX',
       'TMIN', 'TAVG', 'AWND', 'week_number', 'cos_week', 'sin_week',
       'pct_lag_1', 'pct_lag_2', 'pct_weekly_change', 'pct_weekly_change_pct',
       'gdd', 'gdd_cum', 'rain_last_week', 'rain_2wk', 'rain_3wk',
       'drying_index', 'tavg_anomaly', 'prcp_anomaly'],
      dtype='object')

In [46]:
del merged_df['week_ending']

In [47]:
!pip install xgboost



In [48]:
import pandas as pd
import xgboost 
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)
merged_df.isna().sum().sort_values(ascending=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)


pct_weekly_change_pct    3
week                     0
year                     0
prcp_week_in             0
TMAX                     0
TMIN                     0
pct_planted              0
TAVG                     0
AWND                     0
cos_week                 0
week_number              0
sin_week                 0
pct_lag_1                0
pct_lag_2                0
pct_weekly_change        0
gdd                      0
gdd_cum                  0
rain_last_week           0
rain_2wk                 0
rain_3wk                 0
drying_index             0
tavg_anomaly             0
prcp_anomaly             0
dtype: int64

In [53]:
# merged_df["week_ending"] = pd.to_datetime(merged_df["week_ending"]).astype(int) // 10**9
merged_df["pct_weekly_change_pct"] = merged_df["pct_weekly_change_pct"].fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["pct_weekly_change_pct"] = merged_df["pct_weekly_change_pct"].fillna(0)


In [54]:
# Define features and target
features = ['year', 'week', 'prcp_week_in', 'TMAX','TMIN', 'TAVG', 'AWND',
            'week_number', 'cos_week', 'sin_week','pct_lag_1', 'pct_lag_2', 'pct_weekly_change',
            'pct_weekly_change_pct','gdd', 'gdd_cum', 'rain_last_week', 'rain_2wk', 'rain_3wk',
            'drying_index', 'tavg_anomaly', 'prcp_anomaly']
target = "pct_planted"

# Drop rows with missing values
merged_df = merged_df.dropna(subset=features + [target])

# Split into train and test sets
X = merged_df[features]
y = merged_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost regressor
model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

(29.18326972312897, 0.9726728828800253)

In [60]:
# Convert year to int if needed
merged_df["year"] = merged_df["year"].astype(int)

# Define train and test split by year
train_df = merged_df[merged_df["year"] <= 2022]
test_df  = merged_df[merged_df["year"] == 2023]

# Split features and target
X_train = train_df[features]
y_train = train_df[target]
X_test  = test_df[features]
y_test  = test_df[target]

# Train XGBoost
model = XGBRegressor(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)
model.fit(X_train, y_train)

# Test predictions
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2


(11.506746566412403, 0.9895391727119648)

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_log_error

# Evaluate other regression models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Calculate additional metrics
rf_mae = mean_absolute_error(y_test, rf_preds)
rf_msle = mean_squared_log_error(y_test.clip(min=0), rf_preds.clip(min=0))

lr_mae = mean_absolute_error(y_test, lr_preds)
lr_msle = mean_squared_log_error(y_test.clip(min=0), lr_preds.clip(min=0))

{
    "Random Forest MAE": rf_mae,
    # "Random Forest MSLE": rf_msle,
    "Linear Regression MAE": lr_mae,
    # "Linear Regression MSLE": lr_msle
}


TypeError: clip() got an unexpected keyword argument 'min'

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score
)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the merged dataset again
# file_path = "/mnt/data/merged_weekly_planting_weather.csv"
# merged_df = pd.read_csv(file_path)

# Define features and target
features = ["week", "prcp_week_in", "TMAX", "TMIN", "TAVG", "AWND"]
target = "pct_planted"

# Drop missing
merged_df = merged_df.dropna(subset=features + [target])

# Split into train-test
X = merged_df[features]
y = merged_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
rf = RandomForestRegressor(random_state=42)
lr = LinearRegression()

rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# Predictions
rf_preds = rf.predict(X_test)
lr_preds = lr.predict(X_test)

# Ensure no negative predictions for MSLE
rf_preds_safe = rf_preds.clip(min=0)
lr_preds_safe = lr_preds.clip(min=0)
y_test_safe = y_test.clip(lower=0)

# Calculate metrics
metrics = {
    "Random Forest MAE": mean_absolute_error(y_test, rf_preds),
    "Random Forest MSE": mean_squared_error(y_test, rf_preds),
    "Random Forest R2": r2_score(y_test, rf_preds),
    "Random Forest MSLE": mean_squared_log_error(y_test_safe, rf_preds_safe),

    "Linear Regression MAE": mean_absolute_error(y_test, lr_preds),
    "Linear Regression MSE": mean_squared_error(y_test, lr_preds),
    "Linear Regression R2": r2_score(y_test, lr_preds),
    "Linear Regression MSLE": mean_squared_log_error(y_test_safe, lr_preds_safe),
}

metrics


{'Random Forest MAE': 8.268333333333334,
 'Random Forest MSE': 117.10195000000002,
 'Random Forest R2': 0.9294613780871428,
 'Random Forest MSLE': 0.8138307097721663,
 'Linear Regression MAE': 10.704263769305058,
 'Linear Regression MSE': 202.68655945856898,
 'Linear Regression R2': 0.8779078351430881,
 'Linear Regression MSLE': 0.6722083934833497}