In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every run reproduces the same table.
# The order of the np.random.normal calls below (sunlight, pH, rainfall,
# temperature, quality noise, yield noise) is part of that reproducibility:
# reordering them changes every generated value.
np.random.seed(42)

# Simulate 100 days of matcha farming
n_samples = 100

# Daily growing conditions: normal draws clipped to a fixed [min, max] window.
data = {
    'sunlight_hours': np.clip(np.random.normal(5, 1.5, n_samples), 2, 8),
    'soil_ph': np.clip(np.random.normal(6.3, 0.2, n_samples), 5.8, 6.8),
    'rainfall_mm': np.clip(np.random.normal(25, 10, n_samples), 5, 60),
    'avg_temp_celsius': np.clip(np.random.normal(20, 3, n_samples), 15, 28),
}

# Leaf quality: weighted mix of the conditions plus Gaussian noise.
#   - sunlight enters linearly and un-normalised (0.3 * hours);
#   - soil pH contributes 0.2 * (7 - distance from 6.3);
#   - rainfall and temperature contribute 0.2 * a closeness factor in [0, 1],
#     peaking at 25 mm and 22 degrees C respectively.
# NOTE: the final "* 10" does NOT stretch the score over the full 0-100 range.
# With the clipped inputs above the weighted sum is about 2.0-4.2, so scores
# land around 20-42 before noise (the noise term adds a std. dev. of 5 points).
# The clip to [0, 100] below is only a safety bound.
quality_raw = (
    0.3 * data['sunlight_hours'] +
    0.2 * (7 - np.abs(data['soil_ph'] - 6.3)) +  # Optimal near 6.3
    0.2 * np.clip(30 - np.abs(data['rainfall_mm'] - 25), 0, 30) / 30 +
    0.2 * np.clip(25 - np.abs(data['avg_temp_celsius'] - 22), 0, 25) / 25 +
    np.random.normal(0, 0.5, n_samples)
) * 10

data['leaf_quality_score'] = np.clip(quality_raw, 0, 100)

# Harvest yield: half the quality score plus noise, divided by 10.
yield_raw = (
    0.5 * data['leaf_quality_score'] +
    np.random.normal(0, 5, n_samples)
) / 10

# A yield in kilograms cannot be negative, but the noise term (std. dev. 5) is
# large enough to push a low-quality day below zero. Floor the column at 0,
# mirroring the bound already applied to the quality score.
data['harvest_yield_kg'] = np.clip(yield_raw, 0, None)

# Convert to DataFrame
df = pd.DataFrame(data)

# Cheap invariants so a bad edit to the simulation fails loudly here rather
# than silently skewing whatever consumes `df` afterwards.
assert len(df) == n_samples
assert df['leaf_quality_score'].between(0, 100).all()
assert (df['harvest_yield_kg'] >= 0).all()

# Plain-text preview (kept as print so it matches the saved cell output).
print(df.head())

   sunlight_hours   soil_ph  rainfall_mm  avg_temp_celsius  \
0        5.745071  6.016926    28.577874         17.513015   
1        4.792604  6.215871    30.607845         18.319457   
2        5.971533  6.231457    35.830512         22.241881   
3        7.284545  6.139545    35.538021         21.831111   
4        4.648770  6.267743    11.223306         19.937295   

   leaf_quality_score  harvest_yield_kg  
0           26.099443          1.768061  
1           28.544378          2.381927  
2           35.062346          1.053834  
3           39.051581          2.234064  
4           28.548006          1.102079  
