In [1]:
import pandas as pd

# Example daily rainfall data
# Skip the first row which contains header-like information
rainfall_df = pd.read_csv("ken-rainfall-subnat-full.csv", skiprows=[1])

# Convert the 'date' column to datetime objects
rainfall_df["date"] = pd.to_datetime(rainfall_df["date"])

# Convert the rainfall columns to numeric, coercing errors
rainfall_df["r1h"] = pd.to_numeric(rainfall_df["r1h"], errors='coerce')
rainfall_df["r1h_avg"] = pd.to_numeric(rainfall_df["r1h_avg"], errors='coerce')


# Suppose you know planting_date
planting_date = pd.Timestamp("2023-03-15")

# Filter rainfall in month before planting
rainfall_before = rainfall_df[(rainfall_df["date"] >= planting_date - pd.Timedelta(days=30)) &
                     (rainfall_df["date"] < planting_date)]

# Use 'r1h_avg' column for rainfall data
avg_rainfall = rainfall_before["r1h_avg"].mean()
print("Avg rainfall (30 days before planting):", avg_rainfall)

Avg rainfall (30 days before planting): 56.351073882304526


In [2]:
print(rainfall_df.columns)

Index(['date', 'adm_level', 'adm_id', 'PCODE', 'n_pixels', 'rfh', 'rfh_avg',
       'r1h', 'r1h_avg', 'r3h', 'r3h_avg', 'rfq', 'r1q', 'r3q', 'version'],
      dtype='object')


In [3]:
# Create a rolling sum over 5 days using the 'r1h_avg' column
rainfall_df["rain_5day_sum"] = rainfall_df["r1h_avg"].rolling(window=5).sum()

# Find the first date where rolling sum >= 20mm
onset_row = rainfall_df[rainfall_df["rain_5day_sum"] >= 20].head(1)
if not onset_row.empty:
    onset_date = onset_row["date"].iloc[0]
    print("Rainfall onset date:", onset_date)
else:
    print("No onset found in dataset")

Rainfall onset date: 1981-03-01 00:00:00


In [4]:
import pandas as pd

temp_df = pd.read_csv("kenya-climate-data-1991-2016-temp-degress-celcius.csv")

# Create a datetime column from 'Year' and 'Month Average'
# Assuming 'Month Average' contains month names like 'Jan Average', 'Feb Average', etc.
temp_df['date'] = pd.to_datetime(temp_df['Year'].astype(str) + '-' + temp_df['Month Average'].str.split().str[0], format='%Y-%b')

# Select 30 days after planting
temp_after = temp_df[(temp_df["date"] >= planting_date) &
                     (temp_df["date"] < planting_date + pd.Timedelta(days=30))]

avg_temp = temp_after["Temperature - (Celsius)"].mean()
print("Avg temp (30 days after planting):", avg_temp)

Avg temp (30 days after planting): nan


In [5]:
print(temp_df.columns)

Index(['Year', 'Month Average', 'Temperature - (Celsius)', 'date'], dtype='object')


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree, ensemble
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rahuljaiswalonkaggle/soil-fertility-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rahuljaiswalonkaggle/soil-fertility-dataset?dataset_version_number=1...


100%|██████████| 16.9k/16.9k [00:00<00:00, 3.04MB/s]

Extracting files...
Path to dataset files: C:\Users\ADMIN\.cache\kagglehub\datasets\rahuljaiswalonkaggle\soil-fertility-dataset\versions\1





In [8]:
soil_df = pd.DataFrame({
    "N": [25],  # Nitrogen ppm
    "P": [15],  # Phosphorus ppm
    "K": [30],  # Potassium ppm
    "pH": [6.5]
})

# Example: weighted soil fertility score
soil_df["fertility_index"] = (0.4*soil_df["N"] +
                              0.3*soil_df["P"] +
                              0.2*soil_df["K"] -
                              0.1*abs(soil_df["pH"]-6.5))

print("Soil Fertility Index:", soil_df["fertility_index"].iloc[0])


Soil Fertility Index: 20.5


In [9]:
features = pd.DataFrame([{
    "avg_rainfall": avg_rainfall,
    "avg_temp": avg_temp,
    "fertility_index": soil_df["fertility_index"].iloc[0]
}])
print(features)

   avg_rainfall  avg_temp  fertility_index
0     56.351074       NaN             20.5


In [10]:
# --- Synthetic labeling function ---
def label_suitability(row):
    # Define thresholds (tweak these!)
    rain_ok = 30 <= row["avg_rainfall"] <= 200
    temp_ok = 18 <= row["avg_temp"] <= 30
    fertility_ok = row["fertility_index"] >= 15

    return int(rain_ok and temp_ok and fertility_ok)

# Apply to dataset
final_df = pd.DataFrame({
    "avg_rainfall": [avg_rainfall],
    "avg_temp": [avg_temp],
    "fertility_index": soil_df["fertility_index"].iloc[0]
})

# Replace NaN with mean temperature
final_df["avg_temp"] = final_df["avg_temp"].fillna(temp_df["Temperature - (Celsius)"].mean())


print(final_df)


   avg_rainfall   avg_temp  fertility_index
0     56.351074  25.118588             20.5


In [11]:
import numpy as np
import pandas as pd

# Generate synthetic data
np.random.seed(42)
n_samples = 500

synthetic_df = pd.DataFrame({
    "avg_rainfall": np.random.uniform(20, 300, n_samples),
    "avg_temp": np.random.uniform(10, 35, n_samples),
    "fertility_index": np.random.uniform(5, 30, n_samples)
})

# Labeling function (baseline rules)
def assign_label(row):
    if (50 <= row["avg_rainfall"] <= 200) and (18 <= row["avg_temp"] <= 30) and (row["fertility_index"] >= 15):
        return 1
    return 0

synthetic_df["suitable"] = synthetic_df.apply(assign_label, axis=1)

# Add noise (flip ~10% of labels randomly)
noise_fraction = 0.1
n_noisy = int(noise_fraction * len(synthetic_df))
flip_indices = np.random.choice(synthetic_df.index, size=n_noisy, replace=False)
synthetic_df.loc[flip_indices, "suitable"] = 1 - synthetic_df.loc[flip_indices, "suitable"]

print("Class balance with noise:\n", synthetic_df["suitable"].value_counts())
synthetic_df.head(10)



Class balance with noise:
 suitable
0    391
1    109
Name: count, dtype: int64


Unnamed: 0,avg_rainfall,avg_temp,fertility_index,suitable
0,124.871233,27.454043,9.628323,0
1,286.200006,23.402409,18.547524,0
2,224.958304,17.73819,26.823646,0
3,187.624376,30.344875,23.305622,0
4,63.685219,27.118279,25.164029,1
5,63.678466,14.065423,21.469584,1
6,36.263411,32.77318,22.306914,0
7,262.529321,30.563431,26.229891,0
8,188.312203,33.744998,11.2417,0
9,218.260322,28.142988,17.235624,0


In [12]:
from imblearn.over_sampling import SMOTE

X = synthetic_df[["avg_rainfall", "avg_temp", "fertility_index"]]
y = synthetic_df["suitable"]

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

balanced_df = pd.DataFrame(X_res, columns=X.columns)
balanced_df["suitable"] = y_res

print("Class balance after SMOTE:\n", balanced_df["suitable"].value_counts())




Class balance after SMOTE:
 suitable
0    391
1    391
Name: count, dtype: int64


In [13]:
# ✅ Use balanced_df from SMOTE step
X = balanced_df.drop("suitable", axis=1)
y = balanced_df["suitable"]

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.78      0.79        79
           1       0.79      0.81      0.80        78

    accuracy                           0.80       157
   macro avg       0.80      0.80      0.80       157
weighted avg       0.80      0.80      0.80       157



In [14]:
import joblib

# Save the trained model
joblib.dump(clf, "crop_model.pkl")
print("✅ Model saved as crop_model.pkl")

✅ Model saved as crop_model.pkl
