In [9]:
# 🌿 Climate-Resilient Crop Recommendation - Full ETL + Preprocessing (No Train/Test Split)

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from scipy import stats

# -------------------------
# 1. Extract (Load Data)
# -------------------------
# Read the raw crop dataset (path is relative to the notebook's working
# directory) into `df`, the frame every later cell builds on.
df = pd.read_csv("Crop_recommendation.csv")

# Quick sanity check of what was loaded: dimensions, column names and a
# five-row preview.
loaded_shape = df.shape
loaded_columns = list(df.columns)
preview_rows = df.head(5)

print("🔹 Dataset Shape:", loaded_shape)
print("🔹 Columns:", loaded_columns)
print("\n🔹 First 5 rows:\n", preview_rows)


🔹 Dataset Shape: (2200, 8)
🔹 Columns: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']

🔹 First 5 rows:
     N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice


In [10]:
# -------------------------
# 2. Transform (Cleaning)
# -------------------------

# A row is treated as an outlier when |z-score| reaches this value in any
# feature column.
Z_THRESHOLD = 3

# (a) Handle missing values: report the per-column count, then drop any
# incomplete rows.
print("\nMissing values:\n", df.isnull().sum())
df = df.dropna()   # just in case

# (b) Remove duplicates
dup_count = df.duplicated().sum()
print("\nDuplicate rows:", dup_count)
if dup_count > 0:
    df = df.drop_duplicates()

# (c) Outlier Detection & Removal (Z-score method)
# Feature columns are everything except the crop name. `label_encoded` is
# excluded as well so that re-running this cell after the feature-engineering
# cell does not start treating the encoded target as a feature.
numeric_cols = df.columns.drop(["label", "label_encoded"], errors="ignore")

# Column-wise z-scores, computed over the whole dataset (not per crop).
z_scores = np.abs(np.asarray(stats.zscore(df[numeric_cols]), dtype=float))

# Flag rows instead of whitelisting them: a zero-variance column produces NaN
# z-scores, and NaN compares False, so such a column never marks a row as an
# outlier (the previous `(z < 3).all()` form would have dropped every row).
outlier_rows = (z_scores >= Z_THRESHOLD).any(axis=1)

# Explicit copy so later column assignments act on an independent frame.
df = df.loc[~outlier_rows].copy()

print("\nAfter outlier removal, shape:", df.shape)


Missing values:
 N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

Duplicate rows: 0

After outlier removal, shape: (2029, 8)


In [11]:
# -------------------------
# 3. Feature Engineering
# -------------------------

# (a) Encode crop labels
# `le` is kept at notebook level so the integer codes can be mapped back to
# crop names later.
le = LabelEncoder()
encoded_labels = le.fit_transform(df["label"])
df["label_encoded"] = encoded_labels

# (b) Apply Scaling (both versions available)
# Both scalers are fitted on the same cleaned feature values; each scaled
# frame is a full copy of `df` with only the feature columns replaced, so
# `label` and `label_encoded` are carried over unchanged.
scaler_standard, scaler_minmax = StandardScaler(), MinMaxScaler()
feature_values = df[numeric_cols]

df_standard = df.copy()
df_standard[numeric_cols] = scaler_standard.fit_transform(feature_values)

df_minmax = df.copy()
df_minmax[numeric_cols] = scaler_minmax.fit_transform(feature_values)

# Show the first few distinct crops next to their integer codes.
first_crops = df['label'].unique()[:5]
first_codes = df['label_encoded'].unique()[:5]

print("\n✅ Label Encoding + Scaling Done!")
print("Sample crops:", first_crops)
print("Encoded labels:", first_codes)


✅ Label Encoding + Scaling Done!
Sample crops: ['rice' 'maize' 'chickpea' 'kidneybeans' 'pigeonpeas']
Encoded labels: [20 11  3  9 18]


In [12]:
# -------------------------
# 4. Load (Save Clean Data)
# -------------------------
# Write each prepared frame to its own CSV in the working directory; the
# index is omitted so the files contain only the data columns.
for csv_path, frame in (
    ("Crop_cleaned.csv", df),
    ("Crop_cleaned_standard_scaled.csv", df_standard),
    ("Crop_cleaned_minmax_scaled.csv", df_minmax),
):
    frame.to_csv(csv_path, index=False)

# Summary of the files written above, one line per file.
print(
    "\n📂 Preprocessed datasets saved:",
    " - Crop_cleaned.csv (raw cleaned)",
    " - Crop_cleaned_standard_scaled.csv (standard scaled)",
    " - Crop_cleaned_minmax_scaled.csv (min-max scaled)",
    sep="\n",
)


📂 Preprocessed datasets saved:
 - Crop_cleaned.csv (raw cleaned)
 - Crop_cleaned_standard_scaled.csv (standard scaled)
 - Crop_cleaned_minmax_scaled.csv (min-max scaled)
