# Data Preprocessing for Org Go Project
This notebook covers data loading, cleaning, feature engineering, encoding, scaling, and train-test splits.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

# Load data
# Read the raw CSV into the notebook-wide DataFrame `df`, then echo its
# shape and first rows as a quick sanity check of the load.
df = pd.read_csv(filepath_or_buffer='data/raw/Crop_recommendation.csv')
print(f'Dataset shape: {df.shape}', df.head(), sep='\n')

In [None]:
# Remove duplicates
# Rebind instead of using inplace=True so the step is an explicit
# "new frame from old frame" assignment.
df = df.drop_duplicates()

# Handle missing values by median (numerical) and mode (categorical)
#
# The result of fillna() is assigned back to the column. Calling
# `df[col].fillna(..., inplace=True)` operates on an intermediate Series
# (chained assignment): it emits a FutureWarning on pandas 2.2 and is a
# silent no-op once Copy-on-Write is active, leaving the NaNs in place.
for col in df.columns:
    if not df[col].isnull().any():
        continue

    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
    else:
        # Any non-numeric column (object, string, category, ...) is filled
        # with its most frequent value; median is undefined for these.
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            # Leave it as-is; the count printed below will expose it.
            continue
        fill_value = modes.iloc[0]

    df[col] = df[col].fillna(fill_value)

print('Missing values after imputation:', df.isnull().sum().sum())

In [None]:
# Feature Engineering
# Each group of derived columns is added only when all of its source columns
# are present in `df`; otherwise that group is skipped.
if {'N', 'P', 'K'}.issubset(df.columns):
    df['NPK_sum'] = df['N'] + df['P'] + df['K']
    # The +1 keeps the denominator non-zero when P + K is 0.
    df['N_to_PK_ratio'] = df['N'] / (df['P'] + df['K'] + 1)

if {'temperature', 'humidity'}.issubset(df.columns):
    # Same +1 guard for a humidity of 0.
    df['temp_humidity_ratio'] = df['temperature'] / (df['humidity'] + 1)

# pd.cut builds right-closed bins, so by default the first bin is (0, x] and a
# value of exactly 0 falls outside every bin and becomes NaN.
# include_lowest=True closes the first bin on the left so 0 is categorised.
if 'ph' in df.columns:
    df['soil_type'] = pd.cut(
        df['ph'],
        bins=[0, 6.5, 7.5, 14],
        labels=['Acidic', 'Neutral', 'Alkaline'],
        include_lowest=True,
    )

if 'rainfall' in df.columns:
    df['rainfall_category'] = pd.cut(
        df['rainfall'],
        bins=[0, 50, 100, 200, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True,
    )

In [None]:
# Encoding categorical variables
# Every listed column that exists in `df` is replaced by integer codes from
# its own LabelEncoder; the fitted encoder is kept in `le_dict` under the
# column name.
label_enc_cols = ['label', 'soil_type', 'rainfall_category']
le_dict = {}
for col in [name for name in label_enc_cols if name in df.columns]:
    le = LabelEncoder()
    # Values are cast to str before fitting, so the encoder's classes are strings.
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

print('Encoding complete.')

In [None]:
# Feature selection for model training
# Keep only the candidate features that actually exist in `df` (the engineered
# ones are absent when their source columns were missing).
feature_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'NPK_sum', 'N_to_PK_ratio', 'temp_humidity_ratio']
feature_cols = [col for col in feature_cols if col in df.columns]
X = df[feature_cols]
y = df['label']

# Train-test split
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage). Same test_size and random_state as before, so the same rows
# land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features
# Fit on the training partition only, then apply those statistics everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix scaled with the training statistics; kept so any later cell that
# refers to X_scaled still finds it.
X_scaled = scaler.transform(X)

# Save the processed data
# Create the output directory first; np.savetxt does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('data/processed', exist_ok=True)
np.savetxt('data/processed/X_train.csv', X_train, delimiter=',')
np.savetxt('data/processed/X_test.csv', X_test, delimiter=',')
np.savetxt('data/processed/y_train.csv', y_train, delimiter=',')
np.savetxt('data/processed/y_test.csv', y_test, delimiter=',')

print('Data preprocessing complete. Files saved in data/processed/')