In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 2. Load the Dataset
# Read the raw table from the data folder one level above this notebook.
df = pd.read_csv('../data/crop_data.csv')

# 3. Separate Features (X) and Target (y)
# The 'label' column is the target; every remaining column is a feature.
y = df['label']
X = df.drop(columns='label')

# 4. Encode the Target Variable
# Map each distinct label value to an integer code; the fitted encoder is
# kept so the codes can be translated back via label_encoder.classes_.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Original Crop Labels:", label_encoder.classes_)
print("Encoded Labels:", np.unique(y_encoded))

# 5. Split Data into Training and Testing Sets
# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible, and stratifying on the encoded labels keeps the class
# proportions the same in both partitions.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print("\nShape of training features:", X_train.shape)
print("Shape of testing features:", X_test.shape)

# 6. Save Processed Data
# Write the four train/test partitions as CSV files under ../data/processed.
processed_data_dir = '../data/processed'
# exist_ok=True creates the directory (and any missing parents) and is a
# no-op when it already exists, which avoids the check-then-create race of
# testing os.path.exists() first.
os.makedirs(processed_data_dir, exist_ok=True)

# Feature frames are written without the row index so only the feature
# columns end up in the files.
X_train.to_csv(os.path.join(processed_data_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_data_dir, 'X_test.csv'), index=False)

# The targets are wrapped in a Series so they can be written as a single
# column with an explicit 'label' header.
# NOTE(review): only the integer codes are saved here; the label_encoder
# class mapping is not persisted by this cell - confirm whether downstream
# code needs it to decode predictions.
pd.Series(y_train).to_csv(os.path.join(processed_data_dir, 'y_train.csv'), index=False, header=['label'])
pd.Series(y_test).to_csv(os.path.join(processed_data_dir, 'y_test.csv'), index=False, header=['label'])

print("\nProcessed data saved successfully.")


Original Crop Labels: ['banana' 'blackgram' 'chickpea' 'kidneybeans' 'lentil' 'maize'
 'mothbeans' 'mungbean' 'pigeonpeas' 'pomegranate' 'rice']
Encoded Labels: [ 0  1  2  3  4  5  6  7  8  9 10]

Shape of training features: (1760, 7)
Shape of testing features: (440, 7)

Processed data saved successfully.
