# In [None]:
# 5->final

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import pickle
import random
from datetime import datetime
import matplotlib.pyplot as plt

# Load the dataset
# The path is relative, so the CSV is looked up in the current working
# directory. The rest of this script reads the columns N, P, K, temperature,
# humidity, ph, rainfall and label from the resulting DataFrame.
data = pd.read_csv("Crop_recommendation.csv")

def infer_season(temp, rainfall):
    """Map a temperature/rainfall pair to a season name.

    Rules, checked in order:
      * 'Winter'  -- temp below 20 and rainfall below 100
      * 'Monsoon' -- temp in [20, 30) and rainfall of at least 100
      * 'Summer'  -- temp of at least 30 and rainfall below 100
      * 'Spring'  -- every other combination
    """
    # Both predicates are spelled out (rather than negating one) so that a
    # value that fails every comparison, such as NaN, falls through to 'Spring'.
    is_dry = rainfall < 100
    is_wet = rainfall >= 100

    if is_dry and temp < 20:
        return 'Winter'
    if is_wet and 20 <= temp < 30:
        return 'Monsoon'
    if is_dry and temp >= 30:
        return 'Summer'
    return 'Spring'

# Feature engineering: label every row with a season derived from its
# temperature and rainfall readings.
data['season'] = list(map(infer_season, data['temperature'], data['rainfall']))

# One encoder per categorical column. Both stay at module level because
# predict_crop() and the artifact dump further down reuse them.
season_encoder = LabelEncoder().fit(data['season'])
label_encoder = LabelEncoder().fit(data['label'])

# Integer codes: the season code is a model feature, the label code the target.
data['season_encoded'] = season_encoder.transform(data['season'])
data['label_encoded'] = label_encoder.transform(data['label'])

# Plot feature distributions: one histogram per raw input column,
# laid out on a 3x3 grid (seven cells are used).
features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
plt.figure(figsize=(15, 10))
for position, column in enumerate(features, start=1):
    axis = plt.subplot(3, 3, position)
    axis.hist(data[column], bins=20, color='skyblue', edgecolor='black')
    axis.set_title(f'{column} Distribution', fontsize=10)
    axis.set_xlabel(column)
    axis.set_ylabel('Frequency')
plt.tight_layout()
# y > 1 places the figure title above the top row of subplots
plt.suptitle('Feature Distributions', y=1.02, fontsize=14)
plt.show()

# Scatter plot matrix for key features
scatter_features = ['temperature', 'rainfall', 'humidity', 'N', 'P', 'K']
# Decoded crop names; computed here but not referenced by the plotting code below.
crop_labels = label_encoder.inverse_transform(data['label_encoded'])

plt.figure(figsize=(18, 15))
grid_size = len(scatter_features)
for row, y_feature in enumerate(scatter_features):
    for col, x_feature in enumerate(scatter_features):
        plt.subplot(grid_size, grid_size, row * grid_size + col + 1)
        if row != col:
            # Off-diagonal cell: pairwise scatter, coloured by encoded crop label
            plt.scatter(
                data[x_feature],
                data[y_feature],
                c=data['label_encoded'],
                cmap='tab20',
                alpha=0.6,
                s=10,
            )
            plt.xlabel(x_feature)
        else:
            # Diagonal cell: distribution of the feature itself
            plt.hist(data[y_feature], bins=20, color='skyblue')
            plt.xlabel(y_feature)
        # Only the left-most column carries a y-axis label
        plt.ylabel(y_feature if col == 0 else '')
plt.tight_layout()
plt.suptitle('Feature Relationships and Distributions', y=1.02, fontsize=16)
plt.show()

# Prepare features and target: seven raw measurements plus the encoded
# season as inputs, the encoded crop label as the value to predict.
feature_columns = [
    'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'season_encoded',
]
X = data[feature_columns]
y = data['label_encoded']

# Dynamic data splitting: the seed is drawn anew on every run, so the
# 80/20 train/test partition differs between executions.
split_seed = random.randint(0, 1000)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=split_seed, test_size=0.2
)

# Handle class imbalance: oversample the training portion only
# (the test split is left untouched). The SMOTE seed is random per run.
smote = SMOTE(random_state=random.randint(0, 1000))
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Per-crop sample counts before and after resampling
original_counts = pd.Series(label_encoder.inverse_transform(y_train)).value_counts()
resampled_counts = pd.Series(label_encoder.inverse_transform(y_train_res)).value_counts()

# Plot class distribution: two side-by-side bar charts
plt.figure(figsize=(15, 6))
distribution_panels = [
    (original_counts, 'salmon', 'Class Distribution Before SMOTE'),
    (resampled_counts, 'lightgreen', 'Class Distribution After SMOTE'),
]
for panel_number, (counts, bar_color, heading) in enumerate(distribution_panels, start=1):
    plt.subplot(1, 2, panel_number)
    counts.plot(kind='bar', color=bar_color)
    plt.title(heading)
    plt.xlabel('Crop Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Feature scaling: the scaler learns its statistics from the resampled
# training data only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning: exhaustive search over the forest settings below.
# 'random_state' has a single candidate, drawn once per run, so every
# forest in the search shares the same seed.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    random_state=[random.randint(0, 1000)],
)

# 5-fold cross-validated accuracy decides the winner; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train_res)

# Best model: the estimator refitted by the grid search with the winning parameters
best_model = grid_search.best_estimator_

# Validation: 5-fold accuracy of that configuration on the resampled training data
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train_scaled,
    y=y_train_res,
    scoring='accuracy',
    cv=5,
)

# Evaluation: predictions for the held-out test split
y_pred = best_model.predict(X_test_scaled)

# Report everything in one place
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Cross-Val Accuracy: {np.mean(cv_scores):.2f}")
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_encoder.classes_))

# Feature importance plot: bars sorted from most to least important
importances = best_model.feature_importances_
features = X.columns
indices = np.flip(np.argsort(importances))  # descending order of importance

bar_positions = range(X.shape[1])
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, importances[indices], align='center', color='orange')
plt.xticks(bar_positions, features[indices], rotation=90)
plt.title("Feature Importance")
plt.xlabel("Features")
plt.ylabel("Importance Score")
plt.tight_layout()
plt.show()

# Save artifacts
# Every object needed to reproduce a prediction (model, scaler and both
# encoders) is pickled into the current working directory. All four files
# share one timestamp so they can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
artifacts = {
    "model": best_model,
    "scaler": scaler,
    "season_encoder": season_encoder,
    "label_encoder": label_encoder,
}
for artifact_name, artifact in artifacts.items():
    # The with-block guarantees the file is flushed and closed even if
    # pickling fails; a bare open() inside pickle.dump() leaks the handle.
    with open(f"{artifact_name}_{timestamp}.pkl", "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

def predict_crop(features=None):
    """Predict a crop name for one set of soil and weather measurements.

    Args:
        features: Sequence of exactly seven values in the order
            (N, P, K, temperature, humidity, ph, rainfall). When None,
            a random sample is generated instead.

    Returns:
        The crop label decoded by the module-level ``label_encoder``.

    Raises:
        ValueError: If ``features`` does not unpack into seven values, or if
            the inferred season is rejected by ``season_encoder``.
    """
    if features is None:
        features = [
            random.randint(0, 150),        # N
            random.randint(5, 60),         # P
            random.randint(5, 60),         # K
            random.uniform(8.0, 45.0),     # temp
            random.uniform(15.0, 100.0),   # humidity
            random.uniform(3.5, 9.5),      # ph
            random.uniform(20.0, 300.0)    # rainfall
        ]

    N, P, K, temp, hum, ph, rain = features

    # The season feature is derived the same way as for the training data.
    season = infer_season(temp, rain)
    season_enc = season_encoder.transform([season])[0]

    sample = [[N, P, K, temp, hum, ph, rain, season_enc]]
    # If the scaler was fitted on a DataFrame it remembers the column names
    # and warns when handed unnamed data. Labelling the row with those same
    # names keeps the call warning-free and checks the column order.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None:
        sample = pd.DataFrame(sample, columns=fitted_names)

    scaled_data = scaler.transform(sample)
    return label_encoder.inverse_transform(best_model.predict(scaled_data))[0]

# Random prediction demo
# predict_crop() is called without arguments, so it generates its own random
# N/P/K/temperature/humidity/ph/rainfall sample; the printed crop therefore
# varies from run to run.
print("\n=== Random Prediction ===")
print(f"Recommended Crop: {predict_crop()}")