# Tomato Grading AI - Model Training

This notebook demonstrates the process of training machine learning models for tomato grading based on image features. We'll extract features from tomato images in three categories (ripe, unripe, and reject) and train classification models to distinguish between them.

## 1. Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import joblib
import sys

# Add the parent directory to the path to import our feature extraction module
sys.path.append('..')
from features.feature_extraction import extract_features_batch

## 2. Load and Prepare Dataset

In [None]:
# Each tomato grade is stored in its own sub-folder under the dataset root
dataset_dir = '../dataset'
ripe_dir, unripe_dir, reject_dir = (
    os.path.join(dataset_dir, grade) for grade in ('ripe', 'unripe', 'reject')
)

# Run the feature extractor over one grade at a time, announcing each pass
print("Extracting features from ripe tomato images...")
ripe_features, ripe_labels = extract_features_batch(ripe_dir, 'ripe')

print("Extracting features from unripe tomato images...")
unripe_features, unripe_labels = extract_features_batch(unripe_dir, 'unripe')

print("Extracting features from reject tomato images...")
reject_features, reject_labels = extract_features_batch(reject_dir, 'reject')

# Stack the per-grade results into a single feature matrix X and label vector y
# (row order: ripe, then unripe, then reject)
X = np.vstack([ripe_features, unripe_features, reject_features])
y = np.hstack([ripe_labels, unripe_labels, reject_labels])

# Quick sanity report on what was loaded
print(f"Dataset shape: {X.shape}")
print(f"Number of samples per class: Ripe: {len(ripe_features)}, Unripe: {len(unripe_features)}, Reject: {len(reject_features)}")

## 3. Data Preprocessing

In [None]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test statistics leak into it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use. joblib.dump does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## 4. Train SVM Models

In [None]:
# Define the cross-validation strategy: 5 stratified, shuffled folds with a
# fixed seed, shared by both grid searches so their scores are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train SVM with linear kernel, tuning only the regularization strength C
print("Training SVM with linear kernel...")
svm_linear = SVC(kernel='linear', probability=True, random_state=42)
param_grid_linear = {
    'C': [0.1, 1, 10, 100]
}

grid_search_linear = GridSearchCV(svm_linear, param_grid_linear, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search_linear.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search_linear.best_params_}")
print(f"Best cross-validation score: {grid_search_linear.best_score_:.4f}")

# Train SVM with quadratic kernel (polynomial kernel of degree 2, not RBF),
# tuning both C and the kernel coefficient gamma
print("\nTraining SVM with quadratic kernel...")
svm_quad = SVC(kernel='poly', degree=2, probability=True, random_state=42)
param_grid_quad = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1]
}

grid_search_quad = GridSearchCV(svm_quad, param_grid_quad, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search_quad.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search_quad.best_params_}")
print(f"Best cross-validation score: {grid_search_quad.best_score_:.4f}")

# Save the best models. joblib.dump does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('../models', exist_ok=True)
joblib.dump(grid_search_linear.best_estimator_, '../models/svm_linear.pkl')
joblib.dump(grid_search_quad.best_estimator_, '../models/svm_quadratic.pkl')

## 5. Evaluate Models

In [None]:
def evaluate_model(search, model_name, X_eval, y_eval):
    """Report test metrics for a fitted grid search and plot its confusion matrix.

    Prints the accuracy and the classification report, then draws the
    confusion matrix as an annotated heatmap.

    Parameters
    ----------
    search : fitted GridSearchCV
        Used for ``predict`` and for ``classes_``, which supplies the axis
        labels so they always match the labels the model was trained on.
    model_name : str
        Name shown in the printed header and the plot title.
    X_eval, y_eval
        Features and true labels to evaluate on.

    Returns
    -------
    (y_pred, cm)
        The predictions and the confusion matrix (rows = true class,
        columns = predicted class, ordered as ``search.classes_``).
    """
    y_pred = search.predict(X_eval)
    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_eval, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_eval, y_pred))

    # Take the label order from the fitted model instead of hard-coding it.
    # Passing labels= pins the matrix rows/columns to that order even if a
    # class is missing from this particular evaluation split.
    class_names = list(search.classes_)
    cm = confusion_matrix(y_eval, y_pred, labels=class_names)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {model_name}')
    fig.tight_layout()
    plt.show()

    return y_pred, cm


# Evaluate the linear SVM
y_pred_linear, cm_linear = evaluate_model(
    grid_search_linear, 'Linear SVM', X_test_scaled, y_test
)

# Evaluate the quadratic SVM (blank line separates the two reports)
print()
y_pred_quad, cm_quad = evaluate_model(
    grid_search_quad, 'Quadratic SVM', X_test_scaled, y_test
)

## 6. Feature Importance Analysis

Let's train a Random Forest classifier to analyze feature importance, both for individual features and aggregated by feature type (color, texture, and shape).

In [None]:
# Train a Random Forest classifier (used here only to rank features)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Get feature importances (one value per input column)
importances = rf.feature_importances_

# Work out how the feature vector splits into color / texture / shape columns.
# NOTE(review): this assumes extract_features_batch lays columns out as
# 30 color features, then texture features, then the 7 shape features below,
# in that order -- confirm against features/feature_extraction.py.
shape_features = ['Area', 'Perimeter', 'Circularity', 'Aspect_Ratio',
                  'Equiv_Diameter', 'Extent', 'Solidity']
feature_count = X.shape[1]
color_count = 30
texture_count = feature_count - color_count - len(shape_features)

# Fail early with a clear message rather than an opaque length-mismatch error
# from pandas when the feature vector is shorter than the assumed layout.
if texture_count < 0:
    raise ValueError(
        f"Expected at least {color_count + len(shape_features)} features "
        f"({color_count} color + {len(shape_features)} shape), got {feature_count}"
    )

# Create feature names
color_features = [f'Color_{i+1}' for i in range(color_count)]
texture_features = [f'Texture_{i+1}' for i in range(texture_count)]

feature_names = color_features + texture_features + shape_features

# One row per feature, sorted so the most important come first
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

# Plot top 20 features
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.show()

# Group importance by feature type (labels follow the same column order as above)
feature_types = (
    ['Color'] * color_count
    + ['Texture'] * texture_count
    + ['Shape'] * len(shape_features)
)

# Sum the per-feature importances within each type
importance_by_type = pd.DataFrame({
    'Feature_Type': feature_types,
    'Importance': importances
}).groupby('Feature_Type').sum().reset_index()

# Plot importance by feature type
plt.figure(figsize=(10, 6))
sns.barplot(x='Feature_Type', y='Importance', data=importance_by_type)
plt.title('Feature Importance by Type')
plt.tight_layout()
plt.show()

## 7. Conclusion

In this notebook, we've successfully:
1. Extracted color, texture, and shape features from tomato images
2. Trained SVM models with linear and quadratic kernels
3. Evaluated model performance using accuracy, precision, recall, and F1-score
4. Analyzed feature importance using a Random Forest classifier

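Use the test-set accuracy and classification reports from Section 5 to judge how well each model separates ripe, unripe, and reject tomatoes and to choose between the two SVMs. The saved models (`../models/svm_linear.pkl`, `../models/svm_quadratic.pkl`) and the fitted scaler (`../models/scaler.pkl`) can now be used in the main application for tomato grading.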