<a href="https://colab.research.google.com/github/Gilangp/Machine-learning/blob/main/KUIS%202/KUIS2_2341720042_Gilang_Purnomo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Klasifikasi Jenis Sayur**

### **Import Library**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import cv2 as cv
import os
import joblib
import time

### **Load Dataset**

In [None]:
# Root folder of the training split; each sub-directory holds one vegetable class.
train_dir = "/content/drive/MyDrive/Colab Notebooks/Machine Learning/Vegetable Images/train"

# os.listdir() returns entries in arbitrary, filesystem-dependent order and can
# include stray non-directory entries. Keep only directories and sort them so
# the class-index <-> class-name mapping is identical on every run.
class_names = sorted(
    entry for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
)
print("Class names:", class_names)
print("Number of classes:", len(class_names))

# Number of files found in each class folder, aligned with class_names.
train_counts = [
    len(os.listdir(os.path.join(train_dir, class_name)))
    for class_name in class_names
]

print("\nTraining images per class:")
for class_name, train_count in zip(class_names, train_counts):
    print(f"{class_name}: {train_count} images")

Class names: ['Radish', 'Tomato', 'Potato', 'Cauliflower', 'Pumpkin', 'Cucumber', 'Papaya', 'Cabbage', 'Carrot', 'Capsicum', 'Broccoli', 'Brinjal', 'Bean', 'Bitter_Gourd', 'Bottle_Gourd']
Number of classes: 15

Training images per class:
Radish: 1000 images
Tomato: 1014 images
Potato: 1000 images
Cauliflower: 1001 images
Pumpkin: 1014 images
Cucumber: 1000 images
Papaya: 1000 images
Cabbage: 1014 images
Carrot: 1000 images
Capsicum: 1000 images
Broccoli: 1016 images
Brinjal: 1000 images
Bean: 1000 images
Bitter_Gourd: 1000 images
Bottle_Gourd: 1001 images


### **Pra Pengolahan Data**

In [None]:
# Preprocessed images (float RGB in [0, 1]) and their integer class labels.
X_train = []
y_train = []

img_size = (128, 128)
max_images_per_class = 200  # Cap per class to keep memory and runtime manageable

print("Loading and preprocessing training images...")

for class_idx, class_name in enumerate(class_names):
    class_path = os.path.join(train_dir, class_name)
    # Sorted so the subset selected by the cap is the same on every run.
    image_files = sorted(os.listdir(class_path))

    loaded_count = 0
    for img_file in image_files:
        # Enforce the per-class cap; it counts images that were actually
        # decoded, so unreadable files do not use up the budget.
        if loaded_count >= max_images_per_class:
            break

        img_path = os.path.join(class_path, img_file)

        img = cv.imread(img_path)
        if img is None:
            # cv.imread returns None for files it cannot decode; skip them.
            continue

        img = cv.resize(img, img_size)
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        # Preprocessing enhancements: contrast/brightness boost applied directly
        # to the 8-bit image (avoids a lossy float -> uint8 round trip), then
        # normalisation to [0, 1] and a light blur to suppress noise.
        img = cv.convertScaleAbs(img, alpha=1.2, beta=10)
        img = img / 255.0
        img = cv.GaussianBlur(img, (3, 3), 0)

        X_train.append(img)
        y_train.append(class_idx)
        loaded_count += 1

X_train = np.array(X_train)
y_train = np.array(y_train)

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")

Loading and preprocessing training images...


### **Ekstraksi Fitur**

In [None]:
# Hand-crafted descriptors per image:
#   colour  - mean/std of each HSV channel plus an 8-bin histogram per channel
#   texture - grey-level mean, std and Shannon entropy
color_features = []
texture_features = []

for img in X_train:
    # X_train holds float images in [0, 1]; the OpenCV colour conversions below
    # are run on 8-bit data, so convert once and reuse for both descriptors.
    img_u8 = (img * 255).astype(np.uint8)

    # --- Colour features ---
    hsv = cv.cvtColor(img_u8, cv.COLOR_RGB2HSV)

    h_mean, h_std = np.mean(hsv[:,:,0]), np.std(hsv[:,:,0])
    s_mean, s_std = np.mean(hsv[:,:,1]), np.std(hsv[:,:,1])
    v_mean, v_std = np.mean(hsv[:,:,2]), np.std(hsv[:,:,2])

    # 8-bit OpenCV hue spans [0, 180); saturation and value span [0, 256).
    hist_h = cv.calcHist([hsv], [0], None, [8], [0, 180]).flatten()
    hist_s = cv.calcHist([hsv], [1], None, [8], [0, 256]).flatten()
    hist_v = cv.calcHist([hsv], [2], None, [8], [0, 256]).flatten()

    color_feature = np.array([h_mean, h_std, s_mean, s_std, v_mean, v_std])
    color_feature = np.concatenate([color_feature, hist_h, hist_s, hist_v])
    color_features.append(color_feature)

    # --- Texture features ---
    gray = cv.cvtColor(img_u8, cv.COLOR_RGB2GRAY)
    mean_val = np.mean(gray)
    std_val = np.std(gray)

    # Shannon entropy of the grey-level distribution: H = -sum(p * log2(p)).
    # It must be computed from the probability of each intensity, not from the
    # raw pixel values; empty bins are dropped because 0 * log2(0) is taken as 0.
    gray_counts = np.bincount(gray.ravel(), minlength=256).astype(np.float64)
    gray_prob = gray_counts / gray_counts.sum()
    gray_prob = gray_prob[gray_prob > 0]
    entropy_val = -np.sum(gray_prob * np.log2(gray_prob))

    texture_feature = np.array([mean_val, std_val, entropy_val])
    texture_features.append(texture_feature)

color_features = np.array(color_features)
texture_features = np.array(texture_features)

# Combine features
X_features = np.concatenate([color_features, texture_features], axis=1)
print(f"Combined features shape: {X_features.shape}")

# Standardize and apply PCA (keep enough components for 95% of the variance).
# NOTE(review): the scaler and PCA are fitted on every sample before any
# train/test split is made, so the held-out scores computed from
# X_features_pca are likely slightly optimistic. Consider fitting them on the
# training portion only (e.g. inside a Pipeline).
scaler = StandardScaler()
X_features_scaled = scaler.fit_transform(X_features)

pca = PCA(n_components=0.95)
X_features_pca = pca.fit_transform(X_features_scaled)

print(f"Features after PCA: {X_features_pca.shape}")

### **Model dengan Semua Metode Split**

**Inisialisasi Model**

In [None]:
# Baseline estimators, keyed by the display name used in every results table.
# probability=True makes the SVC expose predict_proba.
models = {
    'SVM': SVC(kernel='rbf', random_state=42, probability=True),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Direct handles to the same estimator objects stored in `models`.
svm_model = models['SVM']
rf_model = models['Random Forest']

**Metode Split 70:30**

In [None]:
# Stratified 70:30 hold-out split of the PCA-reduced features.
X_tr_70, X_te_70, y_tr_70, y_te_70 = train_test_split(
    X_features_pca, y_train,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=y_train
)

print(f"Training samples: {len(y_tr_70)}")
print(f"Testing samples: {len(y_te_70)}")

# Per-model metrics for this split, keyed by model display name.
results_70 = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    start_time = time.time()

    # Fits the shared estimator from `models` in place.
    model.fit(X_tr_70, y_tr_70)
    y_pred = model.predict(X_te_70)

    # The timed region covers fit + predict only, matching the other split
    # experiments. The previously computed class probabilities were never
    # used and only inflated this measurement.
    training_time = time.time() - start_time

    # Weighted averages account for the (slightly) different class sizes.
    accuracy = accuracy_score(y_te_70, y_pred)
    precision = precision_score(y_te_70, y_pred, average='weighted')
    recall = recall_score(y_te_70, y_pred, average='weighted')
    f1 = f1_score(y_te_70, y_pred, average='weighted')

    results_70[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'training_time': training_time
    }

    print(f"{model_name} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  Time:      {training_time:.2f}s")

# Create results DataFrame for 70:30 (one row per model)
results_70_df = pd.DataFrame(results_70).T
print("\nSummary 70:30 Split:")
print(results_70_df)

**Metode Split 80:20**

In [None]:
# Stratified 80:20 hold-out split of the PCA-reduced features.
X_tr_80, X_te_80, y_tr_80, y_te_80 = train_test_split(
    X_features_pca, y_train,
    train_size=0.8, test_size=0.2,
    random_state=42, stratify=y_train,
)

print(f"Training samples: {len(y_tr_80)}")
print(f"Testing samples: {len(y_te_80)}")

# Per-model metrics (and the fitted estimator) for this split.
results_80 = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    start_time = time.time()

    # Build a fresh, unfitted estimator so this split does not reuse one
    # that was fitted elsewhere.
    current_model = (
        SVC(kernel='rbf', random_state=42, probability=True)
        if model_name == 'SVM'
        else RandomForestClassifier(n_estimators=100, random_state=42)
    )

    # fit() returns the estimator itself, so fit and predict can be chained.
    y_pred = current_model.fit(X_tr_80, y_tr_80).predict(X_te_80)

    # Elapsed time covers both fitting and prediction.
    training_time = time.time() - start_time

    accuracy = accuracy_score(y_te_80, y_pred)
    precision = precision_score(y_te_80, y_pred, average='weighted')
    recall = recall_score(y_te_80, y_pred, average='weighted')
    f1 = f1_score(y_te_80, y_pred, average='weighted')

    results_80[model_name] = dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
        training_time=training_time,
        model=current_model,
    )

    print(f"{model_name} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  Time:      {training_time:.2f}s")

# One row per model, one column per recorded field.
results_80_df = pd.DataFrame(results_80).T
print("\nSummary 80:20 Split:")
print(results_80_df)

**Metode Split 90:10**

In [None]:
# Stratified 90:10 hold-out split of the PCA-reduced features.
X_tr_90, X_te_90, y_tr_90, y_te_90 = train_test_split(
    X_features_pca, y_train,
    train_size=0.9, test_size=0.1,
    random_state=42, stratify=y_train,
)

print(f"Training samples: {len(y_tr_90)}")
print(f"Testing samples: {len(y_te_90)}")

# Per-model metrics (and the fitted estimator) for this split.
results_90 = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    start_time = time.time()

    # Build a fresh, unfitted estimator so this split does not reuse one
    # that was fitted elsewhere.
    current_model = (
        SVC(kernel='rbf', random_state=42, probability=True)
        if model_name == 'SVM'
        else RandomForestClassifier(n_estimators=100, random_state=42)
    )

    # fit() returns the estimator itself, so fit and predict can be chained.
    y_pred = current_model.fit(X_tr_90, y_tr_90).predict(X_te_90)

    # Elapsed time covers both fitting and prediction.
    training_time = time.time() - start_time

    accuracy = accuracy_score(y_te_90, y_pred)
    precision = precision_score(y_te_90, y_pred, average='weighted')
    recall = recall_score(y_te_90, y_pred, average='weighted')
    f1 = f1_score(y_te_90, y_pred, average='weighted')

    results_90[model_name] = dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
        training_time=training_time,
        model=current_model,
    )

    print(f"{model_name} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  Time:      {training_time:.2f}s")

# One row per model, one column per recorded field.
results_90_df = pd.DataFrame(results_90).T
print("\nSummary 90:10 Split:")
print(results_90_df)

**Cross-Validation dengan k=5**

In [None]:
# Mean/std of each metric over the 5 folds, keyed by model display name.
cv_results = {}

# Short metric name -> scikit-learn scorer string; cross_validate reports
# each one under the key 'test_<short name>'.
scoring = {'accuracy': 'accuracy',
           'precision': 'precision_weighted',
           'recall': 'recall_weighted',
           'f1': 'f1_weighted'}

for model_name, model in models.items():
    print(f"\nPerforming 5-fold CV for {model_name}...")
    start_time = time.time()

    # Evaluate all four metrics in a single cross-validation run.
    cv_scores = cross_validate(
        model, X_features_pca, y_train,
        cv=5, scoring=scoring,
        return_train_score=False, n_jobs=-1,
    )

    cv_time = time.time() - start_time

    # Per-fold score arrays for each metric.
    acc_folds = cv_scores['test_accuracy']
    prec_folds = cv_scores['test_precision']
    rec_folds = cv_scores['test_recall']
    f1_folds = cv_scores['test_f1']

    stats = {
        'cv_accuracy_mean': np.mean(acc_folds),
        'cv_accuracy_std': np.std(acc_folds),
        'cv_precision_mean': np.mean(prec_folds),
        'cv_precision_std': np.std(prec_folds),
        'cv_recall_mean': np.mean(rec_folds),
        'cv_recall_std': np.std(rec_folds),
        'cv_f1_mean': np.mean(f1_folds),
        'cv_f1_std': np.std(f1_folds),
        'cv_time': cv_time,
    }
    cv_results[model_name] = stats

    print(f"{model_name} CV Results:")
    print(f"  Accuracy:  {stats['cv_accuracy_mean']:.4f} (±{stats['cv_accuracy_std']:.4f})")
    print(f"  Precision: {stats['cv_precision_mean']:.4f} (±{stats['cv_precision_std']:.4f})")
    print(f"  Recall:    {stats['cv_recall_mean']:.4f} (±{stats['cv_recall_std']:.4f})")
    print(f"  F1-Score:  {stats['cv_f1_mean']:.4f} (±{stats['cv_f1_std']:.4f})")
    print(f"  Time:      {cv_time:.2f}s")

# One row per model, one column per statistic.
cv_results_df = pd.DataFrame(cv_results).T
print("\nSummary Cross-Validation:")
print(cv_results_df)

### **Perbandingan Semua Metode**

In [None]:
# Column labels for the four evaluation protocols and the metrics compared.
split_methods = ['70:30', '80:20', '90:10', 'CV-5']
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
model_names = ['SVM', 'Random Forest']

# Hold-out results keyed by their column label.
holdout_results = {'70:30': results_70, '80:20': results_80, '90:10': results_90}

# Hold-out metric name -> key of the matching fold-mean in cv_results.
cv_metric_map = {'accuracy': 'cv_accuracy_mean',
                 'precision': 'cv_precision_mean',
                 'recall': 'cv_recall_mean',
                 'f1_score': 'cv_f1_mean'}

# Long-form table: one row per (model, metric), one column per protocol.
comparison_data = []
for model_name in model_names:
    for metric in metrics:
        row = {'Model': model_name, 'Metric': metric}
        for split_label, split_results in holdout_results.items():
            row[split_label] = split_results[model_name][metric]
        # Cross-validation contributes the mean over folds.
        row['CV-5'] = cv_results[model_name][cv_metric_map[metric]]
        comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)
print("\nComparison DataFrame:")
print(comparison_df)

# Visualization 1: 2x2 grid - accuracy on the top row, F1-score on the bottom
# row, one column per model.
plt.figure(figsize=(12, 8))
bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']

# (metric key, axis/title label, index of the first subplot in that row)
panel_rows = [('accuracy', 'Accuracy', 1), ('f1_score', 'F1-Score', 3)]

for metric_key, metric_label, first_slot in panel_rows:
    for i, model_name in enumerate(model_names):
        model_data = comparison_df[comparison_df['Model'] == model_name]
        metric_data = model_data[model_data['Metric'] == metric_key]
        values = [metric_data[method].values[0] for method in split_methods]

        plt.subplot(2, 2, i + first_slot)
        bars = plt.bar(split_methods, values, color=bar_colors)
        plt.title(f'{model_name} - {metric_label} Comparison')
        plt.ylabel(metric_label)
        plt.ylim(0, 1)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                     f'{value:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Visualization 2: side-by-side bars of every metric for the 80:20 split.
best_split_results = results_80_df
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_score']

plt.figure(figsize=(10, 6))
x = np.arange(len(metrics_to_plot))
width = 0.35

svm_scores = [best_split_results.loc['SVM', metric] for metric in metrics_to_plot]
rf_scores = [best_split_results.loc['Random Forest', metric] for metric in metrics_to_plot]

plt.bar(x - width/2, svm_scores, width, label='SVM', alpha=0.8)
plt.bar(x + width/2, rf_scores, width, label='Random Forest', alpha=0.8)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Performance Comparison (80:20 Split)')
plt.xticks(x, ['Accuracy', 'Precision', 'Recall', 'F1-Score'])
plt.legend()
plt.ylim(0, 1)
plt.grid(axis='y', alpha=0.3)

# Add value labels: SVM bars sit left of each tick, Random Forest bars right.
for i, v in enumerate(svm_scores):
    plt.text(i - width/2, v + 0.01, f'{v:.4f}', ha='center')
for i, v in enumerate(rf_scores):
    plt.text(i + width/2, v + 0.01, f'{v:.4f}', ha='center')

plt.tight_layout()
plt.show()

### **Hyperparameter Tuning untuk Model Terbaik**

In [None]:
# Use 80:20 split for tuning (based on previous results)
X_tr_tune = X_tr_80
y_tr_tune = y_tr_80
X_te_tune = X_te_80
y_te_tune = y_te_80

# Hyperparameter tuning for SVM
print("Performing GridSearchCV for SVM...")

# One sub-grid per kernel. The linear kernel ignores `gamma`, so crossing it
# with every gamma value would refit the same linear model once per gamma
# without exploring anything new.
param_grid_svm = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]

svm_tune = SVC(random_state=42, probability=True)
# 3-fold CV on the training portion only; the 20% hold-out stays untouched
# until the final evaluation below.
grid_svm = GridSearchCV(svm_tune, param_grid_svm, cv=3, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid_svm.fit(X_tr_tune, y_tr_tune)
tuning_time = time.time() - start_time

print("Best SVM parameters:", grid_svm.best_params_)
print("Best SVM score:", grid_svm.best_score_)
print(f"Tuning time: {tuning_time:.2f}s")

# best_estimator_ has already been refit on the full tuning training set
# (GridSearchCV's default refit=True), so it can predict directly.
best_svm = grid_svm.best_estimator_
y_pred_tune = best_svm.predict(X_te_tune)

# Evaluate tuned model on the hold-out set
tuned_accuracy = accuracy_score(y_te_tune, y_pred_tune)
tuned_precision = precision_score(y_te_tune, y_pred_tune, average='weighted')
tuned_recall = recall_score(y_te_tune, y_pred_tune, average='weighted')
tuned_f1 = f1_score(y_te_tune, y_pred_tune, average='weighted')

print("\nTuned SVM Results:")
print(f"Accuracy:  {tuned_accuracy:.4f}")
print(f"Precision: {tuned_precision:.4f}")
print(f"Recall:    {tuned_recall:.4f}")
print(f"F1-Score:  {tuned_f1:.4f}")

# Compare with the default SVM evaluated on the same 80:20 split
default_svm_accuracy = results_80['SVM']['accuracy']
improvement = tuned_accuracy - default_svm_accuracy
print(f"Improvement after tuning: {improvement:.4f}")

### **Simpan Model Terbaik**

In [None]:
# Persist the tuned classifier together with the fitted scaler and PCA.
artifacts = {
    'best_vegetable_classifier_svm.pkl': best_svm,
    'feature_scaler.pkl': scaler,
    'pca_transformer.pkl': pca,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Best model and preprocessing objects saved successfully!")
print("Files created:")
print("- best_vegetable_classifier_svm.pkl")
print("- feature_scaler.pkl")
print("- pca_transformer.pkl")

# Round-trip check: reload each file that was just written.
loaded_model, loaded_scaler, loaded_pca = (
    joblib.load(artifact_path) for artifact_path in artifacts
)

print("Model verification - loaded successfully!")

In [None]:
# Final comparison table: one row per model, accuracy/F1 columns per protocol.
holdout_by_split = {'70:30': results_70, '80:20': results_80, '90:10': results_90}
compared_models = ['SVM', 'Random Forest']

final_comparison = pd.DataFrame()

for split_method, split_results in holdout_by_split.items():
    for model_name in compared_models:
        results = split_results[model_name]
        final_comparison.loc[model_name, f'{split_method}_Accuracy'] = results['accuracy']
        final_comparison.loc[model_name, f'{split_method}_F1'] = results['f1_score']

# Cross-validation columns use the mean over folds.
for model_name in compared_models:
    final_comparison.loc[model_name, 'CV_Accuracy'] = cv_results[model_name]['cv_accuracy_mean']
    final_comparison.loc[model_name, 'CV_F1'] = cv_results[model_name]['cv_f1_mean']

print("\nFINAL COMPARISON TABLE:")
print(final_comparison.round(4))

# Pick the (model, hold-out split) pair with the highest accuracy. Only the
# three hold-out splits are searched; the strict '>' keeps the earliest pair
# on ties.
best_accuracy, best_method, best_model = 0, "", ""

for split_method in holdout_by_split:
    for model_name in compared_models:
        accuracy = final_comparison.loc[model_name, f'{split_method}_Accuracy']
        if accuracy > best_accuracy:
            best_accuracy, best_method, best_model = accuracy, split_method, model_name

print(f"\nBEST COMBINATION:")
print(f"Model: {best_model}")
print(f"Split Method: {best_method}")
print(f"Accuracy: {best_accuracy:.4f}")

print(f"\nBEST TUNED MODEL (SVM):")
print(f"Accuracy: {tuned_accuracy:.4f}")
print(f"Parameters: {grid_svm.best_params_}")