In [1]:
import torch
from torch.utils.data import DataLoader, random_split, Subset, Dataset, ConcatDataset
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from dataset import AppleBrowningDataset
from model import SVMModel, RandomForestModel
from train_utils import train_svm, train_rf

#### HYPERPARAMETERS ####
BS = 32  # not referenced in the visible part of this script
# Seed for the train/test split, so the partition (and therefore every metric
# printed further down) is the same from run to run.
SPLIT_SEED = 42

# Load dataset 1
excel_file = "Full Data Analysis- file, new revised_Sundus 1.xlsx"
sheet_name = "Exp. 1"
transform = None

# Build the first dataset
dataset = AppleBrowningDataset(excel_file, sheet_name, transform=transform)


# Load dataset 2 (same workbook, different sheet name)
sheet_name2 = "Exp. 2"

# Build the second dataset
dataset2 = AppleBrowningDataset(excel_file, sheet_name2, transform=transform)

# Both datasets are evaluated as a single pool of samples.
combined_dataset = ConcatDataset([dataset, dataset2])


# 70/30 split; the test size is derived by subtraction so the two sizes always
# add up to the full length even when 0.7 * len is not an integer.
train_size = int(0.7 * len(combined_dataset))
test_size = len(combined_dataset) - train_size
train_set, test_set = random_split(
    combined_dataset,
    [train_size, test_size],
    # An explicitly seeded generator makes the random partition reproducible.
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


# Debug: check the class distribution of each split.
# NOTE(review): summing the labels to count positives assumes they are
# encoded as 0/1 -- confirm against AppleBrowningDataset.
train_labels = [label for _, label in train_set]
test_labels = [label for _, label in test_set]
train_positive = sum(train_labels)
test_positive = sum(test_labels)

print(f"Training set size: {train_size}, Test set size: {test_size}")
print(f"Training set class distribution: {train_positive} positive, {len(train_labels) - train_positive} negative")
print(f"Test set class distribution: {test_positive} positive, {len(test_labels) - test_positive} negative")


# Evaluate a fitted model on a dataset and print its metrics

def evaluate_model(model, dataset):
    """Predict every sample of ``dataset`` with ``model`` and print the metrics.

    Samples are fed to the model one at a time, without any feature scaling,
    and the collected predictions are compared with the dataset labels.
    The function prints the prediction/label distributions, the accuracy, a
    classification report, and a manually computed accuracy as a sanity check.

    Args:
        model: Object exposing ``predict(X)`` that accepts a 2-D array of
            shape ``(1, n_features)`` and returns the prediction(s) for it.
        dataset: Object supporting ``len()`` and integer indexing, where
            ``dataset[i]`` yields a ``(features, label)`` pair. Features and
            labels may be torch tensors, NumPy arrays or plain Python values.

    Returns:
        None. All results are reported through ``print``. When no
        predictions or labels were collected, only an error line is printed.
    """
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for i in range(len(dataset)):
            features, labels = dataset[i]

            # Detach and move to host memory first: Tensor.numpy() raises for
            # tensors that require grad or that do not live on the CPU.
            if isinstance(features, torch.Tensor):
                features = features.detach().cpu().numpy()
            if isinstance(labels, torch.Tensor):
                labels = labels.detach().cpu().numpy()

            # np.asarray also accepts plain lists/tuples; reshape(1, -1)
            # flattens any input rank into a single-row batch.
            sample = np.asarray(features).reshape(1, -1)

            # Model prediction on the unscaled features
            preds = model.predict(sample)

            # Flatten before collecting so both arrays end up 1-D; an (N, 1)
            # array compared with an (N,) array would broadcast to (N, N)
            # and corrupt the manual accuracy computed below.
            all_preds.extend(np.ravel(preds))
            all_labels.extend(np.ravel(labels))

    # Bail out before printing any statistics when nothing was collected.
    if not all_preds or not all_labels:
        print("Error: No predictions or labels found.")
        return

    # Convert the lists to NumPy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Debug: check the distribution of predictions and labels.
    # NOTE(review): the "positive" counts are plain sums, which assumes the
    # classes are encoded as 0/1 -- confirm against the dataset.
    positive_preds = np.sum(all_preds)
    positive_labels = np.sum(all_labels)
    print(f"Predictions distribution: {positive_preds} positive, {len(all_preds) - positive_preds} negative")
    print(f"Labels distribution: {positive_labels} positive, {len(all_labels) - positive_labels} negative")

    accuracy = accuracy_score(all_labels, all_preds)
    # zero_division=1 sets ill-defined precision/recall to 1 instead of warning.
    report = classification_report(all_labels, all_preds, zero_division=1)
    print(f"Test Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")

    # Manual comparison of predictions and labels
    correct_predictions = np.sum(all_preds == all_labels)
    total_samples = len(all_labels)
    manual_accuracy = correct_predictions / total_samples
    print(f"Accuratezza calcolata manualmente: {manual_accuracy}")

    # Cross-check the two accuracy values, tolerating float rounding noise.
    if abs(accuracy - manual_accuracy) > 1e-5:
        print(f"Attenzione: Differenza rilevante tra l'accuratezza calcolata e quella manuale ({accuracy} vs {manual_accuracy})")
    else:
        print("Le due accuratezze coincidono.")



# Fit the SVM model on the training split. The cross-validation announced in
# the banner is presumably performed inside train_svm -- verify in train_utils.
print("Evaluating SVM Model with K-Fold Cross-Validation")
svm_model = SVMModel()
train_svm(svm_model, train_set)



# Fit the RandomForest model on the same training split (same caveat as above
# regarding where the cross-validation actually happens).
print("Evaluating RandomForest Model with K-Fold Cross-Validation")
rf_model = RandomForestModel()
train_rf(rf_model, train_set)




# Score both fitted models on the held-out test split, SVM first.
print("\n\n")
for banner, fitted_model in (
    ("Evaluating SVM Model on Test Set", svm_model),
    ("Evaluating RandomForest Model on Test Set", rf_model),
):
    print(banner)
    evaluate_model(fitted_model, test_set)


Training set size: 223, Test set size: 96
Training set class distribution: 63 positive, 160 negative
Test set class distribution: 31 positive, 65 negative
Evaluating SVM Model with K-Fold Cross-Validation
Training data shape: 223, Labels distribution: 63 positive, 160 negative
Fitting 5 folds for each of 3969 candidates, totalling 19845 fits
[CV 1/5] END model__C=0.001, model__class_weight=balanced, model__coef0=0.0, model__degree=2, model__gamma=0.001, model__kernel=linear;, score=0.711 total time=   0.0s
[CV 2/5] END model__C=0.001, model__class_weight=balanced, model__coef0=0.0, model__degree=2, model__gamma=0.001, model__kernel=linear;, score=0.733 total time=   0.0s
[CV 3/5] END model__C=0.001, model__class_weight=balanced, model__coef0=0.0, model__degree=2, model__gamma=0.001, model__kernel=linear;, score=0.667 total time=   0.0s
[CV 4/5] END model__C=0.001, model__class_weight=balanced, model__coef0=0.0, model__degree=2, model__gamma=0.001, model__kernel=linear;, score=0.795 tot