In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import datetime
import json

# Configure seaborn to use its "whitegrid" style.
sns.set(style="whitegrid")

In [2]:
# 1. Load the MNIST dataset
print("Carregando o dataset MNIST...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data
y = mnist.target.astype(int)  # labels are cast to integers before use
print(f"Shape dos dados: {X.shape}, Labels: {y.shape}")

# Results accumulator that later cells extend and finally write to disk.
# Keys are inserted in the order: timestamp, dataset_info, experiments.
results = {}
results['timestamp'] = datetime.datetime.now().isoformat()
results['dataset_info'] = dict(
    shape=X.shape,
    num_classes=len(np.unique(y)),
    classes=np.unique(y).tolist(),
)
results['experiments'] = []

Carregando o dataset MNIST...
Shape dos dados: (70000, 784), Labels: (70000,)
Shape dos dados: (70000, 784), Labels: (70000,)


In [3]:
# 2. Funções auxiliares
def split_data(X, y, train_size, val_size, test_size, random_state=42):
    """Split ``X``/``y`` into stratified train, validation and test subsets.

    The split is done with two chained ``train_test_split`` calls: the first
    carves out the training set using ``train_size``; the second divides the
    remainder between validation and test.

    Note that ``val_size`` and ``test_size`` are only used as a *ratio*
    (``val_size / (val_size + test_size)``) applied to whatever is left after
    the training split.

    Args:
        X: Feature collection to split.
        y: Labels aligned with ``X``; also used for stratification.
        train_size: Forwarded as ``train_size`` to the first split.
        val_size: Validation share, relative to ``test_size``.
        test_size: Test share, relative to ``val_size``.
        random_state: Seed forwarded to both splits.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``val_size`` or ``test_size`` is not strictly positive.
    """
    import warnings

    # Fail fast: a non-positive size would otherwise surface as a
    # ZeroDivisionError, or as an error only after the first split has run.
    if val_size <= 0 or test_size <= 0:
        raise ValueError(
            "val_size and test_size must both be positive, "
            f"got val_size={val_size!r}, test_size={test_size!r}"
        )

    # When all sizes are fractions, make the "ratio only" semantics visible
    # instead of silently producing different proportions than requested.
    sizes = (train_size, val_size, test_size)
    if all(isinstance(size, float) for size in sizes) and abs(sum(sizes) - 1.0) > 1e-9:
        warnings.warn(
            f"train_size + val_size + test_size = {sum(sizes):.6g}, not 1.0; "
            "validation and test will share everything left after the training "
            "split in the ratio val_size:test_size.",
            stacklevel=2,
        )

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=train_size, random_state=random_state, stratify=y)
    # Fraction of the remainder that goes to validation.
    val_relative_size = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=val_relative_size, random_state=random_state, stratify=y_temp)
    return X_train, X_val, X_test, y_train, y_val, y_test

def apply_pca(X_train, X_val, X_test, n_components=0.95, whiten=True, random_state=42):
    """Fit PCA on the training data and project all three splits with it.

    The PCA is fitted on ``X_train`` only; ``X_val`` and ``X_test`` are
    transformed with that fitted object, so no information from them is used
    when fitting.

    Args:
        X_train: Training features; must expose ``.shape`` (used for logging).
        X_val: Validation features.
        X_test: Test features.
        n_components: Forwarded unchanged to ``PCA``.
        whiten: Forwarded to ``PCA``. Defaults to ``True`` (the previous
            hard-coded value). Whitening rescales the projected components,
            which changes the distances seen by distance-based models such
            as k-NN; pass ``False`` to keep the components' relative scale.
        random_state: Seed forwarded to ``PCA``. Defaults to ``42`` (the
            previous hard-coded value).

    Returns:
        Tuple ``(X_train_pca, X_val_pca, X_test_pca, pca)`` where ``pca`` is
        the fitted PCA object.
    """
    pca = PCA(n_components=n_components, whiten=whiten, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)
    print(f"PCA: de {X_train.shape[1]} para {X_train_pca.shape[1]} dimensões.")
    return X_train_pca, X_val_pca, X_test_pca, pca

def train_knn(X_train, y_train, X_val, y_val, k, metric='euclidean'):
    """Fit a k-nearest-neighbours classifier and score it on the validation set.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
        k: Number of neighbours (passed as ``n_neighbors``).
        metric: Distance metric name passed to the classifier.

    Returns:
        Tuple ``(model, validation_accuracy)``.
    """
    settings = {'n_neighbors': k, 'metric': metric, 'n_jobs': -1}
    model = KNeighborsClassifier(**settings)
    model.fit(X_train, y_train)

    # Accuracy on the held-out validation data, also echoed to stdout.
    val_accuracy = accuracy_score(y_val, model.predict(X_val))
    print(f"k-NN (k={k}, metric={metric}): Validação: {val_accuracy:.4f}")
    return model, val_accuracy

def train_logistic(X_train, y_train, X_val, y_val):
    """Fit a logistic-regression classifier and score it on the validation set.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        Tuple ``(clf, validation_accuracy)``.
    """
    # ``multi_class`` is deliberately not passed: scikit-learn deprecated the
    # argument in 1.5 and schedules its removal, so passing it emits a
    # FutureWarning today and will fail once it is gone. With the 'lbfgs'
    # solver the default already fits a multinomial model whenever the target
    # has more than two classes (for a binary target the default is a plain
    # binary logistic regression).
    clf = LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"Logistic Regression: Validação: {acc:.4f}")
    return clf, acc

In [4]:
# 3. Data split and PCA
print("\nDividindo os dados...")
# Example: 60% train, 20% validation, 20% test
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X, y, train_size=0.6, val_size=0.2, test_size=0.2)

print("Aplicando PCA...")
X_train_pca, X_val_pca, X_test_pca, pca = apply_pca(
    X_train, X_val, X_test, n_components=0.95)

# Record the PCA outcome in the results accumulator
results['pca_info'] = dict(
    original_dimensions=X_train.shape[1],
    reduced_dimensions=X_train_pca.shape[1],
    # Cast to a plain float so the value is JSON-serialisable
    variance_explained=float(pca.explained_variance_ratio_.sum()),
)


Dividindo os dados...
Aplicando PCA...
Aplicando PCA...
PCA: de 784 para 154 dimensões.
PCA: de 784 para 154 dimensões.


In [5]:
# 4. k-NN experiments
print("\n=== Experimentos com k-NN ===")
# Walk the metric x k grid; metric is the outer dimension, k the inner one.
for metric, k in ((m, n) for m in ['euclidean', 'manhattan'] for n in [3, 5, 7]):
    knn, val_acc = train_knn(X_train_pca, y_train, X_val_pca, y_val, k, metric)

    # Score the freshly fitted model on the test split as well
    y_test_pred = knn.predict(X_test_pca)
    test_acc = accuracy_score(y_test, y_test_pred)
    print(f"Teste k-NN (k={k}, metric={metric}): {test_acc:.4f}\n")

    # Store one record per configuration
    experiment_result = dict(
        algorithm='k-NN',
        parameters=dict(k=k, metric=metric),
        validation_accuracy=float(val_acc),
        test_accuracy=float(test_acc),
    )
    results['experiments'].append(experiment_result)


=== Experimentos com k-NN ===
k-NN (k=3, metric=euclidean): Validação: 0.9006
k-NN (k=3, metric=euclidean): Validação: 0.9006
Teste k-NN (k=3, metric=euclidean): 0.8997

Teste k-NN (k=3, metric=euclidean): 0.8997

k-NN (k=5, metric=euclidean): Validação: 0.8965
k-NN (k=5, metric=euclidean): Validação: 0.8965
Teste k-NN (k=5, metric=euclidean): 0.8934

Teste k-NN (k=5, metric=euclidean): 0.8934

k-NN (k=7, metric=euclidean): Validação: 0.8890
k-NN (k=7, metric=euclidean): Validação: 0.8890
Teste k-NN (k=7, metric=euclidean): 0.8839

Teste k-NN (k=7, metric=euclidean): 0.8839

k-NN (k=3, metric=manhattan): Validação: 0.9049
k-NN (k=3, metric=manhattan): Validação: 0.9049
Teste k-NN (k=3, metric=manhattan): 0.9032

Teste k-NN (k=3, metric=manhattan): 0.9032

k-NN (k=5, metric=manhattan): Validação: 0.8983
k-NN (k=5, metric=manhattan): Validação: 0.8983
Teste k-NN (k=5, metric=manhattan): 0.8955

Teste k-NN (k=5, metric=manhattan): 0.8955

k-NN (k=7, metric=manhattan): Validação: 0.8906
k

In [6]:
# 5. Linear classifier experiment
print("\n=== Experimentos com Logistic Regression ===")
logistic, val_acc = train_logistic(X_train_pca, y_train, X_val_pca, y_val)

# These test predictions stay in `y_test_pred`, which the final-evaluation
# cell reads for its confusion matrix and classification report.
y_test_pred = logistic.predict(X_test_pca)
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Teste Logistic Regression: {test_acc:.4f}")

# Store the logistic-regression record
experiment_result = dict(
    algorithm='Logistic Regression',
    parameters=dict(max_iter=1000, solver='lbfgs', multi_class='multinomial'),
    validation_accuracy=float(val_acc),
    test_accuracy=float(test_acc),
)
results['experiments'].append(experiment_result)


=== Experimentos com Logistic Regression ===




Logistic Regression: Validação: 0.9195
Teste Logistic Regression: 0.9199


In [7]:
# 6. Final evaluation
print("\n=== Avaliação Final ===")
# NOTE(review): `y_test_pred` holds whichever predictions were assigned most
# recently. In the visible cell order that is the logistic-regression cell, so
# the matrix and report below describe that model, which is not necessarily
# the one reported as 'best_algorithm'.
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred, output_dict=True)

print("Matriz de Confusão:")
print(conf_matrix)
print("\nRelatório de Classificação:")
print(classification_report(y_test, y_test_pred))

# Experiment with the highest test accuracy (the earliest one wins ties).
# NOTE(review): this selects the "best" model using test-set accuracy;
# consider selecting on validation accuracy instead.
best_experiment = max(results['experiments'], key=lambda exp: exp['test_accuracy'])

# Attach the final evaluation to the results accumulator
results['final_evaluation'] = dict(
    confusion_matrix=conf_matrix.tolist(),
    classification_report=class_report,
    best_algorithm=best_experiment['algorithm'],
    best_test_accuracy=best_experiment['test_accuracy'],
)


=== Avaliação Final ===
Matriz de Confusão:
[[1339    0    4    2    2   12    6    3    7    5]
 [   1 1535    6    6    1    5    0    4   14    3]
 [  11   13 1253   26   15    6   23   20   23    8]
 [   8   11   29 1267    1   50    7   19   27   10]
 [   3    8    8    7 1264    0   12    5   10   48]
 [  14    7   20   34   10 1098   24    2   41   13]
 [  10    5   17    1    9   13 1314    2    4    0]
 [   3    6   18    8   11    2    0 1362    5   44]
 [  15   27   17   33   11   39   14    6 1191   12]
 [  12    5    6   17   42   10    1   33    9 1256]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1380
           1       0.95      0.97      0.96      1575
           2       0.91      0.90      0.90      1398
           3       0.90      0.89      0.90      1429
           4       0.93      0.93      0.93      1365
           5       0.89      0.87      0.88      1263
           6    

In [8]:
# 7. Save the results to disk
# One timestamp is shared by every file written from this run.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_filename = f"mnist_experiments_{timestamp}.json"

# Explicit encoding so the output does not depend on the platform default.
with open(results_filename, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"\n=== Resultados salvos em {results_filename} ===")
print(f"Melhor algoritmo: {results['final_evaluation']['best_algorithm']}")
print(f"Melhor acurácia de teste: {results['final_evaluation']['best_test_accuracy']:.4f}")

# Also write a human-readable text summary
summary_filename = f"mnist_summary_{timestamp}.txt"
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write(f"MNIST Classification Experiments - {results['timestamp']}\n")
    f.write("=" * 60 + "\n\n")
    f.write("Dataset Info:\n")
    f.write(f"  Shape: {results['dataset_info']['shape']}\n")
    f.write(f"  Number of classes: {results['dataset_info']['num_classes']}\n\n")
    f.write("PCA Info:\n")
    f.write(f"  Original dimensions: {results['pca_info']['original_dimensions']}\n")
    f.write(f"  Reduced dimensions: {results['pca_info']['reduced_dimensions']}\n")
    f.write(f"  Variance explained: {results['pca_info']['variance_explained']:.4f}\n\n")
    f.write("Experiment Results:\n")
    for exp in results['experiments']:
        # Include the hyper-parameters: several experiments share the same
        # algorithm name and would otherwise be indistinguishable here.
        params = ", ".join(
            f"{name}={value}" for name, value in exp['parameters'].items())
        f.write(
            f"  {exp['algorithm']} ({params}): "
            f"Val={exp['validation_accuracy']:.4f}, Test={exp['test_accuracy']:.4f}\n")
    f.write(f"\nBest Algorithm: {results['final_evaluation']['best_algorithm']}\n")
    f.write(f"Best Test Accuracy: {results['final_evaluation']['best_test_accuracy']:.4f}\n")

print(f"Resumo salvo em {summary_filename}")


=== Resultados salvos em mnist_experiments_20250608_172159.json ===
Melhor algoritmo: Logistic Regression
Melhor acurácia de teste: 0.9199
Resumo salvo em mnist_summary_20250608_172159.txt


In [None]:
# 8. Criar arquivo CSV com os resultados
import csv
import pandas as pd

# Flatten each experiment into one CSV row: the fixed columns first, then
# that experiment's own hyper-parameters as extra columns.
csv_data = [
    {
        'Algorithm': exp['algorithm'],
        'Validation_Accuracy': exp['validation_accuracy'],
        'Test_Accuracy': exp['test_accuracy'],
        **exp['parameters'],
    }
    for exp in results['experiments']
]

# Build the DataFrame and save it as CSV.
# Experiments do not share the same parameters, so some cells are missing.
# convert_dtypes() moves the columns to nullable dtypes, which keeps integer
# parameters such as k written as "3" instead of being upcast to "3.0".
df = pd.DataFrame(csv_data).convert_dtypes()
csv_filename = f"mnist_results_{timestamp}.csv"
df.to_csv(csv_filename, index=False)

print(f"\n=== Resultados CSV salvos em {csv_filename} ===")
print("\nTabela de Resultados:")
print(df.to_string(index=False))


def _metrics_row(label, metrics):
    """Build one row of the detailed CSV from a classification-report entry."""
    return {
        'Class': label,
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1_Score': metrics['f1-score'],
        'Support': int(metrics['support'])
    }


# Second, more detailed CSV with per-class metrics.
# Class labels come from the dataset info recorded in `results` rather than a
# hard-coded range; labels missing from the report are skipped.
detailed_csv_data = [
    _metrics_row(class_label, class_report[str(class_label)])
    for class_label in results['dataset_info']['classes']
    if str(class_label) in class_report
]

# Append the overall (weighted-average) metrics
detailed_csv_data.append(_metrics_row('Overall', class_report['weighted avg']))

df_detailed = pd.DataFrame(detailed_csv_data)
detailed_csv_filename = f"mnist_detailed_metrics_{timestamp}.csv"
df_detailed.to_csv(detailed_csv_filename, index=False)

print(f"\nMétricas detalhadas salvas em {detailed_csv_filename}")
print("\nMétricas por Classe:")
print(df_detailed.to_string(index=False))