In [11]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time
import joblib
import os
from tabulate import tabulate


In [12]:
# Load CIFAR-10 Dataset
def load_cifar_batch(file_path):
    """Read one pickled batch file and return its data and labels.

    Parameters
    ----------
    file_path : str
        Path of the pickled batch file to open.

    Returns
    -------
    tuple
        ``(data, labels)`` -- the values stored under the ``b'data'`` and
        ``b'labels'`` keys of the unpickled object.

    Raises
    ------
    KeyError
        If the unpickled mapping lacks ``b'data'`` or ``b'labels'``.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at batch files
    from a trusted source.
    """
    # encoding='bytes' keeps the dictionary keys as byte strings, which is
    # why the lookups below use b'...' literals.
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle, encoding='bytes')
    return payload[b'data'], payload[b'labels']

# Load training batches (data_batch_1 to data_batch_5)
# Single definition of the dataset directory so the training and test batches
# are always read from the same place.
CIFAR10_DIR = 'datasets/cifar10'

train_data = []
train_labels = []
for i in range(1, 6):
    data, labels = load_cifar_batch(os.path.join(CIFAR10_DIR, f'data_batch_{i}'))
    train_data.append(data)
    train_labels.append(labels)

# Combine training data: stack the per-batch sample matrices row-wise and
# concatenate the per-batch label sequences into one 1-D array.
X_train = np.vstack(train_data)
y_train = np.hstack(train_labels)

# Load test batch
X_test, y_test = load_cifar_batch(os.path.join(CIFAR10_DIR, 'test_batch'))
# np.hstack above already yields an ndarray for the training labels; convert
# the test labels as well so both label vectors have the same type no matter
# how the batch file stores them.
y_test = np.asarray(y_test)

# Reshape and normalize data
# Flatten every sample to one row and divide by 255 (assumes the raw values
# are 0-255 pixel intensities -- TODO confirm against the dataset).
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255.0
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32') / 255.0

print(f'Training data shape: {X_train.shape}')
print(f'Test data shape: {X_test.shape}')

# Preprocessing
# Create models subfolder
os.makedirs('models', exist_ok=True)

# Scale features
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Training data shape: (50000, 3072)
Test data shape: (10000, 3072)


In [13]:

# Training and Inference
# Define models
# `multi_class` is deliberately not passed to LogisticRegression: the argument
# is deprecated in recent scikit-learn releases, and the default lbfgs solver
# already fits a multinomial model on multiclass targets.
models = {
    'CIFAR_Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'CIFAR_SGD Classifier': SGDClassifier(random_state=42, max_iter=1000),
    'CIFAR_Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Initialize results storage
results = []
classification_reports = {}

# Array view of the test labels so the element-wise comparison with the
# predictions below works whether y_test is a list or an ndarray.
y_test_arr = np.asarray(y_test)

# Train and evaluate each model
for model_name, model in models.items():
    # Measure training time with perf_counter, the clock intended for
    # measuring elapsed intervals (unaffected by system clock adjustments).
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predict the test set once and reuse the predictions for both the
    # accuracy and the classification report (model.score would run the
    # same prediction a second time).
    y_pred = model.predict(X_test)

    # Calculate accuracies
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = float(np.mean(y_pred == y_test_arr))

    # Store results
    results.append({
        'Model': model_name,
        'Training Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Training Time (s)': training_time
    })

    # Generate classification report (dict form, consumed further below)
    classification_reports[model_name] = classification_report(y_test, y_pred, output_dict=True)

    # Save the model, e.g. 'CIFAR_SGD Classifier' -> models/cifar_sgd_classifier.pkl
    joblib.dump(model, f'models/{model_name.replace(" ", "_").lower()}.pkl')

# Convert results to DataFrame and format
results_df = pd.DataFrame(results)
for metric_column in ('Training Accuracy', 'Test Accuracy', 'Training Time (s)'):
    results_df[metric_column] = results_df[metric_column].round(4)

# Print results table
print("\nModel Performance Comparison:")
print(tabulate(results_df, headers='keys', tablefmt='psql', showindex=False))

# Keys of the report dict that are aggregate rows rather than individual classes.
aggregate_keys = {'accuracy', 'micro avg', 'macro avg', 'weighted avg'}
# (column header in the printed table, key inside each report entry)
report_columns = [
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1-score'),
    ('Support', 'support'),
]

# Print classification reports
print("\nClassification Reports (Test Set):")
for model_name in models.keys():
    print(f"\n{model_name}:")
    report = classification_reports[model_name]
    # Take the class rows from the report itself instead of assuming exactly
    # ten classes named '0'..'9'; for CIFAR-10 this yields the same rows.
    class_keys = [key for key in report if key not in aggregate_keys]
    row_keys = class_keys + ['weighted avg']
    table = {'Class': class_keys + ['Weighted Avg']}
    for header, metric in report_columns:
        table[header] = [report[key][metric] for key in row_keys]
    report_df = pd.DataFrame(table).round(4)
    print(tabulate(report_df, headers='keys', tablefmt='psql', showindex=False))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model Performance Comparison:
+---------------------------+---------------------+-----------------+---------------------+
| Model                     |   Training Accuracy |   Test Accuracy |   Training Time (s) |
|---------------------------+---------------------+-----------------+---------------------|
| CIFAR_Logistic Regression |              0.5151 |          0.3656 |             127.28  |
| CIFAR_SGD Classifier      |              0.3763 |          0.2936 |            2620.39  |
| CIFAR_Decision Tree       |              1      |          0.2664 |             109.704 |
+---------------------------+---------------------+-----------------+---------------------+

Classification Reports (Test Set):

CIFAR_Logistic Regression:
+--------------+-------------+----------+------------+-----------+
| Class        |   Precision |   Recall |   F1-Score |   Support |
|--------------+-------------+----------+------------+-----------|
| 0            |      0.4109 |   0.422  |     0.4164 |      

In [14]:
# Write the summary table and each per-model classification report to CSV
# files inside the results/ folder (created on demand).
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)

summary_csv = os.path.join(results_dir, 'cifar10_basic_results.csv')
results_df.to_csv(summary_csv, index=False)

# Save classification reports
for model_name, report in classification_reports.items():
    # File stem mirrors the model name: lower-cased, spaces become underscores.
    file_stem = model_name.replace(" ", "_").lower()
    report_csv = os.path.join(results_dir, f'{file_stem}_classification_report.csv')
    # Transpose so each report entry becomes a row and each metric a column.
    report_df = pd.DataFrame(report).T
    report_df.to_csv(report_csv, index=True)