<a href="https://colab.research.google.com/github/Jieoi/Food_supply_analysis/blob/master/bitbybit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist

from sklearn.utils import shuffle
import time

In [17]:
# Define a simple CNN model
def create_model():
    model = models.Sequential([
        layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Data splitting function: Split data into random subsets
def split_data(x_train, y_train, subset_fraction=0.1, num_steps=5):
    subsets = []
    for step in range(num_steps):
        # Select a random subset of data
        x_train_subset, y_train_subset = shuffle(x_train, y_train, random_state=step)
        subset_size = int(len(x_train_subset) * subset_fraction)
        x_train_subset = x_train_subset[:subset_size]
        y_train_subset = y_train_subset[:subset_size]
        subsets.append((x_train_subset, y_train_subset))
        print(f"Subset {step + 1}/{num_steps} - Size: {subset_size} images")
    return subsets

# Reusable model training function: Train the model on a given dataset
def train_model_on_data(model, x_train_data, y_train_data, epochs=5):
    model.fit(x_train_data, y_train_data, epochs=epochs, batch_size=32, verbose=0)

# Model averaging function: Average model weights after training
def average_model_weights(model, model_weights):
    new_weights = model.get_weights()
    averaged_weights = [(w1 + w2) / 2 for w1, w2 in zip(model_weights, new_weights)]
    model.set_weights(averaged_weights)
    return model.get_weights()

# # Function to train and average models on random subsets
# def train_and_average_models(x_train, y_train, model, x_test, y_test, num_steps=5, subset_fraction=0.1):
#     model_weights = model.get_weights()
#     total_start_time = time.time()  # Track total time for this function
#     small_models = []  # To store the small models trained on each subset

#     # Split data into random subsets
#     subsets = split_data(x_train, y_train, subset_fraction, num_steps)

#     for step, (x_train_subset, y_train_subset) in enumerate(subsets):
#         start_time = time.time()  # Track time for each step
#         print(f"Training step {step + 1}/{num_steps}")

#         # Train the model on this subset
#         train_model_on_data(model, x_train_subset, y_train_subset)

#         # Average the model weights
#         model_weights = average_model_weights(model, model_weights)

#         # Store the trained model for this subset
#         small_models.append(model_weights)

#         # Test the model on the test set after training on the current subset
#         test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
#         print(f"Test accuracy after step {step + 1}: {test_acc:.4f}")

#         step_time = time.time() - start_time
#         print(f"Time taken for step {step + 1}: {step_time:.2f} seconds")

#     total_time = time.time() - total_start_time
#     print(f"\nTotal time for training and averaging models: {total_time:.2f} seconds")

#     return model, small_models
#  train and average models on random subsets
def train_and_average_models(x_train, y_train, model, x_test, y_test, num_steps=5, subset_fraction=0.1):
    model_weights = model.get_weights()
    total_start_time = time.time()  # Track total time for this function
    small_models = []  # To store the small models trained on each subset
    small_model_accuracies = []  # To store the accuracy of each small model

    # Split data into random subsets
    subsets = split_data(x_train, y_train, subset_fraction, num_steps)

    for step, (x_train_subset, y_train_subset) in enumerate(subsets):
        # Start time for training and averaging (exclude testing time)
        step_start_time = time.time()
        print(f"Training step {step + 1}/{num_steps}")

        # Train the model on this subset
        train_model_on_data(model, x_train_subset, y_train_subset)

        # Test accuracy of this small model on the test set (no timing here)
        test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
        small_model_accuracies.append(test_acc)
        print(f"Accuracy of the small model trained on subset {step + 1}: {test_acc:.4f}")

        if step > 1:
          # Average the model weights
          model_weights = average_model_weights(model, model_weights)

        # Test the model after averaging weights (accuracy of the current model after including the new model)
        test_loss_after_averaging, test_acc_after_averaging = model.evaluate(x_test, y_test, verbose=0)
        print(f"Accuracy of the model after including new model (step {step + 1}): {test_acc_after_averaging:.4f}")

        # Store the trained model weights for this subset
        small_models.append(model_weights)

        # Time taken for training and averaging (exclude testing time)
        step_time = time.time() - step_start_time
        print(f"Time taken for training and averaging step {step + 1}: {step_time:.2f} seconds")

    total_time = time.time() - total_start_time
    print(f"\nTotal time for training and averaging models (excluding testing): {total_time:.2f} seconds")

    # Return the final model, small models, and their accuracies
    return model, small_models, small_model_accuracies



# Control model - Train on the entire dataset at once
def train_control_model(x_train, y_train, x_test, y_test, epochs=5):
    control_model = create_model()
    start_time = time.time()  # Track time for control model

    # Train the model on the full dataset
    train_model_on_data(control_model, x_train, y_train, epochs)

    # Evaluate the control model on the test set
    test_loss, test_acc = control_model.evaluate(x_test, y_test)
    print(f"Control model accuracy: {test_acc:.4f}")

    control_time = time.time() - start_time
    print(f"Time taken to train control model: {control_time:.2f} seconds")

    return control_model

In [18]:
# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess data (normalize and reshape)
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = np.expand_dims(x_train, -1)  # channel dimension
x_test = np.expand_dims(x_test, -1)

In [19]:
# Create the model
model = create_model()

# Train and average models on random subsets
print("\nTraining on random subsets and averaging models...")
trained_model, small_models, small_model_accuracies = train_and_average_models(x_train, y_train, model, x_test, y_test, num_steps=5, subset_fraction=0.1)

# Final evaluation of the model trained on random subsets
test_loss, test_acc = trained_model.evaluate(x_test, y_test)
print(f"Final model accuracy (random subsets averaging): {test_acc:.4f}")


Training on random subsets and averaging models...
Subset 1/5 - Size: 6000 images
Subset 2/5 - Size: 6000 images
Subset 3/5 - Size: 6000 images
Subset 4/5 - Size: 6000 images
Subset 5/5 - Size: 6000 images
Training step 1/5
Accuracy of the small model trained on subset 1: 0.9703
Accuracy of the model after including new model (step 1): 0.9703
Time taken for training and averaging step 1: 38.31 seconds
Training step 2/5
Accuracy of the small model trained on subset 2: 0.9817
Accuracy of the model after including new model (step 2): 0.9817
Time taken for training and averaging step 2: 30.12 seconds
Training step 3/5
Accuracy of the small model trained on subset 3: 0.9792
Accuracy of the model after including new model (step 3): 0.9720
Time taken for training and averaging step 3: 39.88 seconds
Training step 4/5
Accuracy of the small model trained on subset 4: 0.9823
Accuracy of the model after including new model (step 4): 0.9818
Time taken for training and averaging step 4: 30.57 secon

In [20]:
# Train the control model on the full dataset
print("\nTraining control model on the full dataset...")
control_model = train_control_model(x_train, y_train, x_test, y_test, epochs=5)


Training control model on the full dataset...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9851 - loss: 0.0424
Control model accuracy: 0.9884
Time taken to train control model: 314.12 seconds
