In [1]:
# !pip3 install pyarrow pillow --upgrade --user
# !pip3 install mlflow
import pyarrow.parquet as pq
from datasets import Dataset
import torch
import pandas as pd
import os
import mlflow.pytorch
from mlflow import MlflowClient

In [4]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')

False

In [6]:
print(os.getenv('MLFLOW_TRACKING_USERNAME'))

None


In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')

# Set MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD
os.environ['MLFLOW_TRACKING_USERNAME'] = os.getenv('MLFLOW_TRACKING_USERNAME')
os.environ['MLFLOW_TRACKING_PASSWORD'] = os.getenv('MLFLOW_TRACKING_PASSWORD')


TypeError: str expected, not NoneType

In [3]:
mlflow.set_experiment("CNN-pytorch")
mlflow.pytorch.autolog
mlflow.set_tracking_uri('https://dagshub.com/wwoszczek/MLOps-TeamBeans.mlflow')

In [4]:
def print_auto_logged_info(r):
    tags = {k: v for k, v in r.data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in MlflowClient().list_artifacts(r.info.run_id, "model")]
    print(f"run_id: {r.info.run_id}")
    print(f"artifacts: {artifacts}")
    print(f"params: {r.data.params}")
    print(f"metrics: {r.data.metrics}")
    print(f"tags: {tags}")

#### Copying the class of CustomDataset as it cannot be imported easily 

In [5]:
import pyarrow.parquet as pq
from datasets import Dataset
import pandas as pd
import os

In [6]:
current_directory = os.getcwd()

dataset_train = Dataset.from_file(os.path.join(current_directory, "raw\\train\\") + "data-00000-of-00001.arrow")

dataset_validation = Dataset.from_file(os.path.join(current_directory, "raw\\validation\\") + "data-00000-of-00001.arrow")

dataset_test = Dataset.from_file(os.path.join(current_directory, "raw\\test\\") + "data-00000-of-00001.arrow")

dataset_train

Dataset({
    features: ['image_file_path', 'image', 'labels'],
    num_rows: 1034
})

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn

torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset
        self.transform = transforms.Compose([
            transforms.Resize((500,500)),  # Resize to our desired size
            transforms.ToTensor(),          # Convert PIL Image to PyTorch tensor
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize RGB channels
        ])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        image = self.transform(sample['image'])
        label = sample['labels']

        return image, label
    

custom_train = CustomDataset(dataset_train)
custom_validation = CustomDataset(dataset_validation)
custom_test = CustomDataset(dataset_test)

# Create a DataLoader for training, validation and test
train_loader = DataLoader(custom_train, batch_size=32, shuffle=True)    
validation_loader = DataLoader(custom_validation, batch_size=32, shuffle=False)
test_loader = DataLoader(custom_test, batch_size=32, shuffle=False)

#### End of the copied part.

In [8]:
# Load the DataLoader from the file
train_loader = torch.load('dataloaders/train_loader.pt')
validation_loader= torch.load('dataloaders/validation_loader.pt')
test_loader = torch.load('dataloaders/test_loader.pt')

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [10]:
for images, labels in train_loader:
  print(images.size(), labels.size())
  break

torch.Size([32, 3, 500, 500]) torch.Size([32])


In [11]:
import torch
import torch.nn as nn

class SimpleCNNReducedStride10(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNNReducedStride10, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=2, padding=1)
        self.relu1 = nn.ReLU()
        
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=1)
        self.relu2 = nn.ReLU()
        
        self.dropout = nn.Dropout(0.5)  # Add dropout for regularization
        
        # Calculate the correct input size for fc1 based on the spatial dimensions
        self.fc1_input_size = self.calculate_fc1_input_size()
        self.fc1 = nn.Linear(250000, 256)
        self.relu3 = nn.ReLU()
        
        self.dropout2 = nn.Dropout(0.5)  # Add dropout for regularization
        
        self.fc2 = nn.Linear(256, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)  # Softmax activation for classification

    def calculate_fc1_input_size(self):
        # Assuming the output size after the second convolutional layer
        # with stride 10 is (16, 50, 50), calculate the input size for fc1
        return 16 * 50 * 50

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        
        x = self.conv2(x)
        x = self.relu2(x)
        
        x = x.view(x.size(0), -1)  # Flatten the feature maps
        x = self.dropout(x)  # Apply dropout for regularization
        
        x = self.fc1(x)
        
        x = self.relu3(x)
        x = self.dropout2(x)
        
        x = self.fc2(x)
        
        x = self.log_softmax(x)  # Apply softmax for classification
        return x

#### Code Restructured to fit MLFlow

In [12]:
with mlflow.start_run() as run:
    ## The idea is to get the autolog to run for our pytorch funct. 
    ## It might depend on the funct. we choose and the pytorch version
    ## Thus initially I defined some metrics to try it.
    
    # Create an instance of the SimpleCNNReduced model
    model = SimpleCNNReducedStride10(num_classes=3)

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Calculate the total number of trainable parameters
    total_params_reduced = count_parameters(model)
    mlflow.log_param("total_trainable_parameters", total_params_reduced)
    print(f"Total trainable parameters in the reduced model: {total_params_reduced}")
    
    ###############################3
    
    from torch.optim import Adam

    model = model.to(device)
    optimizer = Adam(model.parameters())
    criterion = nn.NLLLoss()

    num_epochs = 5
    batch_loss = 0
    cum_epoch_loss = 0
    
    # Log parameters
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("num_classes", 3)
    mlflow.log_param("kernel_size_conv1", 3)
    mlflow.log_param("stride_conv1", 2)
    mlflow.log_param("padding_conv1", 1)
    mlflow.log_param("kernel_size_conv2", 3)
    mlflow.log_param("stride_conv2", 2)
    mlflow.log_param("padding_conv2", 1)
    mlflow.log_param("dropout_rate", 0.5)
    mlflow.log_param("fc1_input_size", model.fc1_input_size)
    mlflow.log_param("num_conv_layers", 2)  # Example: Number of convolutional layers
    mlflow.log_param("activation_function", "ReLU")  # Example: Activation function used

    for e in range(num_epochs):
      cum_epoch_loss = 0

      for batch, (images, labels) in enumerate(train_loader,1):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logps = model(images)
        loss = criterion(logps, labels)
        loss.backward()
        optimizer.step()

        batch_loss += loss.item()
        print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(train_loader)}) : Batch loss : {loss.item()}')

      print(f'Training loss : {batch_loss/len(train_loader)}')
    
    # Log a metric (e.g., training loss)
    mlflow.log_metric("training_loss", batch_loss / len(train_loader))
    
    ###########################################################333
    
    model.to('cpu')
    
    # Save the model as an artifact
    mlflow.pytorch.log_model(model, "models")

    model.eval()
    with torch.no_grad():
        num_correct = 0
        total = 0

        #set_trace()
        for batch, (images, labels) in enumerate(validation_loader,1):

            logps = model(images)
            output = torch.exp(logps)

            pred = torch.argmax(output, 1)
            total += labels.size(0)
            num_correct += (pred == labels).sum().item()
            print(f'Batch ({batch}/{len(validation_loader)})')

            # if batch == 5:
             # break

        # Calculate test accuracy
        test_accuracy = num_correct * 100 / total
        print(f'Accuracy of the model on {total} test images: {test_accuracy}% ')

        # Log the test accuracy as a metric
        mlflow.log_metric("test_accuracy", test_accuracy)


# fetch the auto logged parameters and metrics
print_auto_logged_info(mlflow.get_run(run_id=run.info.run_id))

Total trainable parameters in the reduced model: 64002419
Epoch(0/5 : Batch number(1/33) : Batch loss : 1.1032708883285522
Epoch(0/5 : Batch number(2/33) : Batch loss : 8.681683540344238
Epoch(0/5 : Batch number(3/33) : Batch loss : 17.68632698059082
Epoch(0/5 : Batch number(4/33) : Batch loss : 8.561243057250977
Epoch(0/5 : Batch number(5/33) : Batch loss : 9.949642181396484
Epoch(0/5 : Batch number(6/33) : Batch loss : 4.945017337799072
Epoch(0/5 : Batch number(7/33) : Batch loss : 3.6156113147735596
Epoch(0/5 : Batch number(8/33) : Batch loss : 3.332427740097046
Epoch(0/5 : Batch number(9/33) : Batch loss : 4.154150009155273
Epoch(0/5 : Batch number(10/33) : Batch loss : 1.5108925104141235
Epoch(0/5 : Batch number(11/33) : Batch loss : 2.1299262046813965
Epoch(0/5 : Batch number(12/33) : Batch loss : 1.7299935817718506
Epoch(0/5 : Batch number(13/33) : Batch loss : 1.1807994842529297
Epoch(0/5 : Batch number(14/33) : Batch loss : 1.1367098093032837
Epoch(0/5 : Batch number(15/33) : 

Epoch(3/5 : Batch number(25/33) : Batch loss : 0.3181658983230591
Epoch(3/5 : Batch number(26/33) : Batch loss : 0.42861124873161316
Epoch(3/5 : Batch number(27/33) : Batch loss : 0.47407102584838867
Epoch(3/5 : Batch number(28/33) : Batch loss : 0.279560387134552
Epoch(3/5 : Batch number(29/33) : Batch loss : 0.4122806191444397
Epoch(3/5 : Batch number(30/33) : Batch loss : 0.42252063751220703
Epoch(3/5 : Batch number(31/33) : Batch loss : 0.2878819704055786
Epoch(3/5 : Batch number(32/33) : Batch loss : 0.3474612236022949
Epoch(3/5 : Batch number(33/33) : Batch loss : 0.2461000382900238
Training loss : 4.333207594174327
Epoch(4/5 : Batch number(1/33) : Batch loss : 0.3796701431274414
Epoch(4/5 : Batch number(2/33) : Batch loss : 0.41745495796203613
Epoch(4/5 : Batch number(3/33) : Batch loss : 0.2648260295391083
Epoch(4/5 : Batch number(4/33) : Batch loss : 0.1686577945947647
Epoch(4/5 : Batch number(5/33) : Batch loss : 0.29636791348457336
Epoch(4/5 : Batch number(6/33) : Batch loss