In [1]:
import torch
import torch.nn as nn
import pandas as pd
from typing import Optional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Report the pandas and torch versions and whether CUDA is available in this kernel.
pd.__version__, torch.__version__, torch.cuda.is_available()

('2.2.3', '2.5.1', True)

In [24]:
class DiabetClassification:
    """
    End-to-end logistic-regression pipeline for a binary-classification CSV.

    Calling the instance runs, in order: load CSV -> train/val/test split ->
    standardize features -> convert to tensors -> build DataLoaders -> build
    model -> set up loss and optimizer -> train -> evaluate on the test set.

    The CSV must contain an 'Outcome' column, which is used as the label;
    every other column is treated as a feature.
    """

    def __init__(self, *, dataset_url: str, train_batch_size: int, val_batch_size: int,
                 test_batch_size: int, epoch_num: int, learning_rate: float = 0.05,
                 momentum: float = 0.9, seed: Optional[int] = None):
        """
        Stores the configuration; no data is read and no model is built here.

        Args:
            dataset_url: Path or URL passed to `pd.read_csv`.
            train_batch_size: Batch size of the (shuffled) training loader.
            val_batch_size: Batch size of the validation loader.
            test_batch_size: Batch size of the test loader.
            epoch_num: Number of training epochs.
            learning_rate: SGD learning rate (default 0.05).
            momentum: SGD momentum (default 0.9).
            seed: If given, `torch.manual_seed(seed)` is called at the start of
                `__call__` so weight init and batch shuffling are repeatable.
        """
        self.dataset_url: str = dataset_url
        self.epoch_num: int = epoch_num
        self.learning_rate: float = learning_rate
        self.momentum: float = momentum
        self.seed: Optional[int] = seed
        self.train_batch_size: int = train_batch_size
        self.val_batch_size: int = val_batch_size
        self.test_batch_size: int = test_batch_size

        # Raw data as NumPy arrays, filled by _load_dataset.
        self.features = None
        self.label = None
        self.num_features: int = 0

        # Data splits: NumPy arrays after _split_data / _normalization,
        # float32 torch tensors after _to_tensor.
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.y_test = None

        self.train_set: Optional[TensorDataset] = None
        self.val_set: Optional[TensorDataset] = None
        self.test_set: Optional[TensorDataset] = None
        self.train_loader: Optional[DataLoader] = None
        self.val_loader: Optional[DataLoader] = None
        self.test_loader: Optional[DataLoader] = None

        self.model: Optional[nn.Module] = None
        self.loss_fn: Optional[nn.Module] = None
        self.optimizer: Optional[torch.optim.Optimizer] = None

        # One dict per epoch (epoch, train_loss, train_acc, val_loss, val_acc),
        # filled by _train.
        self.history: list = []

    @property
    def y_valid(self):
        """Backward-compatible alias for `y_val` (the name used by the pipeline)."""
        return self.y_val

    def _load_dataset(self) -> None:
        """
        Loads the dataset from a CSV file and splits it into features and labels.
        The features exclude the 'Outcome' column, and the label is reshaped
        to a column vector of shape (n_samples, 1).
        """
        df = pd.read_csv(self.dataset_url)
        self.num_features = df.shape[1] - 1  # every column except 'Outcome'
        self.features = df.drop('Outcome', axis=1).values
        self.label = df['Outcome'].values.reshape(-1, 1)

    def _split_data(self) -> None:
        """
        Splits the dataset into training, validation, and test sets.
        The training set receives 72% of the data, validation set 8%, and test set 20%.
        """
        # First hold out 20% for testing ...
        X_train, X_test, y_train, y_test = train_test_split(
            self.features,
            self.label,
            test_size=0.2,
            random_state=42)

        # ... then carve 10% of the remaining 80% (= 8% overall) for validation.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train,
            y_train,
            test_size=0.1,
            random_state=42)

        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

    def _normalization(self) -> None:
        """
        Standardizes the training, validation, and test features with StandardScaler.
        The scaler is fitted on the training split only and then applied to the
        validation and test splits, so no statistics leak from held-out data.
        """
        x_scaler = StandardScaler()

        self.X_train = x_scaler.fit_transform(self.X_train)
        self.X_val = x_scaler.transform(self.X_val)
        self.X_test = x_scaler.transform(self.X_test)

    def _to_tensor(self) -> None:
        """
        Converts training, validation, and test datasets to PyTorch tensors
        with float32 data type.
        """
        for name in ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'):
            setattr(self, name, torch.tensor(getattr(self, name), dtype=torch.float32))

    def _create_dataloader(self):
        """
        Creates DataLoader objects for training, validation, and test datasets.
        Initializes:
            - train_loader: DataLoader for training data (shuffled).
            - val_loader: DataLoader for validation data.
            - test_loader: DataLoader for test data.
        """
        self.train_set = TensorDataset(self.X_train, self.y_train)
        self.val_set = TensorDataset(self.X_val, self.y_val)
        self.test_set = TensorDataset(self.X_test, self.y_test)

        self.train_loader = DataLoader(self.train_set, batch_size=self.train_batch_size, shuffle=True)
        self.val_loader = DataLoader(self.val_set, batch_size=self.val_batch_size)
        self.test_loader = DataLoader(self.test_set, batch_size=self.test_batch_size)

    def _build_model(self):
        """
        Builds a neural network model with a linear layer and sigmoid activation.
        This model is used for binary classification, taking `num_features` as input
        and outputting a probability value between 0 and 1.

        Attributes:
            model (nn.Sequential): The constructed neural network model.
        """
        self.model = nn.Sequential(
            nn.Linear(in_features=self.num_features, out_features=1),
            nn.Sigmoid())

    def _compute_loss(self):
        """
        Sets `loss_fn` to binary cross-entropy (`nn.BCELoss`).
        Despite its name this method does not compute a loss value; it only
        creates the loss function used by training and validation.
        """
        self.loss_fn = nn.BCELoss()

    def _setup_optimizer(self):
        """Creates an SGD optimizer over the model parameters using `learning_rate` and `momentum`."""
        self.optimizer = torch.optim.SGD(
            self.model.parameters(), lr=self.learning_rate, momentum=self.momentum)

    def _score_loader(self, loader, n_samples, *, compute_loss):
        """
        Scores the model on every batch of `loader` without tracking gradients.

        Args:
            loader: DataLoader yielding (x_batch, y_batch) pairs.
            n_samples: Total number of samples in the loader's dataset.
            compute_loss: When True, also averages `loss_fn` over the samples.

        Returns:
            (mean_loss, mean_acc); mean_loss is None when `compute_loss` is False.
        """
        self.model.eval()
        total_loss, total_correct = 0.0, 0
        with torch.no_grad():
            for x_batch, y_batch in loader:
                y_hat = self.model(x_batch)
                if compute_loss:
                    # Weight by batch size so a smaller last batch is averaged correctly.
                    total_loss += self.loss_fn(y_hat, y_batch).item() * len(x_batch)
                total_correct += torch.sum(y_hat.round() == y_batch).item()

        mean_loss = total_loss / n_samples if compute_loss else None
        return mean_loss, total_correct / n_samples

    def _train(self):
        """
        Trains the model for `epoch_num` epochs and prints per-epoch metrics.
        After each epoch the model is scored on the validation loader, and the
        epoch's metrics are appended to `self.history` as plain Python numbers.
        """
        self.history = []

        for epoch in range(self.epoch_num):
            self.model.train()
            sum_train_loss, sum_train_acc = 0.0, 0

            for x_batch, y_batch in self.train_loader:
                self.optimizer.zero_grad()  # drop any stale gradients first
                y_hat = self.model(x_batch)
                loss = self.loss_fn(y_hat, y_batch)
                loss.backward()
                self.optimizer.step()

                # .item() detaches the value; accumulating the tensor itself
                # would keep every batch's autograd graph alive all epoch.
                sum_train_loss += loss.item() * len(x_batch)
                sum_train_acc += torch.sum(y_hat.round() == y_batch).item()

            mean_train_loss = sum_train_loss / len(self.train_set)
            mean_train_acc = sum_train_acc / len(self.train_set)

            mean_val_loss, mean_val_acc = self._score_loader(
                self.val_loader, len(self.val_set), compute_loss=True)

            self.history.append({
                'epoch': epoch,
                'train_loss': mean_train_loss,
                'train_acc': mean_train_acc,
                'val_loss': mean_val_loss,
                'val_acc': mean_val_acc,
            })

            print(
                f'[epoch: {epoch}] ,',
                f'Train_loss: {mean_train_loss :.2f}, ',
                f'Train_acc: {mean_train_acc :.2f}, ',
                f'val_loss: {mean_val_loss :.2f}, ',
                f'val_acc: {mean_val_acc :.2f}, '
                )

    def _evaluate(self):
        """
        Computes accuracy on the test loader, prints it, and returns it.

        Returns:
            float: Fraction of test samples whose rounded prediction equals the label.
        """
        _, mean_test_acc = self._score_loader(
            self.test_loader, len(self.test_set), compute_loss=False)
        print(f'\nTest accuracy: {mean_test_acc :.2f}')
        return mean_test_acc

    def __call__(self):
        """Runs the whole pipeline, from loading the CSV to test-set evaluation."""
        if self.seed is not None:
            # Seeds torch's global RNG (weight init and DataLoader shuffling).
            torch.manual_seed(self.seed)

        self._load_dataset()
        self._split_data()
        self._normalization()
        self._to_tensor()
        self._create_dataloader()
        self._build_model()
        self._compute_loss()
        self._setup_optimizer()
        self._train()
        self._evaluate()

    def __str__(self):
        """Returns the linear layer's weights and bias, or a placeholder if no model exists yet."""
        if self.model is None:
            return 'model: not built'

        parameters = [
            f'model weights: {self.model[0].weight}',
            f'model bias: {self.model[0].bias}',
        ]
        return ','.join(parameters)


In [25]:
# Configure the experiment: CSV location, per-split batch sizes, and epoch count.
classifier_obj = DiabetClassification(
    epoch_num=10,
    dataset_url='diabetes.csv',
    train_batch_size=100,
    val_batch_size=50,
    test_batch_size=50,
)

# Run the full pipeline; per-epoch metrics and the test accuracy are printed.
classifier_obj()

[epoch: 0] , Train_loss: 0.63,  Train_acc: 0.70,  val_loss: 0.60,  val_acc: 0.66, 
[epoch: 1] , Train_loss: 0.55,  Train_acc: 0.75,  val_loss: 0.48,  val_acc: 0.79, 
[epoch: 2] , Train_loss: 0.50,  Train_acc: 0.76,  val_loss: 0.45,  val_acc: 0.79, 
[epoch: 3] , Train_loss: 0.49,  Train_acc: 0.76,  val_loss: 0.44,  val_acc: 0.81, 
[epoch: 4] , Train_loss: 0.48,  Train_acc: 0.77,  val_loss: 0.45,  val_acc: 0.82, 
[epoch: 5] , Train_loss: 0.47,  Train_acc: 0.76,  val_loss: 0.45,  val_acc: 0.81, 
[epoch: 6] , Train_loss: 0.47,  Train_acc: 0.76,  val_loss: 0.45,  val_acc: 0.82, 
[epoch: 7] , Train_loss: 0.47,  Train_acc: 0.77,  val_loss: 0.46,  val_acc: 0.82, 
[epoch: 8] , Train_loss: 0.47,  Train_acc: 0.77,  val_loss: 0.46,  val_acc: 0.82, 
[epoch: 9] , Train_loss: 0.47,  Train_acc: 0.77,  val_loss: 0.46,  val_acc: 0.82, 

Test accuracy: 0.74


In [26]:
# Shapes of the train / validation / test feature tensors held by the classifier.
classifier_obj.X_train.shape, classifier_obj.X_val.shape, classifier_obj.X_test.shape

(torch.Size([552, 8]), torch.Size([62, 8]), torch.Size([154, 8]))

In [27]:
# Pull the first validation batch to inspect batch shapes, and count the batches in the training loader.
x_batch, y_batch = next(iter(classifier_obj.val_loader))
x_batch.shape, y_batch.shape, len(classifier_obj.train_loader)

(torch.Size([50, 8]), torch.Size([50, 1]), 6)

In [30]:
# Number of training epochs configured on the classifier.
classifier_obj.epoch_num

10