In [1]:
import os

In [2]:
%pwd

'c:\\Users\\justi\\Desktop\\MLOps Demo\\MLOPS-Demo-Project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\justi\\Desktop\\MLOps Demo\\MLOPS-Demo-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

#need to change according to my training needs
@dataclass(frozen=True)
class ModelTrainingConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    # alpha: float
    # l1_ratio: float
    target_column: str
    learning_rate: float
    input_dim: int 
    hidden1_dim: int 
    hidden2_dim: int 
    output_dim: int
    criterion: str
    optimizer: str
    num_epochs: int

In [6]:
from src.mlops_project.constants import *
from src.mlops_project.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_training
        params = self.params.HeartDiseaseNN
        schema = self.schema.TARGET_COLUMN


        create_directories([config.root_dir])

        model_training_config = ModelTrainingConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            model_name = config.model_name,
            target_column = schema.name,
            learning_rate = params.learning_rate,
            input_dim = params.input_dim,
            hidden1_dim = params.hidden1_dim,
            hidden2_dim = params.hidden2_dim,
            output_dim = params.output_dim,
            criterion = params.criterion,
            optimizer = params.optimizer,
            num_epochs = params.num_epochs
        )
        
        return model_training_config


In [8]:
import os
from src.mlops_project import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
import torch


In [9]:
class HeartDiseaseNN(nn.Module):
    def __init__(self, input_dim, hidden1_dim, hidden2_dim, output_dim):
        super(HeartDiseaseNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1_dim)    # Input layer to first hidden layer
        self.fc2 = nn.Linear(hidden1_dim, hidden2_dim)  # First hidden layer to second hidden layer
        self.fc3 = nn.Linear(hidden2_dim, output_dim)   # Second hidden layer to output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation function for the first hidden layer
        x = F.relu(self.fc2(x))  # Activation function for the second hidden layer
        x = self.fc3(x)          # Output layer
        return x

In [10]:

class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden1_dim, hidden2_dim, output_dim):
        super(NeuralNetwork, self).__init__()

        self.layer1 = nn.Linear(input_dim, hidden1_dim)
        self.layer2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.output_layer = nn.Linear(hidden2_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = torch.sigmoid(self.output_layer(x))
        return x
    


In [11]:
def get_optimizer(optimizer_str: str, learning_rate: float, model):
    if optimizer_str == "adam":
        optimizer = Adam(model.parameters(), lr=learning_rate)
    return optimizer

def get_criterion(criterion_str: str):
    if criterion_str == "cross entropy loss":
        criterion = nn.CrossEntropyLoss()
    return criterion

def resample_classes(dataframe):
    df_majority = dataframe[dataframe["CHDRisk"] == 0]
    df_minority = dataframe[dataframe["CHDRisk"] == 1]

    minority_count = len(df_minority)

    df_majority_downsampled = df_majority.sample(n=minority_count, random_state=42)  # Ensuring reproducibility

    df_balanced = pd.concat([df_minority, df_majority_downsampled])

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    #Turn all items within the table to float
    df_balanced = df_balanced.astype(float)
    return df_balanced


In [12]:
from sklearn.preprocessing import StandardScaler
import torch.optim as optim


In [13]:
class ModelTraining:
    def __init__(self, config: ModelTrainingConfig) -> None:
        self.config = config
    
    def train(self):

        # # Load training and testing data
        train_data = pd.read_csv(self.config.train_data_path)
        # test_data = pd.read_csv(self.config.test_data_path)
        train_data = train_data.dropna()

        resampled_train_data = resample_classes(train_data)

        device = torch.device("cpu")


        features = resampled_train_data.drop('CHDRisk', axis=1).values
        targets = resampled_train_data['CHDRisk'].values
        X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        X_train = torch.tensor(X_train, dtype=torch.float32)
        X_test = torch.tensor(X_test, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.float32)
        y_test = torch.tensor(y_test, dtype=torch.float32)

        train_dataset = TensorDataset(X_train, y_train)
        test_dataset = TensorDataset(X_test, y_test)

        train_loader = DataLoader(dataset=train_dataset, batch_size=10, shuffle=True)
        test_loader = DataLoader(dataset=test_dataset, batch_size=10, shuffle=False)

        model = NeuralNetwork(self.config.input_dim, self.config.hidden1_dim, self.config.hidden2_dim, self.config.output_dim).to(device)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        # Training loop
        results = self.train_model(model, train_loader, criterion, optimizer, self.config.num_epochs, os.path.join(self.config.root_dir, self.config.model_name))
        return results

    
    def train_model(self, model, train_loader, criterion, optimizer, num_epochs, model_path):
        """ Train the model """
        results = {"losses": [], "accuracies": []}

        for epoch in range(num_epochs):
            total = 0
            correct = 0

            for inputs, labels in train_loader:

                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                predicted = (outputs.squeeze() > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            accuracy = 100 * correct / total
            results["losses"].append(loss)
            results["accuracies"].append(accuracy)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')
            torch.save(model.state_dict(), model_path)
        return results

    

In [14]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training_config = ModelTraining(config=model_training_config)
    model_training_config.train()
except Exception as e:
    raise e
    

[2024-06-08 02:07:04,580: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-08 02:07:04,583: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-08 02:07:04,585: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-08 02:07:04,586: INFO: common: created directory at: artifacts]
[2024-06-08 02:07:04,587: INFO: common: created directory at: artifacts/model_training]
Epoch [1/60], Loss: 0.6200, Accuracy: 58.96%
Epoch [2/60], Loss: 0.5040, Accuracy: 65.93%
Epoch [3/60], Loss: 0.8607, Accuracy: 65.78%
Epoch [4/60], Loss: 0.4906, Accuracy: 67.56%
Epoch [5/60], Loss: 0.3994, Accuracy: 67.85%
Epoch [6/60], Loss: 0.6615, Accuracy: 68.15%
Epoch [7/60], Loss: 0.4934, Accuracy: 68.44%
Epoch [8/60], Loss: 0.3210, Accuracy: 69.33%
Epoch [9/60], Loss: 0.2374, Accuracy: 70.07%
Epoch [10/60], Loss: 0.6380, Accuracy: 70.81%
Epoch [11/60], Loss: 0.3082, Accuracy: 71.85%
Epoch [12/60], Loss: 0.8428, Accuracy: 73.93%
Epoch [13/60], Loss: 0.8009, Accuracy

In [15]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    results = {"losses": [], "accuracies": []}

    with torch.no_grad():
        for data, labels in test_loader:
            outputs = model(data)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    average_loss = running_loss / len(test_loader)  
    accuracy = 100 * correct / total
    results["losses"].append(average_loss)
    results["accuracies"].append(accuracy)
    print(f'Accuracy of the model on the test data: {accuracy:.2f}%, Loss: {average_loss:.4f}')
    return results


In [16]:
train_data = pd.read_csv("artifacts/data_transformation/train.csv")
train_data

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,0.0,37,2,1.0,30,0,0,0,0,246,124.0,83.0,30.93,60,85,0
1,1.0,41,4,0.0,0,0,0,0,0,206,130.0,88.0,22.25,85,79,0
2,0.0,61,1,1.0,15,0,0,1,0,157,195.0,108.0,25.08,75,78,0
3,0.0,39,3,0.0,0,0,0,0,0,221,126.0,80.0,23.90,64,80,0
4,0.0,63,1,,10,0,0,0,1,240,146.0,84.0,30.48,75,120,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,1.0,58,1,1.0,15,0,0,1,0,275,140.0,78.0,19.18,78,74,1
2751,0.0,51,2,1.0,5,0,0,1,0,290,168.0,103.0,29.11,80,64,1
2752,0.0,51,2,1.0,20,0,0,0,0,227,139.0,74.0,29.29,80,67,0
2753,0.0,39,2,1.0,20,0,0,0,0,188,120.0,74.0,26.48,65,80,0


In [17]:
df_majority = train_data[train_data["CHDRisk"] == 0]
df_minority = train_data[train_data["CHDRisk"] == 1]
df_minority

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
5,1.0,53,1,0.0,0,0,0,1,1,248,200.0,140.0,43.30,107,130,1
20,0.0,64,1,1.0,30,0,0,1,0,253,178.0,106.0,24.68,100,76,1
24,0.0,54,1,1.0,43,0,0,0,0,243,135.0,92.0,31.30,90,65,1
31,1.0,62,4,0.0,0,1,0,1,0,274,167.0,94.0,28.18,100,80,1
32,1.0,41,3,1.0,15,0,0,0,0,195,120.5,76.0,22.91,75,70,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2736,0.0,61,1,0.0,0,0,0,1,0,243,142.0,89.0,27.30,65,67,1
2745,1.0,50,3,0.0,0,0,0,1,0,243,157.0,98.0,23.82,70,78,1
2749,1.0,61,4,,0,0,0,0,0,261,124.0,76.5,23.06,55,83,1
2750,1.0,58,1,1.0,15,0,0,1,0,275,140.0,78.0,19.18,78,74,1


In [18]:
new_frame = resample_classes(train_data)
new_frame

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,0.0,46.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,300.0,146.0,98.5,30.41,60.0,79.0,0.0
1,1.0,46.0,3.0,1.0,10.0,0.0,0.0,0.0,0.0,205.0,115.0,75.0,19.48,55.0,78.0,0.0
2,0.0,52.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,258.0,177.0,111.0,30.38,80.0,270.0,1.0
3,0.0,51.0,1.0,1.0,20.0,0.0,0.0,0.0,0.0,243.0,130.0,86.5,29.86,85.0,74.0,1.0
4,1.0,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,126.0,86.0,26.32,73.0,92.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851,1.0,65.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,230.0,159.0,87.0,22.91,70.0,65.0,1.0
852,0.0,62.0,1.0,1.0,23.0,0.0,0.0,1.0,0.0,286.0,164.0,88.0,19.53,85.0,126.0,1.0
853,1.0,47.0,2.0,1.0,9.0,0.0,0.0,1.0,0.0,253.0,129.0,81.0,22.18,70.0,122.0,1.0
854,1.0,51.0,3.0,1.0,15.0,0.0,0.0,0.0,0.0,326.0,101.0,67.0,22.73,69.0,87.0,0.0
