combined experiment tracking with model training

MartinKalema · Jun 11, 2024 · 279c6ec · 279c6ec
1 parent 06cf6e7
commit 279c6ec
Show file tree

Hide file tree

Showing 15 changed files with 127 additions and 91 deletions.
diff --git a/research/03_model_evaluation.ipynb → .dockerignore b/research/03_model_evaluation.ipynb → .dockerignore
diff --git a/configuration/configuration.yaml b/configuration/configuration.yaml
@@ -6,8 +6,9 @@ data_ingestion:
   test_source_URL: https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing
   train_data_file: artifacts/data_ingestion/compressed/train_data.zip
   test_data_file: artifacts/data_ingestion/compressed/test_data.zip
-  unzip_dir: artifacts/data_ingestion/decompressed
+  decompressed_dir: artifacts/data_ingestion/decompressed
 
 training:
   root_dir: artifacts/models
   training_data_path: artifacts/data_ingestion/decompressed/Train.csv
+  testing_data_path: artifacts/data_ingestion/decompressed/Test.csv
diff --git a/main.py b/main.py
@@ -1,8 +1,6 @@
 from swahiliNewsClassifier import log
 from swahiliNewsClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
-from swahiliNewsClassifier.pipeline.stage_02_model_training import ModelTrainingPipeline
-# from swahiliNewsClassifier.pipeline.stage_03_model_training import TrainingPipeline
-# from swahiliNewsClassifier.pipeline.stage_04_model_evaluation import EvaluationPipeline
+from swahiliNewsClassifier.pipeline.stage_02_model_training_and_evaluation import ModelTrainingAndEvaluationPipeline
 
 
 def run_pipeline_stage(stage_name, pipeline_class) -> None:
@@ -30,6 +28,4 @@ def run_pipeline_stage(stage_name, pipeline_class) -> None:
 
 if __name__ == '__main__':
     run_pipeline_stage("DATA INGESTION STAGE", DataIngestionTrainingPipeline)
-    run_pipeline_stage("MODEL TRAINING STAGE", ModelTrainingPipeline)
-    # run_pipeline_stage("Model Training Stage", TrainingPipeline)
-    # run_pipeline_stage("Model Evaluation Stage", EvaluationPipeline)
+    run_pipeline_stage("MODEL TRAINING AND EVALUATION STAGE", ModelTrainingAndEvaluationPipeline)
diff --git a/parameters.yaml b/parameters.yaml
@@ -3,7 +3,7 @@ LEARNING_RATE_2: 0.05
 LEARNING_RATE_3: 0.05
 LEARNING_RATE_4: 0.05
 LEARNING_RATE_5: 0.03
-NUMBER_OF_CLASSES: 2
+NUMBER_OF_CLASSES: 5
 EPOCHS_1: 5
 EPOCHS_2: 5
 EPOCHS_3: 5

diff --git a/research/02_model_training.ipynb b/research/02_model_training.ipynb
@@ -93,7 +93,6 @@
     "    epochs_4: int\n",
     "    epochs_5: int\n",
     "    training_data: Path\n",
-    "    number_of_classes: int\n",
     "    root_dir: Path"
    ]
   },
@@ -142,7 +141,6 @@
     "            epochs_3=self.params.EPOCHS_3,\n",
     "            epochs_4=self.params.EPOCHS_4,\n",
     "            epochs_5=self.params.EPOCHS_5,\n",
-    "            number_of_classes=self.params.NUMBER_OF_CLASSES,\n",
     "\n",
     "        )"
    ]

diff --git a/src/swahiliNewsClassifier/components/model_evaluation.py b/src/swahiliNewsClassifier/components/model_evaluation.py
diff --git a/...wsClassifier/components/model_training.py → ...mponents/model_training_and_evaluation.py b/...wsClassifier/components/model_training.py → ...mponents/model_training_and_evaluation.py
@@ -1,4 +1,4 @@
-from swahiliNewsClassifier.entity.entities import ModelTrainingConfig
+from swahiliNewsClassifier.entity.entities import ModelTrainingAndEvaluationConfig
 from swahiliNewsClassifier import log
 import torch
 import fastai
@@ -12,19 +12,20 @@
 from swahiliNewsClassifier import log
 import boto3
 from dotenv import load_dotenv
-
+import dagshub
+import mlflow
 
 load_dotenv()
 
-class ModelTraining:
-    def __init__(self, model_training_config: ModelTrainingConfig):
+class ModelTrainingAndEvaluation:
+    def __init__(self, model_training_and_evaluation_config: ModelTrainingAndEvaluationConfig):
         """
         Initialize ModelTraining object with the provided configuration.
 
         Args:
-            model_training_config (ModelTrainingConfig): Configuration object for model training.
+            model_training_and_evaluation_config (ModelTrainingAndEvaluationConfig): Configuration object for model training and evaluation.
         """
-        self.model_training_config = model_training_config
+        self.model_training_and_evaluation_config = model_training_and_evaluation_config
         self.bucket_name = "swahili-news-classifier"
         self.model_path = f"models/text_classifier_learner.pth"
         self.s3 = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), region_name=os.getenv('REGION_NAME'))
@@ -47,7 +48,7 @@ def load_data(self) -> pd.DataFrame:
             pd.DataFrame: Loaded training data.
         """
         log.info('Loading training data')
-        train = pd.read_csv(self.model_training_config.training_data)
+        train = pd.read_csv(self.model_training_and_evaluation_config.training_data)
         return train
 
     def prepare_data(self, train) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]':
@@ -60,7 +61,7 @@ def prepare_data(self, train) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame
         Returns:
             tuple: A tuple containing training data (df_trn), validation data (df_val), and data for language model (df_lm).
         """
-        df_trn, df_val = train_test_split(train, stratify=train['category'], test_size=self.model_training_config.test_size, random_state=123)
+        df_trn, df_val = train_test_split(train, stratify=train['category'], test_size=self.model_training_and_evaluation_config.test_size, random_state=123)
         df_lm = pd.concat([df_trn, df_val], axis=0)[['content']]
         return df_trn, df_val, df_lm
 
@@ -80,7 +81,7 @@ def create_dataloaders(self, df_lm) -> DataLoaders:
             get_x=ColReader('text'),
             splitter=RandomSplitter(0.1))
 
-        dls = dblock.dataloaders(df_lm, bs=self.model_training_config.batch_size_1)
+        dls = dblock.dataloaders(df_lm, bs=self.model_training_and_evaluation_config.batch_size_1)
         return dls
 
     def train_language_model(self, dls) -> Learner:
@@ -96,7 +97,7 @@ def train_language_model(self, dls) -> Learner:
         log.info('Training Language Model Learner')
         learn = language_model_learner(dls, AWD_LSTM, drop_mult=0.3, metrics=[accuracy]).to_fp16()
         learn.lr_find()
-        learn.fine_tune(self.model_training_config.epochs_1, self.model_training_config.learning_rate_1)
+        learn.fine_tune(self.model_training_and_evaluation_config.epochs_1, self.model_training_and_evaluation_config.learning_rate_1)
 
         log.info('Saving best Language Model Learner.')
 
@@ -123,7 +124,17 @@ def create_text_classifier_dataloaders(self, df_trn, dls_lm) -> DataLoaders:
             get_y=ColReader('category'),
             splitter=RandomSplitter(0.2))
 
-        return dblock.dataloaders(df_trn, bs=self.model_training_config.batch_size_2)
+        return dblock.dataloaders(df_trn, bs=self.model_training_and_evaluation_config.batch_size_2)
+
+    def log_to_mlflow(self, metrics: list) -> None:
+        """
+        Log the run parameters and validation metrics to MLflow through DagsHub.
+
+        Args:
+            metrics (list): Validation results; metrics[0] is logged as 'val_loss' and metrics[1] as 'val_accuracy'.
+        """
+        config = self.model_training_and_evaluation_config
+
+        # os.environ only accepts str values; assigning None (an unset env var read via
+        # os.getenv) raises TypeError, so only export the URI when one is configured.
+        if config.mlflow_tracking_uri:
+            os.environ['MLFLOW_TRACKING_URI'] = config.mlflow_tracking_uri
+        else:
+            log.warning('MLFLOW_TRACKING_URI is not set; relying on dagshub.init to configure MLflow tracking.')
+
+        # NOTE(review): dagshub.init(mlflow=True) presumably configures the MLflow tracking
+        # target for the given repository - confirm against the DagsHub client docs.
+        dagshub.init(repo_owner=config.mlflow_repo_owner, repo_name=config.mlflow_repo_name, mlflow=True)
+
+        with mlflow.start_run():
+            mlflow.log_params(config.all_params)
+            mlflow.log_metric('val_loss', metrics[0])
+            mlflow.log_metric('val_accuracy', metrics[1])
 
     def train_text_classifier(self, dls) -> None:
         """
@@ -132,21 +143,24 @@ def train_text_classifier(self, dls) -> None:
         Args:
             dls (DataLoaders): Dataloaders for the text classifier.
         """
+
         log.info('Training Text Classifier Learner.')
+
         learn = text_classifier_learner(dls, AWD_LSTM, metrics=[accuracy]).to_fp16()
         learn.load_encoder(f'language_model_learner')
         learn.lr_find()
-        learn.fit_one_cycle(self.model_training_config.epochs_2, self.model_training_config.learning_rate_2)
+        learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_2, self.model_training_and_evaluation_config.learning_rate_2)
         learn.freeze_to(-2)
-        learn.fit_one_cycle(self.model_training_config.epochs_3, slice(1e-3/(2.6**4), self.model_training_config.learning_rate_3))
+        learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_3, slice(1e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_3))
         learn.freeze_to(-3)
-        learn.fit_one_cycle(self.model_training_config.epochs_4, slice(5e-3/(2.6**4), self.model_training_config.learning_rate_4))
+        learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_4, slice(5e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_4))
         learn.unfreeze()
-        learn.fit_one_cycle(self.model_training_config.epochs_5, slice(1e-3/(2.6**4), self.model_training_config.learning_rate_5))
+        learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_5, slice(1e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_5))
+        classifier_metrics = learn.validate()
+        self.log_to_mlflow(classifier_metrics)
+        learn.save_encoder(f'text_classifier_learner')
 
-        log.info("Saving best Text Classifier Learner.")
 
-        learn.save_encoder(f'text_classifier_learner')
 
     def run_pipeline(self) -> None:
         """

diff --git a/src/swahiliNewsClassifier/components/prediction_service.py b/src/swahiliNewsClassifier/components/prediction_service.py
diff --git a/src/swahiliNewsClassifier/configuration/configuration.py b/src/swahiliNewsClassifier/configuration/configuration.py
@@ -1,7 +1,10 @@
 from swahiliNewsClassifier.constants import *
 from swahiliNewsClassifier.utilities.helper_functions import read_yaml, create_directories
-from swahiliNewsClassifier.entity.entities import DataIngestionConfig, ModelTrainingConfig
+from swahiliNewsClassifier.entity.entities import DataIngestionConfig, ModelTrainingAndEvaluationConfig
+from dotenv import load_dotenv
+import os
 
+load_dotenv()
 
 class ConfigurationManager:
     def __init__(self, config_filepath=CONFIG_FILE_PATH,
@@ -38,16 +41,16 @@ def get_data_ingestion_config(self) -> DataIngestionConfig:
             unzip_dir=config.unzip_dir
         )
 
-    def get_model_training_config(self) -> ModelTrainingConfig:
+    def get_model_training_and_evaluation_config(self) -> ModelTrainingAndEvaluationConfig:
         """
-        Get the model training configuration.
+        Get the model training and evaluation configuration.
 
         Returns:
-            ModelTrainingConfig: Configuration object for model training.
+            ModelTrainingAndEvaluationConfig: Configuration object for model training and evaluation.
         """
         create_directories([self.config.training.root_dir])
 
-        return ModelTrainingConfig(
+        return ModelTrainingAndEvaluationConfig(
             root_dir=self.config.training.root_dir,
             training_data=self.config.training.training_data_path,
             test_size=self.params.TEST_SIZE,
@@ -63,6 +66,9 @@ def get_model_training_config(self) -> ModelTrainingConfig:
             epochs_3=self.params.EPOCHS_3,
             epochs_4=self.params.EPOCHS_4,
             epochs_5=self.params.EPOCHS_5,
-            number_of_classes=self.params.NUMBER_OF_CLASSES,
+            mlflow_repo_name=os.getenv('MLFLOW_REPO_NAME'),
+            mlflow_tracking_uri=os.getenv('MLFLOW_TRACKING_URI'),
+            mlflow_repo_owner=os.getenv('MLFLOW_REPO_OWNER'),
+            all_params=self.params,
 
         )
diff --git a/src/swahiliNewsClassifier/entity/entities.py b/src/swahiliNewsClassifier/entity/entities.py
@@ -13,38 +13,61 @@ class DataIngestionConfig:
         test_source_URL (str): The URL from which the test data will be fetched.
         train_data_file (Path): The local file path where the downloaded training data will be stored.
         test_data_file (Path): The local file path where the downloaded test data will be stored.
-        unzip_dir (Path): The directory where the downloaded data will be extracted or unzipped.
+        decompressed_dir (Path): The directory where the downloaded data will be extracted.
     """
     root_dir: Path
     train_source_URL: str
     test_source_URL: str
     train_data_file: Path
     test_data_file: Path
-    unzip_dir: Path
+    decompressed_dir: Path
 
 
 @dataclass(frozen=True)
-class ModelTrainingConfig:
+class ModelTrainingAndEvaluationConfig:
     """
     Configuration class for model training using ULMFiT (Universal Language Model Fine-tuning).
 
     Attributes:
-        test_size (float): Proportion of the dataset to include in the test split.
-        learning_rate_1 (float): Learning rate for training the language model learner.
-        learning_rate_2 (float): Learning rate for the first phase of classifier training.
-        learning_rate_3 (float): Learning rate for the second phase of classifier training.
-        learning_rate_4 (float): Learning rate for the third phase of classifier training.
-        learning_rate_5 (float): Learning rate for the fourth phase of classifier training.
-        batch_size_1 (int): Batch size for language model training.
-        batch_size_2 (int): Batch size for text classifier training.
-        epochs_1 (int): Number of epochs for training the language model learner.
-        epochs_2 (int): Number of epochs for the first phase of classifier training.
-        epochs_3 (int): Number of epochs for the second phase of classifier training.
-        epochs_4 (int): Number of epochs for the third phase of classifier training.
-        epochs_5 (int): Number of epochs for the fourth phase of classifier training.
-        training_data (Path): Path to the training data CSV file.
-        number_of_classes (int): Number of target classes in the classification task.
-        root_dir (Path): Root directory for storing model artifacts.
+        test_size (float): Proportion of the dataset to include in the test split. This parameter is used to split the dataset into training and validation sets.
+
+        learning_rate_1 (float): Learning rate for training the language model learner. This is used during the fine-tuning of the pre-trained language model.
+        
+        learning_rate_2 (float): Learning rate for the first phase of classifier training. This is used in the initial phase of training the text classifier.
+        
+        learning_rate_3 (float): Learning rate for the second phase of classifier training. This is used in the second phase of training the text classifier.
+        
+        learning_rate_4 (float): Learning rate for the third phase of classifier training. This is used in the third phase of training the text classifier.
+        
+        learning_rate_5 (float): Learning rate for the fourth phase of classifier training. This is used in the final phase of training the text classifier.
+        
+        batch_size_1 (int): Batch size for language model training. This parameter defines the number of samples that will be propagated through the network at once during language model training.
+        
+        batch_size_2 (int): Batch size for text classifier training. This parameter defines the number of samples that will be propagated through the network at once during text classifier training.
+        
+        epochs_1 (int): Number of epochs for training the language model learner. This defines the number of complete passes through the training dataset.
+        
+        epochs_2 (int): Number of epochs for the first phase of classifier training. This defines the number of complete passes through the training dataset in the first phase.
+        
+        epochs_3 (int): Number of epochs for the second phase of classifier training. This defines the number of complete passes through the training dataset in the second phase.
+        
+        epochs_4 (int): Number of epochs for the third phase of classifier training. This defines the number of complete passes through the training dataset in the third phase.
+        
+        epochs_5 (int): Number of epochs for the fourth phase of classifier training. This defines the number of complete passes through the training dataset in the final phase.
+        
+        training_data (Path): Path to the training data CSV file. This file contains the text data and corresponding labels for training and validation.
+        
+        number_of_classes (int): Number of target classes in the classification task. This defines the number of unique labels in the dataset.
+        
+        root_dir (Path): Root directory for storing model artifacts. This directory is used to save trained models, logs, and other artifacts.
+        
+        mlflow_tracking_uri (str): URI for the MLflow tracking server. This is used to log and track experiments with MLflow.
+        
+        mlflow_repo_name (str): Repository name for MLflow tracking. This is used to organize and identify different MLflow runs within the repository.
+        
+        mlflow_repo_owner (str): Owner of the MLflow repository. This is used to identify the owner of the MLflow repository.
+        
+        all_params (dict): Dictionary containing all parameters used for model training. This includes all hyperparameters and other settings for reproducibility and logging.
     """
     test_size: float
     learning_rate_1: float
@@ -62,3 +85,7 @@ class ModelTrainingConfig:
     training_data: Path
     number_of_classes: int
     root_dir: Path
+    mlflow_tracking_uri: str
+    mlflow_repo_name: str
+    mlflow_repo_owner: str
+    all_params: dict
diff --git a/src/swahiliNewsClassifier/pipeline/stage_02_model_training.py b/src/swahiliNewsClassifier/pipeline/stage_02_model_training.py
diff --git a/src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py b/src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py
@@ -0,0 +1,31 @@
+from swahiliNewsClassifier.configuration.configuration import ConfigurationManager
+from swahiliNewsClassifier.components.model_training_and_evaluation import ModelTrainingAndEvaluation
+from swahiliNewsClassifier import log
+
+STAGE_NAME = "Model Training and Evaluation Stage"
+
+
+class ModelTrainingAndEvaluationPipeline:
+    """
+    Pipeline stage that builds the stage configuration and runs model training and evaluation.
+    """
+
+    def __init__(self):
+        """
+        Initialize the ModelTrainingAndEvaluationPipeline object.
+
+        Creates the ConfigurationManager used to build the stage configuration.
+        """
+        self.config = ConfigurationManager()
+
+    def main(self) -> None:
+        """
+        Execute the model training and evaluation process.
+
+        Raises:
+            Exception: Any error raised while building the configuration or running
+                the component pipeline is logged and then re-raised.
+        """
+        try:
+            model_training_and_evaluation_config = self.config.get_model_training_and_evaluation_config()
+            model_training_and_evaluation = ModelTrainingAndEvaluation(
+                model_training_and_evaluation_config=model_training_and_evaluation_config)
+            model_training_and_evaluation.run_pipeline()
+        except Exception as e:
+            # Record the failure with its traceback at the stage boundary.
+            log.exception(f"An error occurred during {STAGE_NAME}: {e}")
+            # Bare raise propagates the active exception as-is to the caller.
+            raise
+
+
+# Allow this stage to be run directly as a script, outside of main.py.
+if __name__ == '__main__':
+    pipeline = ModelTrainingAndEvaluationPipeline()
+    pipeline.main()
diff --git a/src/swahiliNewsClassifier/pipeline/stage_03_model_evaluation.py b/src/swahiliNewsClassifier/pipeline/stage_03_model_evaluation.py
diff --git a/src/swahiliNewsClassifier/pipeline/stage_04_prediction.py b/src/swahiliNewsClassifier/pipeline/stage_04_prediction.py
diff --git a/template.py b/template.py
@@ -30,18 +30,14 @@ def create_file_with_directories(filepath: Path) -> None:
     f"src/{project_name}/__init__.py",
     f"src/{project_name}/components/__init__.py",
     f"src/{project_name}/components/data_ingestion.py",
-    f"src/{project_name}/components/prediction_service.py",
-    f"src/{project_name}/components/model_training.py",
-    f"src/{project_name}/components/model_evaluation.py",
+    f"src/{project_name}/components/model_training_and_evaluation.py",
     f"src/{project_name}/utilities/_init__.py",
     f"src/{project_name}/utilities/helper_functions.py",
     f"src/{project_name}/configuration/__init__.py",
     f"src/{project_name}/configuration/configuration.py",
     f"src/{project_name}/pipeline/__init__.py",
     f"src/{project_name}/pipeline/stage_01_data_ingestion.py",
-    f"src/{project_name}/pipeline/stage_02_model_training.py",
-    f"src/{project_name}/pipeline/stage_03_model_evaluation.py",
-    f"src/{project_name}/pipeline/stage_04_prediction.py",
+    f"src/{project_name}/pipeline/stage_02_model_training_and_evaluation.py",
     f"src/{project_name}/entity/__init__.py",
     f"src/{project_name}/entity/entities.py",
     f"src/{project_name}/constants/__init__.py",
@@ -57,8 +53,6 @@ def create_file_with_directories(filepath: Path) -> None:
     "logs/20240608-124455.log",
     "research/01_data_ingestion.ipynb",
     "research/02_model_training.ipynb",
-    "research/03_model_evaluation.ipynb",
-    "templates/index.html",
     "app.py",
     "autopep.py",
     ".env",