In [1]:
import os

In [2]:
pwd

'/media/kalema/9954-79C8/Projects/Swahili-News-Classifier/research'

In [3]:
# Step up one directory so the working directory becomes the parent of the notebook's folder
# (the `pwd` outputs around this cell show the move from .../research to the project root).
# NOTE(review): this is not idempotent - re-running the cell climbs one more level each time.
os.chdir('../')

In [4]:
pwd

'/media/kalema/9954-79C8/Projects/Swahili-News-Classifier'

In [5]:
from dataclasses import dataclass
from pathlib import Path

from swahiliNewsClassifier.constants import *
from swahiliNewsClassifier.utilities.helper_functions import read_yaml, create_directories

In [10]:
@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Immutable bundle of the settings consumed by the model training stage.

    Field order is part of the constructor signature and must not change.
    """

    # Fraction of the data handed to the hold-out side of the train/validation split.
    test_size: float

    # Learning rates, one per training phase (1 = language model, 2-5 = classifier phases).
    learning_rate_1: float
    learning_rate_2: float
    learning_rate_3: float
    learning_rate_4: float
    learning_rate_5: float

    # Batch sizes (1 = language-model dataloaders, 2 = classifier dataloaders).
    batch_size_1: int
    batch_size_2: int

    # Epoch counts, paired index-by-index with the learning rates above.
    epochs_1: int
    epochs_2: int
    epochs_3: int
    epochs_4: int
    epochs_5: int

    # Location of the training data file.
    training_data: Path

    # Number of target classes.
    number_of_classes: int

    # Directory reserved for this stage's outputs.
    root_dir: Path

In [9]:
class ConfigurationManager:
    """
    Reads the project's configuration and parameter YAML files and turns them
    into typed configuration objects for the pipeline stages.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """
        Initialize ConfigurationManager with configuration and parameter files.

        Both files are read with `read_yaml`, and the configured `artifacts_root`
        is handed to `create_directories`.

        Args:
            config_filepath: Path to the configuration YAML file.
            params_filepath: Path to the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """
        Get the model training configuration.

        The training root directory is handed to `create_directories` first.
        Every value is then coerced to the type declared on
        `ModelTrainingConfig`, so that a mistyped or string-valued YAML entry
        (for example a learning rate written as `1e-3`, which a YAML loader may
        return as text) fails here rather than deep inside training.

        Returns:
            ModelTrainingConfig: Configuration object for model training.

        Raises:
            ValueError: If a numeric parameter cannot be converted to float/int.
            TypeError: If a path or numeric entry holds an unsupported value
                (such as None).
        """
        training = self.config.training
        params = self.params

        create_directories([training.root_dir])

        return ModelTrainingConfig(
            # Paths: the dataclass declares Path, the YAML provides plain values.
            root_dir=Path(training.root_dir),
            training_data=Path(training.training_data_path),
            # Split size and per-phase learning rates.
            test_size=float(params.TEST_SIZE),
            learning_rate_1=float(params.LEARNING_RATE_1),
            learning_rate_2=float(params.LEARNING_RATE_2),
            learning_rate_3=float(params.LEARNING_RATE_3),
            learning_rate_4=float(params.LEARNING_RATE_4),
            learning_rate_5=float(params.LEARNING_RATE_5),
            # Batch sizes and per-phase epoch counts.
            batch_size_1=int(params.BATCH_SIZE_1),
            batch_size_2=int(params.BATCH_SIZE_2),
            epochs_1=int(params.EPOCHS_1),
            epochs_2=int(params.EPOCHS_2),
            epochs_3=int(params.EPOCHS_3),
            epochs_4=int(params.EPOCHS_4),
            epochs_5=int(params.EPOCHS_5),
            number_of_classes=int(params.NUMBER_OF_CLASSES),
        )

In [None]:
import torch
import fastai
from fastai.text.all import *
import pandas as pd
import numpy as np
from functools import partial
import io
import os
from sklearn.model_selection import train_test_split
from swahiliNewsClassifier import log

In [None]:
class ModelTraining:
    """
    Two-phase text training pipeline: fine-tune an AWD_LSTM language model on
    the article text, then train a text classifier that loads the language
    model's saved encoder.
    """

    # One seed for every random split so repeated runs see the same partitions.
    _SEED = 123
    # Checkpoint names. The language-model name is written by
    # `train_language_model` and read back by `train_text_classifier`, so it
    # lives in one place to keep the two in sync.
    _LM_ENCODER_NAME = 'language_model_learner'
    _CLF_ENCODER_NAME = 'text_classifier_learner'
    # Lower bound of the learning-rate slices used in the later classifier phases.
    _MIN_SLICE_LR = 1e-3 / (2.6 ** 4)
    # Columns the pipeline reads from the training data.
    _REQUIRED_COLUMNS = ('category', 'content')

    def __init__(self, model_training_config: ModelTrainingConfig):
        """
        Initialize ModelTraining object with the provided configuration.

        Args:
            model_training_config (ModelTrainingConfig): Configuration object for model training.
        """
        self.model_training_config = model_training_config

    def load_data(self) -> pd.DataFrame:
        """
        Load the training data from the configured path.

        Returns:
            pd.DataFrame: Loaded training data.
        """
        return pd.read_csv(self.model_training_config.training_data)

    def prepare_data(self, train: pd.DataFrame) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]':
        """
        Prepare the data for training and validation.

        The frame is split with stratification on `category`; the language-model
        frame is the `content` column of both parts concatenated back together.

        Args:
            train (pd.DataFrame): The loaded training data.

        Returns:
            tuple: A tuple containing training data (df_trn), validation data (df_val), and data for language model (df_lm).

        Raises:
            KeyError: If the `category` or `content` column is missing.
        """
        missing = [col for col in self._REQUIRED_COLUMNS if col not in train.columns]
        if missing:
            raise KeyError(f"Training data is missing required column(s): {missing}")

        df_trn, df_val = train_test_split(
            train,
            stratify=train['category'],
            test_size=self.model_training_config.test_size,
            random_state=self._SEED,
        )
        df_lm = pd.concat([df_trn, df_val], axis=0)[['content']]
        return df_trn, df_val, df_lm

    def create_dataloaders(self, df_lm) -> DataLoaders:
        """
        Create dataloaders for the language model.

        Args:
            df_lm (pd.DataFrame): Data for language model.

        Returns:
            DataLoaders: Dataloaders for the language model.
        """
        dblock = DataBlock(
            blocks=TextBlock.from_df('content', is_lm=True),
            # The text block is built from 'content'; items are then read from a 'text' column.
            get_x=ColReader('text'),
            # Seeded so the 10% validation subset is the same on every run.
            splitter=RandomSplitter(0.1, seed=self._SEED))

        return dblock.dataloaders(df_lm, bs=self.model_training_config.batch_size_1)

    def train_language_model(self, dls) -> Learner:
        """
        Train the language model and save its encoder for the classifier.

        Args:
            dls (DataLoaders): Dataloaders for the language model.

        Returns:
            Learner: Trained language model learner.
        """
        cfg = self.model_training_config
        learn = language_model_learner(dls, AWD_LSTM, drop_mult=0.3, metrics=[accuracy]).to_fp16()
        # Diagnostic only: the result is not used, the configured learning rate is.
        learn.lr_find()
        learn.fine_tune(cfg.epochs_1, cfg.learning_rate_1)
        learn.save_encoder(self._LM_ENCODER_NAME)
        return learn

    def create_text_classifier_dataloaders(self, df_trn, dls_lm) -> DataLoaders:
        """
        Create dataloaders for the text classifier.

        Args:
            df_trn (pd.DataFrame): Training data.
            dls_lm (DataLoaders): Dataloaders for the language model to get vocabulary and sequence length.

        Returns:
            DataLoaders: Dataloaders for the text classifier.
        """
        # Reuse the language model's vocabulary and sequence length for the classifier.
        text_block = TextBlock.from_df('content', seq_len=dls_lm.seq_len, vocab=dls_lm.vocab)
        dblock = DataBlock(
            blocks=(text_block, CategoryBlock()),
            get_x=ColReader('text'),
            get_y=ColReader('category'),
            # Seeded so the 20% validation subset is the same on every run.
            splitter=RandomSplitter(0.2, seed=self._SEED))

        return dblock.dataloaders(df_trn, bs=self.model_training_config.batch_size_2)

    def train_text_classifier(self, dls) -> None:
        """
        Train the text classifier in four phases, unfreezing more of the model each time.

        Args:
            dls (DataLoaders): Dataloaders for the text classifier.
        """
        cfg = self.model_training_config
        learn = text_classifier_learner(dls, AWD_LSTM, metrics=[accuracy]).to_fp16()
        learn.load_encoder(self._LM_ENCODER_NAME)
        # Diagnostic only: the result is not used, the configured learning rates are.
        learn.lr_find()

        # Phase 1: train the learner as created, with a single learning rate.
        learn.fit_one_cycle(cfg.epochs_2, cfg.learning_rate_2)

        # Phases 2-3: freeze up to the last two, then last three, parameter groups.
        learn.freeze_to(-2)
        learn.fit_one_cycle(cfg.epochs_3, slice(self._MIN_SLICE_LR, cfg.learning_rate_3))
        learn.freeze_to(-3)
        learn.fit_one_cycle(cfg.epochs_4, slice(self._MIN_SLICE_LR, cfg.learning_rate_4))

        # Phase 4: train the whole model.
        learn.unfreeze()
        learn.fit_one_cycle(cfg.epochs_5, slice(self._MIN_SLICE_LR, cfg.learning_rate_5))

        # NOTE(review): save_encoder persists only the encoder, not the
        # classification head - confirm whether the full model should be saved.
        learn.save_encoder(self._CLF_ENCODER_NAME)

    def run_pipeline(self) -> None:
        """
        Run the complete model training pipeline.
        """
        train = self.load_data()
        # NOTE(review): the stratified hold-out frame is not used by any later
        # step; the classifier validates on a random split of df_trn instead.
        df_trn, _df_val, df_lm = self.prepare_data(train)
        dls_lm = self.create_dataloaders(df_lm)
        # Called for its side effect: the encoder checkpoint the classifier loads.
        self.train_language_model(dls_lm)
        dls_clf = self.create_text_classifier_dataloaders(df_trn, dls_lm)
        self.train_text_classifier(dls_clf)

In [None]:
# Build the configuration, then run the full training pipeline end to end.
try:
    config_manager = ConfigurationManager()
    model_training_config = config_manager.get_model_training_config()
    model_training = ModelTraining(model_training_config=model_training_config)
    model_training.run_pipeline()
except Exception:
    # Bare `raise` re-raises the active exception with its traceback untouched.
    raise