In [1]:
import os
# Move the working directory up one level; the relative paths used later in
# this notebook (e.g. "./artifacts/...") are resolved against the new location.
# NOTE(review): this is not idempotent - re-running the cell moves up one more
# level each time, so run it once per kernel session.
os.chdir("../") 

# IPython magic: display the current working directory to confirm the move.
%pwd

'c:\\Users\\P52s\\Documents\\Python Project\\SpellX'

In [11]:
from dataclasses import dataclass
from pathlib import Path
import spello

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Directory under which the trained model pickles are written.
    root_dir: Path
    # First argument handed to ``read_text`` when loading additional training data.
    data_path: Path
    # Second argument handed to ``read_text`` when loading additional training data.
    data_file: Path

In [12]:
from spellX.constants import *
from spellX.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction both YAML files are read via ``read_yaml`` and the
    directory named by ``artifacts_root`` in the main config is passed to
    ``create_directories``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Read the configuration and parameter files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Make sure the top-level artifacts directory is in place.
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Return the settings for the model-training stage.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the config object is built.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer``
            section of the main configuration.
        """
        trainer_section = self.config.model_trainer

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            data_file=trainer_section.data_file,
        )

In [14]:
from spello.model import SpellCorrectionModel
import re
import pickle
from spellX.utils.trainer  import read_text

class ModelTrainer:
    """Train a spello ``SpellCorrectionModel`` and persist it as a pickle.

    Models are stored as ``<root_dir>/model<N>.pkl`` where ``N`` is
    ``self.model_count``. The counter starts at 0 for every new instance, so a
    fresh instance always looks for ``model0.pkl`` first.
    """

    # Corpus used for the first training run, when no model pickle exists yet.
    DEFAULT_CORPUS_PATH = "./artifacts/data_ingestion/data/train/big.txt"

    def __init__(self, config: ModelTrainerConfig):
        """Store the stage configuration and reset the model counter.

        Args:
            config: Settings for this stage (``root_dir``, ``data_path``,
                ``data_file``).
        """
        self.config = config
        self.model_count = 0
        # Most recently trained model; populated by ``train``.
        self.sp = None

    def get_model_filename(self):
        """Return the pickle path for the model numbered ``self.model_count``."""
        return f"{self.config.root_dir}/model{self.model_count}.pkl"

    @staticmethod
    def _clean_lines(lines):
        """Normalise raw corpus lines into letters-only training sentences.

        Each line is stripped, tabs become spaces, apostrophes are removed and
        every run of non-letter characters collapses to a single space. Lines
        that end up empty (blank lines, or lines made only of digits or
        punctuation) are dropped so no empty strings reach the trainer.
        """
        cleaned = []
        for line in lines:
            text = line.strip().replace("\t", " ").replace("'", "")
            text = re.sub(r'[^a-zA-Z]+', ' ', text).strip()
            if text:
                cleaned.append(text)
        return cleaned

    @staticmethod
    def _save_model(model, path):
        """Pickle ``model`` to ``path``."""
        with open(path, 'wb') as file:
            pickle.dump(model, file)

    def train(self, corpus_path=None):
        """Train a model (or retrain the stored one) and save the result.

        If the pickle for the current ``model_count`` does not exist, a new
        English model is trained on the cleaned lines of ``corpus_path`` and
        saved under that name. Otherwise the stored model is loaded, ``train``
        is called on it with the data returned by ``read_text``, the counter is
        incremented and the result is saved under the next model filename, so
        the retrained model is not lost. In both cases the model is kept on
        ``self.sp``.

        Args:
            corpus_path: Text file used for the first training run. Defaults
                to ``DEFAULT_CORPUS_PATH``.
        """
        if corpus_path is None:
            corpus_path = self.DEFAULT_CORPUS_PATH

        model_path = self.get_model_filename()

        if not os.path.exists(model_path):
            with open(corpus_path, "r") as f:
                raw_lines = f.readlines()

            sp = SpellCorrectionModel(language='en')
            sp.train(self._clean_lines(raw_lines))
        else:
            # SECURITY: pickle.load can execute arbitrary code contained in the
            # file; only load model files written by this pipeline.
            with open(model_path, 'rb') as file:
                sp = pickle.load(file)
            data = read_text(self.config.data_path, self.config.data_file)
            # NOTE(review): whether spello's train() extends the loaded model or
            # rebuilds it from `data` alone is not visible here - confirm.
            sp.train(data)

            # Advance the counter so the retrained model gets its own file
            # instead of being discarded.
            self.model_count += 1
            model_path = self.get_model_filename()

        self.sp = sp
        self._save_model(sp, model_path)
                





In [15]:
# Build the configuration, construct the trainer and run training.
# No try/except here: a handler that only re-raises the same exception adds
# nothing, so errors simply propagate to the notebook.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# NOTE: the name is rebound from the config object to the ModelTrainer
# instance; the binding is kept as-is in case other cells read it.
model_trainer_config = ModelTrainer(config=model_trainer_config)
model_trainer_config.train()

[2023-09-07 23:51:27,590: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-07 23:51:27,599: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-07 23:51:27,599: INFO: common: created directory at: artifacts]
[2023-09-07 23:51:27,607: INFO: common: created directory at: artifacts/model_trainer]


Spello training started..
[2023-09-07 23:51:30,191: DEBUG: model: Spello training started..]
Context model training started ...
[2023-09-07 23:51:30,947: DEBUG: model: Context model training started ...]
Symspell training started ...
[2023-09-07 23:52:04,197: DEBUG: model: Symspell training started ...]
[2023-09-07 23:52:04,205: INFO: symspell: Creating spell check dictionary...]
[2023-09-07 23:53:06,359: INFO: symspell: 62.16 seconds to run]
[2023-09-07 23:53:06,367: INFO: symspell: total words processed: 29624]
[2023-09-07 23:53:06,368: INFO: symspell: total unique words in corpus: 29624]
[2023-09-07 23:53:06,371: INFO: symspell: total items in dictionary (corpus words & deletions): 2112195]
[2023-09-07 23:53:06,371: INFO: symspell: edit distance for deletions: 3]
[2023-09-07 23:53:06,371: INFO: symspell: length of longest word in corpus: 18]
Phoneme training started ...
[2023-09-07 23:53:06,371: DEBUG: model: Phoneme training started ...]
Spello training completed successfully ...
[