2.2.12 #643
Merged (4 commits) on May 30, 2023
9 changes: 3 additions & 6 deletions .github/workflows/main.yml
@@ -8,10 +8,6 @@ on:
   repository_dispatch:
     types: rebuild

-
-env:
-  CACHE_NUMBER: 0 # increase to reset cache manually
-
 concurrency:
   group: run_tests-${{ github.ref }}
   cancel-in-progress: true
@@ -41,12 +37,13 @@ jobs:
           fetch-depth: 0

       - name: Install Conda environment with Micromamba
-        uses: mamba-org/provision-with-micromamba@main
+        uses: mamba-org/setup-micromamba@v1
         with:
           environment-file: environment.yml
           environment-name: mfa
-          extra-specs: |
+          create-args: >-
            python=3.9
+          cache-environment: true

       - name: Configure mfa
         shell: bash -l {0}
1 change: 1 addition & 0 deletions Dockerfile
@@ -5,6 +5,7 @@ RUN mkdir -p /mfa
 RUN mamba env create -p /env -f docker_environment.yaml && conda clean -afy

 COPY . /pkg
+RUN conda run -p /env python -m pip install speechbrain
 RUN conda run -p /env python -m pip install --no-deps /pkg

 RUN useradd -ms /bin/bash mfauser
7 changes: 7 additions & 0 deletions docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,13 @@
 2.2 Changelog
 *************

+2.2.12
+======
+
+- Re-established support for sqlite for most aspects of MFA (some functionality requires using PostgreSQL)
+- Added a configuration flag for `mfa configure --enable_use_postgres` and `mfa [command] ... --use_postgres` to use PostgreSQL as the database backend
+- Fixed a bug where adapted acoustic models would not contain all the necessary metadata to be used
+
 2.2.11
 ======

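As a rough illustration of the flags described in the 2.2.12 notes above (the command and paths are illustrative, mirroring the first-steps examples later in this diff; only the flag names come from the changelog):

    mfa configure --enable_use_postgres   # make the current profile default to the PostgreSQL backend
    mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip --use_postgres   # opt in for a single run
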
2 changes: 1 addition & 1 deletion docs/source/first_steps/index.rst
@@ -178,7 +178,7 @@ Once we've validated the data, we can train an acoustic model (and output the al

 mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip # Export just the trained acoustic model
 mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/my_corpus_aligned # Export just the training alignments
-mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip ~/mfa_data/my_corpus_aligned # Export both trained model and alignments
+mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip --output_directory ~/mfa_data/my_corpus_aligned # Export both trained model and alignments

 As for other commands, if your data is large, you'll likely want to increase the number of jobs that MFA uses. For that and more advanced configuration of the training command, see :ref:`train_acoustic_model`.

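Concretely, the advice above about increasing the number of jobs would look something like this (a sketch assuming MFA's --num_jobs option; the value 8 is arbitrary):

    mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip --output_directory ~/mfa_data/my_corpus_aligned --num_jobs 8
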
1 change: 1 addition & 0 deletions environment.yml
@@ -26,6 +26,7 @@ dependencies:
   - sqlalchemy>=2.0
   - pgvector
   - pgvector-python
+  - sqlite
   - postgresql
   - psycopg2
   - click
85 changes: 55 additions & 30 deletions montreal_forced_aligner/abc.py
@@ -74,6 +74,9 @@ def __init__(self, args: MfaArguments):
         self.log_path = self.args.log_path

     def db_engine(self):
+        db_string = self.db_string
+        if not GLOBAL_CONFIG.current_profile.use_postgres:
+            db_string += "?mode=ro&nolock=1&uri=true"

         return sqlalchemy.create_engine(
             self.db_string,
@@ -216,7 +219,6 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.db_backend = GLOBAL_CONFIG.database_backend

         self._db_engine = None
         self._db_path = None
@@ -228,7 +230,10 @@ def delete_database(self) -> None:
         Reset all schemas
         """

-        MfaSqlBase.metadata.drop_all(self.db_engine)
+        if GLOBAL_CONFIG.current_profile.use_postgres:
+            MfaSqlBase.metadata.drop_all(self.db_engine)
+        elif self.db_path.exists():
+            os.remove(self.db_path)

     def initialize_database(self) -> None:
         """
@@ -238,26 +243,29 @@ def initialize_database(self) -> None:
             return
         from montreal_forced_aligner.command_line.utils import check_databases

-        exist_check = True
-        try:
-            check_databases(self.identifier)
-        except Exception:
+        if GLOBAL_CONFIG.current_profile.use_postgres:
+            exist_check = True
             try:
-                subprocess.check_call(
-                    [
-                        "createdb",
-                        f"--host={GLOBAL_CONFIG.database_socket}",
-                        self.identifier,
-                    ],
-                    stderr=subprocess.DEVNULL,
-                    stdout=subprocess.DEVNULL,
-                )
+                check_databases(self.identifier)
             except Exception:
-                raise DatabaseError(
-                    f"There was an error connecting to the {GLOBAL_CONFIG.current_profile_name} MFA database server. "
-                    "Please ensure the server is initialized (mfa server init) or running (mfa server start)"
-                )
-            exist_check = False
+                try:
+                    subprocess.check_call(
+                        [
+                            "createdb",
+                            f"--host={GLOBAL_CONFIG.database_socket}",
+                            self.identifier,
+                        ],
+                        stderr=subprocess.DEVNULL,
+                        stdout=subprocess.DEVNULL,
+                    )
+                except Exception:
+                    raise DatabaseError(
+                        f"There was an error connecting to the {GLOBAL_CONFIG.current_profile_name} MFA database server. "
+                        "Please ensure the server is initialized (mfa server init) or running (mfa server start)"
+                    )
+                exist_check = False
+        else:
+            exist_check = self.db_path.exists()
         self.database_initialized = True
         if exist_check:
             if GLOBAL_CONFIG.current_profile.clean or getattr(self, "dirty", False):
@@ -267,11 +275,12 @@ def initialize_database(self) -> None:
                 return

         os.makedirs(self.output_directory, exist_ok=True)
-        with self.db_engine.connect() as conn:
-            conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS vector"))
-            conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
-            conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"))
-            conn.commit()
+        if GLOBAL_CONFIG.current_profile.use_postgres:
+            with self.db_engine.connect() as conn:
+                conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS vector"))
+                conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
+                conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"))
+                conn.commit()

         MfaSqlBase.metadata.create_all(self.db_engine)

@@ -338,9 +347,19 @@ def current_workflow(self) -> CorpusWorkflow:
             return wf

     @property
-    def db_string(self):
+    def db_path(self) -> Path:
+        """Connection path for sqlite database"""
+        return self.output_directory.joinpath(f"{self.identifier}.db")
+
+    @property
+    def db_string(self) -> str:
         """Connection string for the database"""
-        return f"postgresql+psycopg2://@/{self.identifier}?host={GLOBAL_CONFIG.database_socket}"
+        if GLOBAL_CONFIG.use_postgres:
+            return (
+                f"postgresql+psycopg2://@/{self.identifier}?host={GLOBAL_CONFIG.database_socket}"
+            )
+        else:
+            return f"sqlite:///{self.db_path}"

     def construct_engine(self, **kwargs) -> sqlalchemy.engine.Engine:
         """
@@ -358,10 +377,16 @@ def construct_engine(self, **kwargs) -> sqlalchemy.engine.Engine:
         :class:`~sqlalchemy.engine.Engine`
             SqlAlchemy engine
         """
+        db_string = self.db_string
+        if not GLOBAL_CONFIG.use_postgres:
+            if kwargs.pop("read_only", False):
+                db_string += "?mode=ro&nolock=1&uri=true"
+            kwargs["poolclass"] = sqlalchemy.NullPool
+        else:
+            kwargs["pool_size"] = 10
+            kwargs["max_overflow"] = 10
         e = sqlalchemy.create_engine(
-            self.db_string,
-            pool_size=10,
-            max_overflow=10,
+            db_string,
             logging_name="main_process_engine",
             **kwargs,
         ).execution_options(logging_token="main_process_engine")
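
Taken together, the abc.py changes make the database layer backend-aware: each sqlite corpus database lives in its own .db file, read-only sqlite engines get a query-string URI and a NullPool, and PostgreSQL keeps a pooled psycopg2 connection. Below is a minimal standalone sketch of that selection pattern, not MFA's actual code; the make_engine name and socket path are illustrative, while the pool sizes and the read-only query string are taken from the diff:

    import sqlalchemy
    from sqlalchemy.pool import NullPool

    def make_engine(identifier: str, use_postgres: bool, read_only: bool = False) -> sqlalchemy.engine.Engine:
        if use_postgres:
            # PostgreSQL: connect over a local socket directory and keep a sized connection pool
            db_string = f"postgresql+psycopg2://@/{identifier}?host=/tmp/mfa_sockets"
            return sqlalchemy.create_engine(db_string, pool_size=10, max_overflow=10)
        # sqlite: a single file per identifier, no connection pooling
        db_string = f"sqlite:///{identifier}.db"
        if read_only:
            # same read-only options the diff appends for sqlite engines
            db_string += "?mode=ro&nolock=1&uri=true"
        return sqlalchemy.create_engine(db_string, poolclass=NullPool)
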
1 change: 1 addition & 0 deletions montreal_forced_aligner/acoustic_modeling/base.py
@@ -643,6 +643,7 @@ def export_model(self, output_model_path: Path) -> None:
         )
         acoustic_model.add_meta_file(self)
         acoustic_model.add_model(self.working_directory)
+        acoustic_model.add_model(self.worker.phones_dir)
         acoustic_model.add_pronunciation_models(
             self.working_directory, self.worker.dictionary_base_names.values()
         )
1 change: 1 addition & 0 deletions montreal_forced_aligner/acoustic_modeling/trainer.py
@@ -750,6 +750,7 @@ def export_files(
             Flag for including the original text of the corpus files as a tier
         """
         self.align()
+        self.analyze_alignments()
         super(TrainableAligner, self).export_files(
             output_directory, output_format, include_original_text
         )
18 changes: 18 additions & 0 deletions montreal_forced_aligner/alignment/adapting.py
@@ -358,6 +358,23 @@ def meta(self) -> MetaDict:
             "train_date": str(datetime.now()),
             "features": self.feature_options,
             "phone_set_type": str(self.phone_set_type),
+            "dictionaries": {
+                "names": sorted(self.dictionary_base_names.values()),
+                "default": self.dictionary_base_names[self._default_dictionary_id],
+                "silence_word": self.silence_word,
+                "use_g2p": self.use_g2p,
+                "oov_word": self.oov_word,
+                "bracketed_word": self.bracketed_word,
+                "laughter_word": self.laughter_word,
+                "clitic_marker": self.clitic_marker,
+                "position_dependent_phones": self.position_dependent_phones,
+            },
+            "oov_phone": self.oov_phone,
+            "optional_silence_phone": self.optional_silence_phone,
+            "silence_probability": self.silence_probability,
+            "initial_silence_probability": self.initial_silence_probability,
+            "final_silence_correction": self.final_silence_correction,
+            "final_non_silence_correction": self.final_non_silence_correction,
         }
         return data

@@ -377,6 +394,7 @@ def export_model(self, output_model_path: Path) -> None:
         )
         acoustic_model.add_meta_file(self)
         acoustic_model.add_model(self.working_directory)
+        acoustic_model.add_model(self.phones_dir)
         if directory:
             os.makedirs(directory, exist_ok=True)
         basename, _ = os.path.splitext(output_model_path)
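
The keys added to meta above are the "necessary metadata" that the 2.2.12 changelog says adapted acoustic models were missing. For orientation, the exported metadata ends up shaped roughly like this (key names from the diff; every value below is a made-up placeholder rather than real MFA output):

    adapted_meta = {
        "phone_set_type": "ARPA",
        "dictionaries": {
            "names": ["english_us"],
            "default": "english_us",
            "silence_word": "<eps>",
            "use_g2p": False,
            "oov_word": "<unk>",
            "bracketed_word": "[bracketed]",
            "laughter_word": "[laughter]",
            "clitic_marker": "'",
            "position_dependent_phones": True,
        },
        "oov_phone": "spn",
        "optional_silence_phone": "sil",
        "silence_probability": 0.5,
        "initial_silence_probability": 0.5,
        "final_silence_correction": 0.5,
        "final_non_silence_correction": 0.5,
    }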