Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ jobs:
git config --global --add safe.directory $(Resolve-Path '.' | % {$_.toString()})
python setup.py egg_info

# Install the Pyrhon requirements
# Install the Python requirements
Get-Content .\khiops.egg-info\requires.txt `
| Select-String -Pattern '^\[' -NotMatch `
| Select-String -Pattern '^$' -NotMatch `
Expand Down
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@

### Fixed
- `core`
- Metric name search in estimator analyis report.
- Metric name search in estimator analysis report.

## 10.2.1.0 - 2024-03-26

Expand Down Expand Up @@ -219,7 +219,7 @@ Note: This release marks the open sourcing of Khiops:
- Estimators now accept dataframes with numerical column indexes.
- `KhiopsClassifier` now accepts integer target vectors.
- `classes_` estimator attribute for `KhiopsClassifier` (available once fitted).
- `feature_names_out_` estimator attirbute for `KhiopsEncoder` (available once fitted).
- `feature_names_out_` estimator attribute for `KhiopsEncoder` (available once fitted).
- `export_report_file` and `export_dictionary_file` to export Khiops report and dictionary files
once the estimators are fitted.
- `internal_sort` parameter for estimators that may be used to not sort the tables on the
Expand Down
2 changes: 1 addition & 1 deletion doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Khiops Python library.

# You'll also need a system-wide installation of pandoc (https://pandoc.org)

# Execute this if there were non commited updates to samples.py or samples_sklearn.py:
# Execute this if there were uncommitted updates to samples.py or samples_sklearn.py:
# ./convert-samples-hook

# To clean the html documentation
Expand Down
6 changes: 3 additions & 3 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ Samples
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -825,7 +825,7 @@ Samples
# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train["AccidentId"].to_frame()
X_test_ids = X_test["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -891,7 +891,7 @@ Samples
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down
6 changes: 3 additions & 3 deletions khiops/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ def train_predictor(
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
only_pairs_with : str, default ""
Constructs only pairs with the specifed variable name. If equal to the empty
Constructs only pairs with the specified variable name. If equal to the empty
string "" it considers all variables to make pairs.
**Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
group_target_value : bool, default ``False``
Expand Down Expand Up @@ -1083,7 +1083,7 @@ def train_recoder(
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
only_pairs_with : str, default ""
Constructs only pairs with the specifed variable name. If equal to the empty
Constructs only pairs with the specified variable name. If equal to the empty
string "" it considers all variables to make pairs.
**Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
group_target_value : bool, default ``False``
Expand Down Expand Up @@ -1133,7 +1133,7 @@ def train_recoder(
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
grouping_method : str
Name of the grouping method. Its vaild values depend on the task:
Name of the grouping method. Its valid values depend on the task:
- Supervised: "MODL" (default) or "BasicGrouping"
- Unsupervised: "BasicGrouping" (default) or "None"
min_group_frequency : int, default 0
Expand Down
12 changes: 6 additions & 6 deletions khiops/core/coclustering_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ class CoclusteringDimension:
"""A coclustering dimension (variable)

A coclustering dimension is a hierarchical clustering of an input variable. The
leafs of this hierarchy are linked to an element of a partition of the input
leaves of this hierarchy are linked to an element of a partition of the input
variable. Leaf clusters have variable parts as their children.

It only has a no-parameter constructor.
Expand Down Expand Up @@ -553,7 +553,7 @@ def __init__(self):
# Default group attribute only for categorical dimensions
self.default_group = None

# List of the hierarchy clusters, ranging from the root to the leafs
# List of the hierarchy clusters, ranging from the root to the leaves
self.root_cluster = None
self.clusters = []

Expand Down Expand Up @@ -939,7 +939,7 @@ def __init__(self, json_data=None):
# Transform to an empty dictionary if json_data is not specified
if json_data is None:
json_data = {}
# Otherise check the validity of json_data
# Otherwise check the validity of json_data
elif "cluster" not in json_data:
raise KhiopsJSONError("'cluster' key not found")

Expand All @@ -955,7 +955,7 @@ class CoclusteringDimensionPartInterval(CoclusteringDimensionPart):
json_data : dict, optional
Python dictionary representing an element of type "Numerical" of the list at the
``dimensionPartitions`` field of a Khiops Coclustering JSON report file. If not
specifed it returns an empty instance.
specified it returns an empty instance.

Raises
------
Expand Down Expand Up @@ -1205,7 +1205,7 @@ def __init__(self, json_data=None):
# Transform to an empty dictionary if json_data is not specified
if json_data is None:
json_data = {}
# Otherwise check the "cluter" and "parentCluster" keys
# Otherwise check the "cluster" and "parentCluster" keys
else:
if "cluster" not in json_data:
raise KhiopsJSONError("'cluster' key not found")
Expand All @@ -1224,7 +1224,7 @@ def __init__(self, json_data=None):
self.short_description = json_data.get("shortDescription", "")
self.description = json_data.get("description", "")

# Link to child clusters, None for the leafs of the hierarchy
# Link to child clusters, None for the leaves of the hierarchy
# The user must specify the CoclusteringCluster references parent_cluster
# and child_cluster that link this instance to the hierarchy
self.parent_cluster = None
Expand Down
4 changes: 2 additions & 2 deletions khiops/core/internals/filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# pylint: disable=invalid-name

# Import boto3 if available
# Delay the raising of an ImportError to an instantation of a AmazonS3Resource
# Delay an ImportError raising to an instantiation of a AmazonS3Resource
try:
import boto3
import boto3.session
Expand All @@ -30,7 +30,7 @@
boto3_import_error = import_error

# Import google.could if available
# Delay the raising of an ImportError to an instantation of a GoogleCloudStorageResource
# Delay an ImportError raising to an instantiation of a GoogleCloudStorageResource
try:
from google.cloud import storage

Expand Down
6 changes: 3 additions & 3 deletions khiops/core/internals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def samples_dir(self):
return self._get_samples_dir()

def _get_samples_dir(self):
"""To be overriden by subclasses"""
"""To be overridden by subclasses"""
return self._samples_dir

@samples_dir.setter
Expand All @@ -386,7 +386,7 @@ def khiops_version(self):
return self._get_khiops_version()

def _get_khiops_version(self):
"""khiops_version getter to be overriden by subclasses"""
"""khiops_version getter to be overridden by subclasses"""
return self._khiops_version

def _build_status_message(self):
Expand Down Expand Up @@ -1072,7 +1072,7 @@ def _set_samples_dir(self, samples_dir):
super()._set_samples_dir(samples_dir)

def _get_samples_dir(self):
# Check the samples dir once (the check emmits only warnings)
# Check the samples dir once (the check emits only warnings)
if not self._samples_dir_checked:
_check_samples_dir(self._samples_dir)
self._samples_dir_checked = True
Expand Down
4 changes: 2 additions & 2 deletions khiops/core/internals/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _parse_template(self):
line = next(line_iter, None)
line_num += 1

# Eliminate empty lines at the beggining and end if any
# Eliminate empty lines at the beginning and end if any
if self._parsed_template[0] == "":
self._parsed_template.pop(0)
if self._parsed_template[-1] == "":
Expand Down Expand Up @@ -128,7 +128,7 @@ def _parse_section(self, section_keyword, line_iter):
if statement_re.match(statement) is None:
raise ValueError(
"Statement must contain only alphabetic characters and '.' "
f"(no '.' at the beggining): '{statement}'"
f"(no '.' at the beginning): '{statement}'"
)

return section_param_name, section_spec
Expand Down
2 changes: 1 addition & 1 deletion khiops/core/internals/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def __init__(
for kwarg_signature in self.kwargs_signature
}

# Post-intialization checks
# Post-initialization checks
# Check that the path_valued_arg_names are contained in either args or kwargs
all_arg_names = list(self._args_signature_by_name.keys()) + list(
self._kwargs_signature_by_name.keys()
Expand Down
2 changes: 1 addition & 1 deletion khiops/core/internals/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def __init__(self, version_str):
version_str,
)

# Store the rest of the prelease (if any) and check it is a number
# Store the rest of the prerelease (if any) and check it is a number
# We accept not having a "." in the pre-release increment for backward
# compatibility.
self._pre_release_increment = _pre_release.replace(
Expand Down
6 changes: 3 additions & 3 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down Expand Up @@ -1006,7 +1006,7 @@
"# Load the secondary table of the dataset into a pandas dataframe\n",
"vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down Expand Up @@ -1085,7 +1085,7 @@
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down
6 changes: 3 additions & 3 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def khiops_classifier_with_hyperparameters():
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -882,7 +882,7 @@ def khiops_classifier_multitable_list():
os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train["AccidentId"].to_frame()
X_test_ids = X_test["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -956,7 +956,7 @@ def khiops_classifier_multitable_star_file():
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down
4 changes: 2 additions & 2 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def _check_relations_entry(main_table_name, tables_spec, relations_spec):
f"Each relation must be unique."
)

# Check hierachical keys
# Check hierarchical keys
_check_hierarchical_keys(
i,
parent_table,
Expand Down Expand Up @@ -694,7 +694,7 @@ def _init_tables_from_mapping(self, X):
)
self.secondary_tables = []

# If the relations are not specified intialize to a star schema
# If the relations are not specified initialize to a star schema
if "relations" not in X:
self.relations = [
(self.main_table.name, table.name, False)
Expand Down
10 changes: 5 additions & 5 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ class KhiopsCoclustering(ClusterMixin, KhiopsEstimator):
max_part_numbers : dict, optional
Maximum number of clusters for each of the co-clustered column. Specifically, a
key-value pair of this dictionary represents the column name and its respective
maximum number of clusters. If not specified there is no maximun number of
maximum number of clusters. If not specified there is no maximum number of
clusters is imposed on any column.
**Deprecated** will be removed in Khiops 11. Use the ``max_part_number``
parameter of the `fit` method.
Expand Down Expand Up @@ -1294,7 +1294,7 @@ def _transform_check_dataset(self, ds):
# The "model dictionary domain" in the coclustering case it is just composed
# of the secondary table. The main "keys" table is a technical object.
# So we check the compatibility against only this dictionary and override
# the parents implementaion
# the parents implementation
for dictionary in self.model_.dictionaries:
if dictionary.name != self.model_main_dictionary_name_:
_check_dictionary_compatibility(
Expand Down Expand Up @@ -1702,7 +1702,7 @@ def _transform_check_dataset(self, ds):
# Call the parent method
super()._transform_check_dataset(ds)

# Check the coherence between thi input table and the model
# Check the coherence between the input table and the model
if self.is_multitable_model_ and not ds.is_multitable:
raise ValueError(
"You are trying to apply on single-table inputs a model which has "
Expand Down Expand Up @@ -2263,7 +2263,7 @@ def predict(self, X):
y_pred = super().predict(X)

# Adjust the data type according to the original target type
# Note: String is coerced explictly because astype does not work as expected
# Note: String is coerced explicitly because astype does not work as expected
if isinstance(y_pred, pd.DataFrame):
# Transform to numpy.ndarray
y_pred = y_pred.to_numpy(copy=False).ravel()
Expand Down Expand Up @@ -2332,7 +2332,7 @@ def predict_proba(self, X):

# Create the input dataset

# Call the generic transfrom method
# Call the generic transform method
try:
ds = Dataset(X, key=self.key)
y_probas = self._transform(
Expand Down
6 changes: 3 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ def _test_evaluation_report_accessors(
with self.assertRaises(KeyError):
report.get_predictor_performance("INEXISTENT REPORT NAME")

# Test anomalous access to perfomance objects
# Test anomalous access to performance objects
for predictor_name in report.get_predictor_names():
self._test_performance_report_accessors(
result_file_name,
Expand Down Expand Up @@ -1750,7 +1750,7 @@ def test_dictionary_extract_data_paths(self):
test_resources_dir = os.path.join(resources_dir(), "dictionary")
ref_kdicj_dir = os.path.join(test_resources_dir, "ref_kdicj")

# Set the expeced outputs
# Set the expected outputs
expected_data_paths = {
"Adult": {"Adult": []},
"SpliceJunction": {
Expand Down Expand Up @@ -2482,7 +2482,7 @@ def test_scenario_generation(self):
scenario.write(writer, arguments[template_name, argument_set])
output_scenario = stream.getvalue().decode("ascii").replace("\r", "")

# Compare the output scenario and the expected fixutre
# Compare the output scenario and the expected fixture
self.assertEqual(output_scenario, expected_scenario)

def test_invalid_templates(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_dataset_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class AnotherType(object):
class DatasetSpecErrorsTests(unittest.TestCase):
"""Test the output message when the input data contains errors

Each test covers an egde-case for the initialization of Dataset/DatasetTable and
Each test covers an edge-case for the initialization of Dataset/DatasetTable and
checks:

- that either `TypeError` or `ValueError` is raised and
Expand Down Expand Up @@ -796,7 +796,7 @@ def test_dict_spec_table_relation_must_be_str(self):
)
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)

def test_dict_spec_entiy_flag_relation_must_be_bool(self):
def test_dict_spec_entity_flag_relation_must_be_bool(self):
"""Test Dataset raising TypeError when the entity flag is not boolean"""
bad_spec, y = self.create_fixture_dataset_spec()
bad_spec["relations"][0] = ("B", "D", AnotherType())
Expand Down
Loading