diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 582c719d..0fae6d0c 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -214,7 +214,7 @@ jobs: git config --global --add safe.directory $(Resolve-Path '.' | % {$_.toString()}) python setup.py egg_info - # Install the Pyrhon requirements + # Install the Python requirements Get-Content .\khiops.egg-info\requires.txt ` | Select-String -Pattern '^\[' -NotMatch ` | Select-String -Pattern '^$' -NotMatch ` diff --git a/CHANGELOG.md b/CHANGELOG.md index ef8543a7..f2cec058 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,7 +87,7 @@ ### Fixed - `core` - - Metric name search in estimator analyis report. + - Metric name search in estimator analysis report. ## 10.2.1.0 - 2024-03-26 @@ -219,7 +219,7 @@ Note: This release marks the open sourcing of Khiops: - Estimators now accept dataframes with numerical column indexes. - `KhiopsClassifier` now accepts integer target vectors. - `classes_` estimator attribute for `KhiopsClassifier` (available once fitted). - - `feature_names_out_` estimator attirbute for `KhiopsEncoder` (available once fitted). + - `feature_names_out_` estimator attribute for `KhiopsEncoder` (available once fitted). - `export_report_file` and `export_dictionary_file` to export Khiops report and dictionary files once the estimators are fitted. - `internal_sort` parameter for estimators that may be used to not sort the tables on the diff --git a/doc/README.md b/doc/README.md index 6dbdf810..62086e55 100644 --- a/doc/README.md +++ b/doc/README.md @@ -15,7 +15,7 @@ Khiops Python library. 
# You'll also need a system-wide installation of pandoc (https://pandoc.org) -# Execute this if there were non commited updates to samples.py or samples_sklearn.py: +# Execute this if there were uncommitted updates to samples.py or samples_sklearn.py: # ./convert-samples-hook # To clean the html documentation diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 9693ba27..22fc4cfd 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -400,7 +400,7 @@ Samples os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train_main["AccidentId"].to_frame() X_test_ids = X_test_main["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") @@ -825,7 +825,7 @@ Samples # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train["AccidentId"].to_frame() X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") @@ -891,7 +891,7 @@ Samples os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train_main["AccidentId"].to_frame() X_test_ids = X_test_main["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") diff --git a/khiops/core/api.py b/khiops/core/api.py index 370ea11b..66f0ec64 100644 --- a/khiops/core/api.py +++ b/khiops/core/api.py @@ -757,7 +757,7 @@ def
train_predictor( Pairs specified with ``specific_pairs`` have top priority: they are constructed first. only_pairs_with : str, default "" - Constructs only pairs with the specifed variable name. If equal to the empty + Constructs only pairs with the specified variable name. If equal to the empty string "" it considers all variables to make pairs. **Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``. group_target_value : bool, default ``False`` @@ -1083,7 +1083,7 @@ def train_recoder( Pairs specified with ``specific_pairs`` have top priority: they are constructed first. only_pairs_with : str, default "" - Constructs only pairs with the specifed variable name. If equal to the empty + Constructs only pairs with the specified variable name. If equal to the empty string "" it considers all variables to make pairs. **Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``. group_target_value : bool, default ``False`` @@ -1133,7 +1133,7 @@ def train_recoder( - "conditional info": Conditional information of the interval/group - "none": Keeps the variable as-is grouping_method : str - Name of the grouping method. Its vaild values depend on the task: + Name of the grouping method. Its valid values depend on the task: - Supervised: "MODL" (default) or "BasicGrouping" - Unsupervised: "BasicGrouping" (default) or "None" min_group_frequency : int, default 0 diff --git a/khiops/core/coclustering_results.py b/khiops/core/coclustering_results.py index a7935239..6ddb8278 100644 --- a/khiops/core/coclustering_results.py +++ b/khiops/core/coclustering_results.py @@ -490,7 +490,7 @@ class CoclusteringDimension: """A coclustering dimension (variable) A coclustering dimension is a hierarchical clustering of an input variable. The - leafs of this hierarchy are linked to an element of a partition of the input + leaves of this hierarchy are linked to an element of a partition of the input variable. Leaf clusters have variable parts as their children. 
It only has a no-parameter constructor. @@ -553,7 +553,7 @@ def __init__(self): # Default group attribute only for categorical dimensions self.default_group = None - # List of the hierarchy clusters, ranging from the root to the leafs + # List of the hierarchy clusters, ranging from the root to the leaves self.root_cluster = None self.clusters = [] @@ -939,7 +939,7 @@ def __init__(self, json_data=None): # Transform to an empty dictionary if json_data is not specified if json_data is None: json_data = {} - # Otherise check the validity of json_data + # Otherwise check the validity of json_data elif "cluster" not in json_data: raise KhiopsJSONError("'cluster' key not found") @@ -955,7 +955,7 @@ class CoclusteringDimensionPartInterval(CoclusteringDimensionPart): json_data : dict, optional Python dictionary representing an element of type "Numerical" of the list at the ``dimensionPartitions`` field of a Khiops Coclustering JSON report file. If not - specifed it returns an empty instance. + specified it returns an empty instance. 
Raises ------ @@ -1205,7 +1205,7 @@ def __init__(self, json_data=None): # Transform to an empty dictionary if json_data is not specified if json_data is None: json_data = {} - # Otherwise check the "cluter" and "parentCluster" keys + # Otherwise check the "cluster" and "parentCluster" keys else: if "cluster" not in json_data: raise KhiopsJSONError("'cluster' key not found") @@ -1224,7 +1224,7 @@ def __init__(self, json_data=None): self.short_description = json_data.get("shortDescription", "") self.description = json_data.get("description", "") - # Link to child clusters, None for the leafs of the hierarchy + # Link to child clusters, None for the leaves of the hierarchy # The user must specify the CoclusteringCluster references parent_cluster # and child_cluster that link this instance to the hierarchy self.parent_cluster = None diff --git a/khiops/core/internals/filesystems.py b/khiops/core/internals/filesystems.py index e10f45fd..37b8a3fe 100644 --- a/khiops/core/internals/filesystems.py +++ b/khiops/core/internals/filesystems.py @@ -19,7 +19,7 @@ # pylint: disable=invalid-name # Import boto3 if available -# Delay the raising of an ImportError to an instantation of a AmazonS3Resource +# Delay raising the ImportError until an AmazonS3Resource is instantiated try: import boto3 import boto3.session @@ -30,7 +30,7 @@ boto3_import_error = import_error # Import google.could if available -# Delay the raising of an ImportError to an instantation of a GoogleCloudStorageResource +# Delay raising the ImportError until a GoogleCloudStorageResource is instantiated try: from google.cloud import storage diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 529136e8..2187ab94 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -365,7 +365,7 @@ def samples_dir(self): return self._get_samples_dir() def _get_samples_dir(self): - """To be overriden by subclasses""" + """To be overridden by subclasses""" return
self._samples_dir @samples_dir.setter @@ -386,7 +386,7 @@ def khiops_version(self): return self._get_khiops_version() def _get_khiops_version(self): - """khiops_version getter to be overriden by subclasses""" + """khiops_version getter to be overridden by subclasses""" return self._khiops_version def _build_status_message(self): @@ -1072,7 +1072,7 @@ def _set_samples_dir(self, samples_dir): super()._set_samples_dir(samples_dir) def _get_samples_dir(self): - # Check the samples dir once (the check emmits only warnings) + # Check the samples dir once (the check emits only warnings) if not self._samples_dir_checked: _check_samples_dir(self._samples_dir) self._samples_dir_checked = True diff --git a/khiops/core/internals/scenario.py b/khiops/core/internals/scenario.py index ff025a7d..e9fd74df 100644 --- a/khiops/core/internals/scenario.py +++ b/khiops/core/internals/scenario.py @@ -82,7 +82,7 @@ def _parse_template(self): line = next(line_iter, None) line_num += 1 - # Eliminate empty lines at the beggining and end if any + # Eliminate empty lines at the beginning and end if any if self._parsed_template[0] == "": self._parsed_template.pop(0) if self._parsed_template[-1] == "": @@ -128,7 +128,7 @@ def _parse_section(self, section_keyword, line_iter): if statement_re.match(statement) is None: raise ValueError( "Statement must contain only alphabetic characters and '.' " - f"(no '.' at the beggining): '{statement}'" + f"(no '.' 
at the beginning): '{statement}'" ) return section_param_name, section_spec diff --git a/khiops/core/internals/task.py b/khiops/core/internals/task.py index 57f59525..12baf979 100644 --- a/khiops/core/internals/task.py +++ b/khiops/core/internals/task.py @@ -210,7 +210,7 @@ def __init__( for kwarg_signature in self.kwargs_signature } - # Post-intialization checks + # Post-initialization checks # Check that the path_valued_arg_names are contained in either args or kwargs all_arg_names = list(self._args_signature_by_name.keys()) + list( self._kwargs_signature_by_name.keys() diff --git a/khiops/core/internals/version.py b/khiops/core/internals/version.py index e4303762..d84b8be4 100644 --- a/khiops/core/internals/version.py +++ b/khiops/core/internals/version.py @@ -97,7 +97,7 @@ def __init__(self, version_str): version_str, ) - # Store the rest of the prelease (if any) and check it is a number + # Store the rest of the prerelease (if any) and check it is a number # We accept not having a "." in the pre-release increment for backward # compatibility. 
self._pre_release_increment = _pre_release.replace( diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 66bab90b..e2aec0f1 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -464,7 +464,7 @@ " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", ")\n", "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "# Split the secondary dataframe with the keys of the split root dataframe\n", "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", @@ -1006,7 +1006,7 @@ "# Load the secondary table of the dataset into a pandas dataframe\n", "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "# Split the secondary dataframe with the keys of the split root dataframe\n", "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", @@ -1085,7 +1085,7 @@ " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", ")\n", "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "# Split the secondary dataframe with the keys of the split root dataframe\n", "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index b6fa0bee..da591810 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -405,7 +405,7 @@ def khiops_classifier_with_hyperparameters(): 
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train_main["AccidentId"].to_frame() X_test_ids = X_test_main["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") @@ -882,7 +882,7 @@ def khiops_classifier_multitable_list(): os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train["AccidentId"].to_frame() X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") @@ -956,7 +956,7 @@ def khiops_classifier_multitable_star_file(): os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe + # Split the secondary dataframe with the keys of the split root dataframe X_train_ids = X_train_main["AccidentId"].to_frame() X_test_ids = X_test_main["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") diff --git a/khiops/sklearn/dataset.py b/khiops/sklearn/dataset.py index 45ff069f..26da745f 100644 --- a/khiops/sklearn/dataset.py +++ b/khiops/sklearn/dataset.py @@ -223,7 +223,7 @@ def _check_relations_entry(main_table_name, tables_spec, relations_spec): f"Each relation must be unique." 
) - # Check hierachical keys + # Check hierarchical keys _check_hierarchical_keys( i, parent_table, @@ -694,7 +694,7 @@ def _init_tables_from_mapping(self, X): ) self.secondary_tables = [] - # If the relations are not specified intialize to a star schema + # If the relations are not specified initialize to a star schema if "relations" not in X: self.relations = [ (self.main_table.name, table.name, False) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 0f68232a..afa18a7a 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -680,7 +680,7 @@ class KhiopsCoclustering(ClusterMixin, KhiopsEstimator): max_part_numbers : dict, optional Maximum number of clusters for each of the co-clustered column. Specifically, a key-value pair of this dictionary represents the column name and its respective - maximum number of clusters. If not specified there is no maximun number of + maximum number of clusters. If not specified, no maximum number of clusters is imposed on any column. **Deprecated** will be removed in Khiops 11. Use the ``max_part_number`` parameter of the `fit` method. @@ -1294,7 +1294,7 @@ def _transform_check_dataset(self, ds): # The "model dictionary domain" in the coclustering case it is just composed # of the secondary table. The main "keys" table is a technical object.
# So we check the compatibility against only this dictionary and override - # the parents implementaion + # the parents implementation for dictionary in self.model_.dictionaries: if dictionary.name != self.model_main_dictionary_name_: _check_dictionary_compatibility( @@ -1702,7 +1702,7 @@ def _transform_check_dataset(self, ds): # Call the parent method super()._transform_check_dataset(ds) - # Check the coherence between thi input table and the model + # Check the coherence between the input table and the model if self.is_multitable_model_ and not ds.is_multitable: raise ValueError( "You are trying to apply on single-table inputs a model which has " @@ -2263,7 +2263,7 @@ def predict(self, X): y_pred = super().predict(X) # Adjust the data type according to the original target type - # Note: String is coerced explictly because astype does not work as expected + # Note: String is coerced explicitly because astype does not work as expected if isinstance(y_pred, pd.DataFrame): # Transform to numpy.ndarray y_pred = y_pred.to_numpy(copy=False).ravel() @@ -2332,7 +2332,7 @@ def predict_proba(self, X): # Create the input dataset - # Call the generic transfrom method + # Call the generic transform method try: ds = Dataset(X, key=self.key) y_probas = self._transform( diff --git a/tests/test_core.py b/tests/test_core.py index ca6008e5..7f0ecfbd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1290,7 +1290,7 @@ def _test_evaluation_report_accessors( with self.assertRaises(KeyError): report.get_predictor_performance("INEXISTENT REPORT NAME") - # Test anomalous access to perfomance objects + # Test anomalous access to performance objects for predictor_name in report.get_predictor_names(): self._test_performance_report_accessors( result_file_name, @@ -1750,7 +1750,7 @@ def test_dictionary_extract_data_paths(self): test_resources_dir = os.path.join(resources_dir(), "dictionary") ref_kdicj_dir = os.path.join(test_resources_dir, "ref_kdicj") - # Set the expeced outputs + # 
Set the expected outputs expected_data_paths = { "Adult": {"Adult": []}, "SpliceJunction": { @@ -2482,7 +2482,7 @@ def test_scenario_generation(self): scenario.write(writer, arguments[template_name, argument_set]) output_scenario = stream.getvalue().decode("ascii").replace("\r", "") - # Compare the output scenario and the expected fixutre + # Compare the output scenario and the expected fixture self.assertEqual(output_scenario, expected_scenario) def test_invalid_templates(self): diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index d00a6660..5c337930 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -30,7 +30,7 @@ class AnotherType(object): class DatasetSpecErrorsTests(unittest.TestCase): """Test the output message when the input data contains errors - Each test covers an egde-case for the initialization of Dataset/DatasetTable and + Each test covers an edge-case for the initialization of Dataset/DatasetTable and checks: - that either `TypeError` or `ValueError` is raised and @@ -796,7 +796,7 @@ def test_dict_spec_table_relation_must_be_str(self): ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - def test_dict_spec_entiy_flag_relation_must_be_bool(self): + def test_dict_spec_entity_flag_relation_must_be_bool(self): """Test Dataset raising TypeError when the entity flag is not boolean""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["relations"][0] = ("B", "D", AnotherType()) diff --git a/tests/test_remote_access.py b/tests/test_remote_access.py index e1744958..ad3bbc96 100644 --- a/tests/test_remote_access.py +++ b/tests/test_remote_access.py @@ -62,11 +62,11 @@ def results_dir_root(self): return os.curdir def config_exists(self): - """To be overriden by descendants""" + """To be overridden by descendants""" return False def remote_access_test_case(self): - """To be overriden by descendants""" + """To be overridden by descendants""" return "" def print_test_title(self): @@ -258,11 
+258,11 @@ def setUpClass(cls): """Sets up docker runner service for this test case This method executes the following steps: - If environment variable ``KHIOPS_RUNNER_SERVICE_PATH`` is set then it - launches the service and makes sure it is operational before excuting the + launches the service and makes sure it is operational before executing the test case. Otherwise it skips the test case. - Then it copies ``samples`` to a shared directory accessible to both the local Khiops runner service and the process using Khiops Python. - - Finnaly it creates create the `.KhiopsDockerRunner` client for the + - Finally it creates the `.KhiopsDockerRunner` client for the Khiops service and set it as current runner. """ # Save the initial Khiops Python runner diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 67b7ffcd..ea656d40 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1857,7 +1857,7 @@ def _define_resources(self, dataset, estimator_type, estimator_method): assert estimator_method == "fit", f"Real: {estimator_method}" prediction_table_path = "" - # Buld the resources + # Build the resources resources = { "report_path": report_path, "model_kdic_path": model_kdic_path, diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py index 7e728f70..868317dd 100644 --- a/tests/test_sklearn_output_types.py +++ b/tests/test_sklearn_output_types.py @@ -25,7 +25,7 @@ def create_iris(): """Returns a mono table iris dataset""" X_iris_array, y_iris_array = datasets.load_iris(return_X_y=True) X_iris_df = pd.DataFrame( - X_iris_array, columns=["SepalLenght", "SepalWidth", "PetalLength", "PetalWidth"] + X_iris_array, columns=["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"] ) y_iris_series = pd.Series(y_iris_array, name="Class") return X_iris_df, y_iris_series @@ -39,7 +39,7 @@ def create_iris_mt(): id_vars=["Id"], var_name="Measurement", value_name="Value" ) X_iris_df = X_iris_df.drop( - ["SepalLenght", "SepalWidth", 
"PetalLength", "PetalWidth"], axis=1 + ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"], axis=1 ) return X_iris_df, X_iris_sec_df, y_iris_series