Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ jobs:
git config --global --add safe.directory $(Resolve-Path '.' | % {$_.toString()})
python setup.py egg_info

# Install the Pyrhon requirements
# Install the Python requirements
Get-Content .\khiops.egg-info\requires.txt `
| Select-String -Pattern '^\[' -NotMatch `
| Select-String -Pattern '^$' -NotMatch `
Expand Down
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@

### Fixed
- `core`
- Metric name search in estimator analyis report.
- Metric name search in estimator analysis report.

## 10.2.1.0 - 2024-03-26

Expand Down Expand Up @@ -219,7 +219,7 @@ Note: This release marks the open sourcing of Khiops:
- Estimators now accept dataframes with numerical column indexes.
- `KhiopsClassifier` now accepts integer target vectors.
- `classes_` estimator attribute for `KhiopsClassifier` (available once fitted).
- `feature_names_out_` estimator attirbute for `KhiopsEncoder` (available once fitted).
- `feature_names_out_` estimator attribute for `KhiopsEncoder` (available once fitted).
- `export_report_file` and `export_dictionary_file` to export Khiops report and dictionary files
once the estimators are fitted.
- `internal_sort` parameter for estimators that may be used to not sort the tables on the
Expand Down
2 changes: 1 addition & 1 deletion doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Khiops Python library.

# You'll also need a system-wide installation of pandoc (https://pandoc.org)

# Execute this if there were non commited updates to samples.py or samples_sklearn.py:
# Execute this if there were uncommitted updates to samples.py or samples_sklearn.py:
# ./convert-samples-hook

# To clean the html documentation
Expand Down
6 changes: 3 additions & 3 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ Samples
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -825,7 +825,7 @@ Samples
# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train["AccidentId"].to_frame()
X_test_ids = X_test["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -891,7 +891,7 @@ Samples
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down
6 changes: 3 additions & 3 deletions khiops/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ def train_predictor(
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
only_pairs_with : str, default ""
Constructs only pairs with the specifed variable name. If equal to the empty
Constructs only pairs with the specified variable name. If equal to the empty
string "" it considers all variables to make pairs.
**Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
group_target_value : bool, default ``False``
Expand Down Expand Up @@ -1083,7 +1083,7 @@ def train_recoder(
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
only_pairs_with : str, default ""
Constructs only pairs with the specifed variable name. If equal to the empty
Constructs only pairs with the specified variable name. If equal to the empty
string "" it considers all variables to make pairs.
**Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
group_target_value : bool, default ``False``
Expand Down Expand Up @@ -1133,7 +1133,7 @@ def train_recoder(
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
grouping_method : str
Name of the grouping method. Its vaild values depend on the task:
Name of the grouping method. Its valid values depend on the task:
- Supervised: "MODL" (default) or "BasicGrouping"
- Unsupervised: "BasicGrouping" (default) or "None"
min_group_frequency : int, default 0
Expand Down
12 changes: 6 additions & 6 deletions khiops/core/coclustering_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ class CoclusteringDimension:
"""A coclustering dimension (variable)

A coclustering dimension is a hierarchical clustering of an input variable. The
leafs of this hierarchy are linked to an element of a partition of the input
leaves of this hierarchy are linked to an element of a partition of the input
variable. Leaf clusters have variable parts as their children.

It only has a no-parameter constructor.
Expand Down Expand Up @@ -553,7 +553,7 @@ def __init__(self):
# Default group attribute only for categorical dimensions
self.default_group = None

# List of the hierarchy clusters, ranging from the root to the leafs
# List of the hierarchy clusters, ranging from the root to the leaves
self.root_cluster = None
self.clusters = []

Expand Down Expand Up @@ -939,7 +939,7 @@ def __init__(self, json_data=None):
# Transform to an empty dictionary if json_data is not specified
if json_data is None:
json_data = {}
# Otherise check the validity of json_data
# Otherwise check the validity of json_data
elif "cluster" not in json_data:
raise KhiopsJSONError("'cluster' key not found")

Expand All @@ -955,7 +955,7 @@ class CoclusteringDimensionPartInterval(CoclusteringDimensionPart):
json_data : dict, optional
Python dictionary representing an element of type "Numerical" of the list at the
``dimensionPartitions`` field of a Khiops Coclustering JSON report file. If not
specifed it returns an empty instance.
specified it returns an empty instance.

Raises
------
Expand Down Expand Up @@ -1205,7 +1205,7 @@ def __init__(self, json_data=None):
# Transform to an empty dictionary if json_data is not specified
if json_data is None:
json_data = {}
# Otherwise check the "cluter" and "parentCluster" keys
# Otherwise check the "cluster" and "parentCluster" keys
else:
if "cluster" not in json_data:
raise KhiopsJSONError("'cluster' key not found")
Expand All @@ -1224,7 +1224,7 @@ def __init__(self, json_data=None):
self.short_description = json_data.get("shortDescription", "")
self.description = json_data.get("description", "")

# Link to child clusters, None for the leafs of the hierarchy
# Link to child clusters, None for the leaves of the hierarchy
# The user must specify the CoclusteringCluster references parent_cluster
# and child_cluster that link this instance to the hierarchy
self.parent_cluster = None
Expand Down
4 changes: 2 additions & 2 deletions khiops/core/internals/filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# pylint: disable=invalid-name

# Import boto3 if available
# Delay the raising of an ImportError to an instantation of a AmazonS3Resource
# Delay an ImportError raising to an instantiation of a AmazonS3Resource
try:
import boto3
import boto3.session
Expand All @@ -30,7 +30,7 @@
boto3_import_error = import_error

# Import google.could if available
# Delay the raising of an ImportError to an instantation of a GoogleCloudStorageResource
# Delay an ImportError raising to an instantiation of a GoogleCloudStorageResource
try:
from google.cloud import storage

Expand Down
6 changes: 3 additions & 3 deletions khiops/core/internals/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def samples_dir(self):
return self._get_samples_dir()

def _get_samples_dir(self):
"""To be overriden by subclasses"""
"""To be overridden by subclasses"""
return self._samples_dir

@samples_dir.setter
Expand All @@ -386,7 +386,7 @@ def khiops_version(self):
return self._get_khiops_version()

def _get_khiops_version(self):
"""khiops_version getter to be overriden by subclasses"""
"""khiops_version getter to be overridden by subclasses"""
return self._khiops_version

def _build_status_message(self):
Expand Down Expand Up @@ -1072,7 +1072,7 @@ def _set_samples_dir(self, samples_dir):
super()._set_samples_dir(samples_dir)

def _get_samples_dir(self):
# Check the samples dir once (the check emmits only warnings)
# Check the samples dir once (the check emits only warnings)
if not self._samples_dir_checked:
_check_samples_dir(self._samples_dir)
self._samples_dir_checked = True
Expand Down
4 changes: 2 additions & 2 deletions khiops/core/internals/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _parse_template(self):
line = next(line_iter, None)
line_num += 1

# Eliminate empty lines at the beggining and end if any
# Eliminate empty lines at the beginning and end if any
if self._parsed_template[0] == "":
self._parsed_template.pop(0)
if self._parsed_template[-1] == "":
Expand Down Expand Up @@ -128,7 +128,7 @@ def _parse_section(self, section_keyword, line_iter):
if statement_re.match(statement) is None:
raise ValueError(
"Statement must contain only alphabetic characters and '.' "
f"(no '.' at the beggining): '{statement}'"
f"(no '.' at the beginning): '{statement}'"
)

return section_param_name, section_spec
Expand Down
2 changes: 1 addition & 1 deletion khiops/core/internals/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def __init__(
for kwarg_signature in self.kwargs_signature
}

# Post-intialization checks
# Post-initialization checks
# Check that the path_valued_arg_names are contained in either args or kwargs
all_arg_names = list(self._args_signature_by_name.keys()) + list(
self._kwargs_signature_by_name.keys()
Expand Down
2 changes: 1 addition & 1 deletion khiops/core/internals/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def __init__(self, version_str):
version_str,
)

# Store the rest of the prelease (if any) and check it is a number
# Store the rest of the prerelease (if any) and check it is a number
# We accept not having a "." in the pre-release increment for backward
# compatibility.
self._pre_release_increment = _pre_release.replace(
Expand Down
6 changes: 3 additions & 3 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down Expand Up @@ -1006,7 +1006,7 @@
"# Load the secondary table of the dataset into a pandas dataframe\n",
"vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down Expand Up @@ -1085,7 +1085,7 @@
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Split the secondary dataframe with the keys of the splitted root dataframe\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
Expand Down
6 changes: 3 additions & 3 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def khiops_classifier_with_hyperparameters():
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -882,7 +882,7 @@ def khiops_classifier_multitable_list():
os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train["AccidentId"].to_frame()
X_test_ids = X_test["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down Expand Up @@ -956,7 +956,7 @@ def khiops_classifier_multitable_star_file():
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the splitted root dataframe
# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
Expand Down
4 changes: 2 additions & 2 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def _check_relations_entry(main_table_name, tables_spec, relations_spec):
f"Each relation must be unique."
)

# Check hierachical keys
# Check hierarchical keys
_check_hierarchical_keys(
i,
parent_table,
Expand Down Expand Up @@ -694,7 +694,7 @@ def _init_tables_from_mapping(self, X):
)
self.secondary_tables = []

# If the relations are not specified intialize to a star schema
# If the relations are not specified initialize to a star schema
if "relations" not in X:
self.relations = [
(self.main_table.name, table.name, False)
Expand Down
10 changes: 5 additions & 5 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ class KhiopsCoclustering(ClusterMixin, KhiopsEstimator):
max_part_numbers : dict, optional
Maximum number of clusters for each of the co-clustered column. Specifically, a
key-value pair of this dictionary represents the column name and its respective
maximum number of clusters. If not specified there is no maximun number of
maximum number of clusters. If not specified there is no maximum number of
clusters is imposed on any column.
**Deprecated** will be removed in Khiops 11. Use the ``max_part_number``
parameter of the `fit` method.
Expand Down Expand Up @@ -1294,7 +1294,7 @@ def _transform_check_dataset(self, ds):
# The "model dictionary domain" in the coclustering case it is just composed
# of the secondary table. The main "keys" table is a technical object.
# So we check the compatibility against only this dictionary and override
# the parents implementaion
# the parents implementation
for dictionary in self.model_.dictionaries:
if dictionary.name != self.model_main_dictionary_name_:
_check_dictionary_compatibility(
Expand Down Expand Up @@ -1702,7 +1702,7 @@ def _transform_check_dataset(self, ds):
# Call the parent method
super()._transform_check_dataset(ds)

# Check the coherence between thi input table and the model
# Check the coherence between the input table and the model
if self.is_multitable_model_ and not ds.is_multitable:
raise ValueError(
"You are trying to apply on single-table inputs a model which has "
Expand Down Expand Up @@ -2263,7 +2263,7 @@ def predict(self, X):
y_pred = super().predict(X)

# Adjust the data type according to the original target type
# Note: String is coerced explictly because astype does not work as expected
# Note: String is coerced explicitly because astype does not work as expected
if isinstance(y_pred, pd.DataFrame):
# Transform to numpy.ndarray
y_pred = y_pred.to_numpy(copy=False).ravel()
Expand Down Expand Up @@ -2332,7 +2332,7 @@ def predict_proba(self, X):

# Create the input dataset

# Call the generic transfrom method
# Call the generic transform method
try:
ds = Dataset(X, key=self.key)
y_probas = self._transform(
Expand Down
6 changes: 3 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ def _test_evaluation_report_accessors(
with self.assertRaises(KeyError):
report.get_predictor_performance("INEXISTENT REPORT NAME")

# Test anomalous access to perfomance objects
# Test anomalous access to performance objects
for predictor_name in report.get_predictor_names():
self._test_performance_report_accessors(
result_file_name,
Expand Down Expand Up @@ -1750,7 +1750,7 @@ def test_dictionary_extract_data_paths(self):
test_resources_dir = os.path.join(resources_dir(), "dictionary")
ref_kdicj_dir = os.path.join(test_resources_dir, "ref_kdicj")

# Set the expeced outputs
# Set the expected outputs
expected_data_paths = {
"Adult": {"Adult": []},
"SpliceJunction": {
Expand Down Expand Up @@ -2482,7 +2482,7 @@ def test_scenario_generation(self):
scenario.write(writer, arguments[template_name, argument_set])
output_scenario = stream.getvalue().decode("ascii").replace("\r", "")

# Compare the output scenario and the expected fixutre
# Compare the output scenario and the expected fixture
self.assertEqual(output_scenario, expected_scenario)

def test_invalid_templates(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_dataset_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class AnotherType(object):
class DatasetSpecErrorsTests(unittest.TestCase):
"""Test the output message when the input data contains errors

Each test covers an egde-case for the initialization of Dataset/DatasetTable and
Each test covers an edge-case for the initialization of Dataset/DatasetTable and
checks:

- that either `TypeError` or `ValueError` is raised and
Expand Down Expand Up @@ -796,7 +796,7 @@ def test_dict_spec_table_relation_must_be_str(self):
)
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)

def test_dict_spec_entiy_flag_relation_must_be_bool(self):
def test_dict_spec_entity_flag_relation_must_be_bool(self):
"""Test Dataset raising TypeError when the entity flag is not boolean"""
bad_spec, y = self.create_fixture_dataset_spec()
bad_spec["relations"][0] = ("B", "D", AnotherType())
Expand Down
Loading