Move to sklearn 0.20's OneHotEncoder and fix README results
MaxHalford committed Oct 11, 2018
1 parent 5734562 commit fb9eff1
Showing 6 changed files with 47 additions and 165 deletions.
53 changes: 32 additions & 21 deletions README.md
@@ -29,9 +29,20 @@

<br/>

## Introduction
Prince is a library for doing [factor analysis](https://www.wikiwand.com/en/Factor_analysis). This includes a variety of methods including [principal component analysis (PCA)](https://www.wikiwand.com/en/Principal_component_analysis) and [correspondence analysis (CA)](https://www.wikiwand.com/en/Correspondence_analysis). The goal is to provide an efficient implementation for each algorithm along with a scikit-learn API.

Prince is a library for doing [factor analysis](https://www.wikiwand.com/en/Factor_analysis). This includes a variety of methods including [principal component analysis (PCA)](https://www.wikiwand.com/en/Principal_component_analysis) and [correspondence analysis (CA)](https://www.wikiwand.com/en/Correspondence_analysis). The goal is to provide an efficient implementation for each algorithm along with a nice API.
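
A minimal sketch of what that API looks like; loading the iris dataset through scikit-learn is just a convenient assumption here, any numeric `pandas.DataFrame` works:

```python
>>> import pandas as pd
>>> import prince
>>> from sklearn import datasets

>>> # Assemble a DataFrame of numeric variables to decompose
>>> iris = datasets.load_iris()
>>> X = pd.DataFrame(iris.data, columns=iris.feature_names)

>>> # fit() behaves like any scikit-learn estimator and returns the estimator itself
>>> pca = prince.PCA(n_components=2)
>>> pca = pca.fit(X)

```
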
## Table of contents

- [Installation](#installation)
- [Usage](#usage)
- [Guidelines](#guidelines)
- [Principal component analysis (PCA)](#principal-component-analysis-pca)
- [Correspondence analysis (CA)](#correspondence-analysis-ca)
- [Multiple correspondence analysis (MCA)](#multiple-correspondence-analysis-mca)
- [Multiple factor analysis (MFA)](#multiple-factor-analysis-mfa)
- [Factor analysis of mixed data (FAMD)](#factor-analysis-of-mixed-data-famd)
- [Going faster](#going-faster)
- [License](#license)

## Installation

@@ -137,11 +148,11 @@ Once the `PCA` has been fitted, it can be used to extract the row principal coordinates
```python
>>> pca.transform(X).head() # Same as pca.row_coordinates(X).head()
0 1
0 -2.264542 0.505704
1 -2.086426 -0.655405
2 -2.367950 -0.318477
3 -2.304197 -0.575368
4 -2.388777 0.674767
0 -2.264703 0.480027
1 -2.080961 -0.674134
2 -2.364229 -0.341908
3 -2.299384 -0.597395
4 -2.389842 0.646835

```
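
The comment above notes that `transform` is an alias for `row_coordinates`. Assuming the two really share the same code path, the equivalence can be checked directly with the fitted `pca` and `X` from these examples, a quick sketch:

```python
>>> (pca.transform(X) == pca.row_coordinates(X)).all().all()
True

```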

@@ -172,21 +183,21 @@ Each principal component explains part of the underlying distribution.

```python
>>> pca.explained_inertia_ # doctest: +ELLIPSIS
[0.727704..., 0.230305...]
[0.729624..., 0.228507...]

```

The explained inertia represents the percentage of the total inertia each principal component contributes. It sums to 1 if the `n_components` property is equal to the number of columns in the original dataset. The explained inertia is obtained by dividing the eigenvalues obtained with the SVD by the total inertia, both of which are also accessible.

```python
>>> pca.eigenvalues_ # doctest: +ELLIPSIS
[436.622712..., 138.183139...]
[437.774672..., 137.104570...]

>>> pca.total_inertia_
600.0
>>> pca.total_inertia_ # doctest: +ELLIPSIS
600.0...

>>> pca.explained_inertia_
[0.727704..., 0.230305...]
[0.729624..., 0.228507...]

```
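
To make the relationship explicit, the explained inertia can be recomputed by hand; a small sketch assuming the fitted `pca` from above:

```python
>>> [eig / pca.total_inertia_ for eig in pca.eigenvalues_]  # doctest: +ELLIPSIS
[0.729624..., 0.228507...]

```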

@@ -195,10 +206,10 @@ You can also obtain the correlations between the original variables and the principal components.
```python
>>> pca.column_correlations(X)
0 1
Petal length 0.991684 0.020247
Petal width 0.964996 0.062786
Sepal length 0.891224 0.357352
Sepal width -0.449313 0.888351
Petal length 0.991555 0.023415
Petal width 0.964979 0.064000
Sepal length 0.890169 0.360830
Sepal width -0.460143 0.882716

```

@@ -207,11 +218,11 @@ You may also want to know how much each observation contributes to each principal component.
```python
>>> pca.row_contributions(X).head()
0 1
0 0.011745 0.001851
1 0.009970 0.003109
2 0.012842 0.000734
3 0.012160 0.002396
4 0.013069 0.003295
0 0.011716 0.001681
1 0.009892 0.003315
2 0.012768 0.000853
3 0.012077 0.002603
4 0.013046 0.003052

```

Binary file modified images/pca_row_coordinates.png
1 change: 1 addition & 0 deletions prince/mca.py
@@ -2,6 +2,7 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import utils

from . import ca
146 changes: 8 additions & 138 deletions prince/one_hot.py
@@ -1,150 +1,18 @@
"""This module contains a custom one-hot encoder. It inherits from sklearn's
OneHotEncoder and returns a pandas DataFrame with appropriate column names.
OneHotEncoder and returns a pandas.SparseDataFrame with appropriate column
names and index values.
"""
import itertools

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import preprocessing
from sklearn import base
from sklearn import utils


class CategoricalEncoder(base.BaseEstimator, base.TransformerMixin):

def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
handle_unknown='error'):
self.encoding = encoding
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown

def fit(self, X, y=None):
"""Fit the CategoricalEncoder to X.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to determine the categories of each feature.
Returns
-------
self
"""
if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
template = ("encoding should be either 'onehot', 'onehot-dense' "
"or 'ordinal', got %s")
raise ValueError(template % self.encoding)

if self.handle_unknown not in ['error', 'ignore']:
template = ("handle_unknown should be either 'error' or "
"'ignore', got %s")
raise ValueError(template % self.handle_unknown)

if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
raise ValueError("handle_unknown='ignore' is not supported for"
" encoding='ordinal'")

if self.categories != 'auto':
for cats in self.categories:
if not np.all(np.sort(cats) == np.array(cats)):
raise ValueError("Unsorted categories are not yet "
"supported")

X_temp = utils.check_array(X, dtype=None)
if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
X = utils.check_array(X, dtype=np.object)
else:
X = X_temp

n_samples, n_features = X.shape

self._label_encoders_ = [preprocessing.LabelEncoder() for _ in range(n_features)]

for i in range(n_features):
le = self._label_encoders_[i]
Xi = X[:, i]
if self.categories == 'auto':
le.fit(Xi)
else:
if self.handle_unknown == 'error':
valid_mask = np.in1d(Xi, self.categories[i])
if not np.all(valid_mask):
diff = np.unique(Xi[~valid_mask])
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
le.classes_ = np.array(self.categories[i])

self.categories_ = [le.classes_ for le in self._label_encoders_]

return self

def transform(self, X):
"""Transform X using specified encoding scheme.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
Returns
-------
X_out : sparse matrix or a 2-d array
Transformed input.
"""
X_temp = utils.check_array(X, dtype=None)
if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
X = utils.check_array(X, dtype=np.object)
else:
X = X_temp

n_samples, n_features = X.shape
X_int = np.zeros_like(X, dtype=np.int)
X_mask = np.ones_like(X, dtype=np.bool)

for i in range(n_features):
Xi = X[:, i]
valid_mask = np.in1d(Xi, self.categories_[i])

if not np.all(valid_mask):
if self.handle_unknown == 'error':
diff = np.unique(X[~valid_mask, i])
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
# Set the problematic rows to an acceptable value and
# continue. The rows are marked in `X_mask` and will be
# removed later.
X_mask[:, i] = valid_mask
Xi = Xi.copy()
Xi[~valid_mask] = self.categories_[i][0]
X_int[:, i] = self._label_encoders_[i].transform(Xi)

if self.encoding == 'ordinal':
return X_int.astype(self.dtype, copy=False)

mask = X_mask.ravel()
n_values = [cats.shape[0] for cats in self.categories_]
n_values = np.array([0] + n_values)
feature_indices = np.cumsum(n_values)

indices = (X_int + feature_indices[:-1]).ravel()[mask]
indptr = X_mask.sum(axis=1).cumsum()
indptr = np.insert(indptr, 0, 0)
data = np.ones(n_samples * n_features)[mask]

out = sparse.csr_matrix((data, indices, indptr),
shape=(n_samples, feature_indices[-1]),
dtype=self.dtype)
if self.encoding == 'onehot-dense':
return out.toarray()
else:
return out


class OneHotEncoder(CategoricalEncoder):
class OneHotEncoder(preprocessing.OneHotEncoder):

def __init__(self):
super().__init__(encoding='onehot-dense')
super().__init__(sparse=True, dtype=np.uint8)

def fit(self, X, y=None):

@@ -159,11 +27,13 @@ def fit(self, X, y=None):
]
for i, col in enumerate(X.columns)
]))

return self

def transform(self, X):
return pd.DataFrame(
return pd.SparseDataFrame(
data=super().transform(X),
columns=self.column_names_,
index=X.index if isinstance(X, pd.DataFrame) else None
index=X.index if isinstance(X, pd.DataFrame) else None,
default_fill_value=0
)
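
For context, a hypothetical usage sketch of the rewritten encoder; the toy DataFrame and the resulting shape are illustrative assumptions, not taken from the project's tests:

```python
>>> import pandas as pd
>>> from prince import one_hot

>>> X = pd.DataFrame({'colour': ['red', 'blue', 'red'],
...                   'size': ['S', 'M', 'S']})
>>> encoder = one_hot.OneHotEncoder().fit(X)
>>> encoded = encoder.transform(X)  # a pandas.SparseDataFrame, one indicator column per category
>>> encoded.shape                   # 2 colours + 2 sizes
(3, 4)

```
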
8 changes: 4 additions & 4 deletions requirements.dev.txt
@@ -1,10 +1,10 @@
coveralls>=1.3.0
nose>=1.3.7
pytest>=3.5.0
pytest-cov>=2.5.1
pytest>=3.5.1
pytest-cov>=2.6.0
fbpca>=1.0
matplotlib>=2.2.2
numpy>=1.14.0
pandas>=0.22.0
pandas>=0.23.0
scipy>=1.0.1
scikit-learn>=0.19.1
scikit-learn>=0.20.0
4 changes: 2 additions & 2 deletions setup.py
@@ -25,9 +25,9 @@
REQUIRED = [
'matplotlib>=2.2.2',
'numpy>=1.14.0',
'pandas>=0.22.0',
'pandas>=0.23.0',
'scipy>=1.0.1',
'scikit-learn>=0.19.1'
'scikit-learn>=0.20.0'
]

# The rest you shouldn't have to touch too much :)
