Kaysera · Kaysera · Feb 5, 2024 · Jun 20, 2022 · Jun 20, 2022 · Jul 6, 2022
diff --git a/.gitignore b/.gitignore
@@ -135,4 +135,7 @@ dmypy.json
 .pre-commit-config.yaml
 
 # DS files
-.DS_Store
+.DS_Store
+
+flore-experiments/
+knowledge-extraction-experiments/
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,5 +1,5 @@
 recursive-include docs *
-recursive-include teacher/datasets/data *.csv
+recursive-include src/teacher/datasets/data *.csv
 
 include LICENSE
 include MANIFEST.in

diff --git a/README.md b/README.md
@@ -52,6 +52,7 @@ For detailed instructions on how to use teacher, please refer to the [API Refere
 The following list summarizes the models and explainers currently supported
 - **Fuzzy Factuals and Counterfactuals**: Explainer obtained from a fuzzy tree that can be used for global or local explanations
 - **LORE**: Local explainer generated from a neighborhood
+- **FLARE**: Fuzzy local explainer generated from a neighborhood
 
 ## Metrics
 
@@ -73,4 +74,5 @@ The following list summarizes the metrics and scores that can be extracted from
   - Documentation <https://xai-teacher.readthedocs.io/en/latest/>
   - Experiments: <https://github.com/Kaysera/teacher-experiments>
 - LORE ([Guidotti et al., 2018](https://doi.org/10.1109/MIS.2019.2957223))
-  - Documentation and examples: <https://doi.org/10.1109/MIS.2019.2957223>
+  - Documentation and examples: <https://doi.org/10.1109/MIS.2019.2957223>
+- FLARE ([Fernandez et al., 2023 preprint](https://dsi.uclm.es/descargas/technicalreports/DIAB-24-02-1/FLARE_Tech_Rep.pdf))
diff --git a/src/teacher/datasets/__init__.py b/src/teacher/datasets/__init__.py
@@ -109,7 +109,18 @@
 # =============================================================================
 
 # Local application
-from ._base import load_german, load_adult, load_compas, load_heloc, load_beer, load_pima, load_breast
+from ._base import (load_german,
+                    load_adult,
+                    load_compas,
+                    load_heloc,
+                    load_beer,
+                    load_pima,
+                    load_breast,
+                    load_basket,
+                    load_phishing,
+                    load_flavia,
+                    load_iris,
+                    load_wine)
 
 
 # =============================================================================
@@ -120,10 +131,15 @@
 # from the module teacher.datasets
 __all__ = [
     "load_adult",
+    "load_basket",
     "load_beer",
     "load_breast",
     "load_compas",
     "load_german",
     "load_heloc",
-    "load_pima"
+    "load_pima",
+    "load_phishing",
+    "load_flavia",
+    "load_iris",
+    "load_wine"
 ]
diff --git a/src/teacher/datasets/_base.py b/src/teacher/datasets/_base.py
@@ -14,6 +14,8 @@
 # Third party
 import pandas as pd
 import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn import datasets
 
 # Local application
 from teacher.utils import recognize_features_type, set_discrete_continuous, label_encode
@@ -30,7 +32,7 @@
 # Functions
 # =============================================================================
 
-def generate_dataset(df, columns, class_name, discrete, name):
+def generate_dataset(df, columns, class_name, discrete, name, normalize=False):
     """Generate the dataset suitable for LORE usage
 
     Parameters
@@ -45,6 +47,8 @@ def generate_dataset(df, columns, class_name, discrete, name):
         List with all the columns to be considered to have discrete values
     name : str
         Name of the dataset
+    normalize : bool
+        Whether to normalize the continuous features or not
 
     Returns
     -------
@@ -63,6 +67,7 @@ def generate_dataset(df, columns, class_name, discrete, name):
             label_encoder : label encoder for the discrete values
             X : NumPy array with all the columns except for the class
             y : NumPy array with the class column
+            normalize_scaler : scaler used to normalize the continuous features
     """
     possible_outcomes = list(df[class_name].unique())
 
@@ -72,7 +77,12 @@ def generate_dataset(df, columns, class_name, discrete, name):
     columns_tmp = list(columns)
     columns_tmp.remove(class_name)
     idx_features = {i: col for i, col in enumerate(columns_tmp)}
-
+    df[continuous] += 1
+    if normalize:
+        scaler = StandardScaler()
+        df[continuous] = scaler.fit_transform(df[continuous])
+    else:
+        scaler = None
     # Dataset Preparation for Scikit Alorithms
     df_le, label_encoder = label_encode(df, discrete)
     X = df_le.loc[:, df_le.columns != class_name].values
@@ -90,14 +100,15 @@ def generate_dataset(df, columns, class_name, discrete, name):
         'continuous': continuous,
         'idx_features': idx_features,
         'label_encoder': label_encoder,
+        'normalize_scaler': scaler,
         'X': X,
         'y': y,
     }
 
     return dataset
 
 
-def load_german():
+def load_german(normalize=False):
     """
     Load and return the german credit dataset.
 
@@ -115,10 +126,10 @@ def load_german():
 
     discrete = ['installment_as_income_perc', 'present_res_since', 'credits_this_bank', 'people_under_maintenance']
 
-    return generate_dataset(df, columns, class_name, discrete, 'german_credit')
+    return generate_dataset(df, columns, class_name, discrete, 'german_credit', normalize)
 
 
-def load_adult():
+def load_adult(normalize=False):
     """
     Load and return the adult dataset.
 
@@ -145,10 +156,10 @@ def load_adult():
     class_name = 'class'
 
     discrete = []
-    return generate_dataset(df, columns, class_name, discrete, 'adult')
+    return generate_dataset(df, columns, class_name, discrete, 'adult', normalize)
 
 
-def load_compas():
+def load_compas(normalize=False):
     """
     Load and return the COMPAS scores dataset.
 
@@ -196,10 +207,10 @@ def get_class(x):
     class_name = 'class'
     discrete = ['is_recid', 'is_violent_recid', 'two_year_recid']
 
-    return generate_dataset(df, columns, class_name, discrete, 'compas-scores-two-years')
+    return generate_dataset(df, columns, class_name, discrete, 'compas-scores-two-years', normalize)
 
 
-def load_heloc():
+def load_heloc(normalize=False):
     """
     Load and return the HELOC dataset.
 
@@ -215,10 +226,10 @@ def load_heloc():
     class_name = 'RiskPerformance'
 
     discrete = []
-    return generate_dataset(df, columns, class_name, discrete, 'heloc_dataset_v1')
+    return generate_dataset(df, columns, class_name, discrete, 'heloc_dataset_v1', normalize)
 
 
-def load_beer():
+def load_beer(normalize=False):
     """
     Load and return the beer dataset.
 
@@ -238,10 +249,10 @@ def load_beer():
 
     discrete = []
     columns = df.columns
-    return generate_dataset(df, columns, class_name, discrete, 'beer')
+    return generate_dataset(df, columns, class_name, discrete, 'beer', normalize)
 
 
-def load_pima():
+def load_pima(normalize=False):
     """
     Load and return the pima indians dataset.
 
@@ -261,10 +272,117 @@ def load_pima():
 
     discrete = []
     columns = df.columns
-    return generate_dataset(df, columns, class_name, discrete, 'pima')
+    return generate_dataset(df, columns, class_name, discrete, 'pima', normalize)
+
 
+def load_flavia(normalize=False):
+    """
+    Read the FLAVIA csv file and build the dataset dictionary from it.
 
-def load_breast():
+    Parameters
+    ----------
+    normalize : bool
+        Whether to normalize the continuous features or not
+
+    Returns
+    -------
+    dataset : dict
+    """
+    class_name = 'Class'
+    raw = pd.read_csv(MODULE_PATH + '/data/FLAVIA3.csv', delimiter=',')
+
+    # Move the class column to the front, keeping the other columns in order
+    features = list(raw.columns)
+    features.remove(class_name)
+    df = raw[[class_name] + features]
+    return generate_dataset(df, df.columns, class_name, [], 'flavia', normalize)
+
+
+def load_phishing(normalize=False):
+    """
+    Read the phishing csv file and build the dataset dictionary from it.
+
+    Parameters
+    ----------
+    normalize : bool
+        Whether to normalize the continuous features or not
+
+    Returns
+    -------
+    dataset : dict
+    """
+    class_name = 'CLASS_LABEL'
+    # Columns removed from the raw file before the dataset is generated
+    dropped = ('id', 'PctExtResourceUrls', 'PctNullSelfRedirectHyperlinks',
+               'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT',
+               'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
+               'PctExtNullSelfRedirectHyperlinksRT')
+
+    raw = pd.read_csv(MODULE_PATH + '/data/phishing.csv', delimiter=',')
+    for col in dropped:
+        del raw[col]
+
+    # Move the class column to the front, keeping the other columns in order
+    features = list(raw.columns)
+    features.remove(class_name)
+    df = raw[[class_name] + features]
+
+    return generate_dataset(df, df.columns, class_name, [], 'phishing', normalize)
+
+
+def load_iris(normalize=False):
+    """
+    Fetch the iris data from scikit-learn and build the dataset dictionary.
+
+    Parameters
+    ----------
+    normalize : bool
+        Whether to normalize the continuous features or not
+
+    Returns
+    -------
+    dataset : dict
+    """
+    raw = datasets.load_iris(as_frame=True).frame
+
+    # The last column of the frame is used as the class
+    class_name = raw.columns[-1]
+
+    # Move the class column to the front, keeping the other columns in order
+    features = list(raw.columns)
+    features.remove(class_name)
+    df = raw[[class_name] + features]
+
+    return generate_dataset(df, df.columns, class_name, [], 'iris', normalize)
+
+
+def load_wine(normalize=False):
+    """
+    Fetch the wine data from scikit-learn and build the dataset dictionary.
+
+    Parameters
+    ----------
+    normalize : bool
+        Whether to normalize the continuous features or not
+
+    Returns
+    -------
+    dataset : dict
+    """
+    raw = datasets.load_wine(as_frame=True).frame
+
+    # The last column of the frame is used as the class
+    class_name = raw.columns[-1]
+
+    # Move the class column to the front, keeping the other columns in order
+    features = list(raw.columns)
+    features.remove(class_name)
+    df = raw[[class_name] + features]
+
+    return generate_dataset(df, df.columns, class_name, [], 'wine', normalize)
+
+
+def load_breast(normalize=False):
     """
     Load and return the breast cancer dataset.
 
@@ -281,4 +399,30 @@ def load_breast():
     class_name = 'diagnosis'
 
     discrete = []
-    return generate_dataset(df, columns, class_name, discrete, 'breast')
+    return generate_dataset(df, columns, class_name, discrete, 'breast', normalize)
+
+
+def load_basket(normalize=False, reduced=False):
+    """
+    Load and return the basket dataset.
+
+    Parameters
+    ----------
+    normalize : bool
+        Whether to normalize the continuous features or not
+    reduced : bool
+        Whether to read small_basket.csv instead of basket.csv
+
+    Returns
+    -------
+    dataset : dict
+    """
+    file_name = 'small_basket.csv' if reduced else 'basket.csv'
+    df = pd.read_csv(MODULE_PATH + '/data/' + file_name, delimiter=',')
+
+    # Class column first; read columns AFTER the reorder, like the sibling loaders
+    class_name = 'Position'
+    df_cols = list(df.columns)
+    df_cols.remove(class_name)
+    df = df[[class_name] + df_cols]
+    return generate_dataset(df, df.columns, class_name, [], 'basket', normalize)