# pipelines.classifier

> Wrappers for different approaches to text classification, including few-shot classification (via SetFit) and scikit-learn text classification

In [None]:
# | default_exp pipelines.classifier

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export

from typing import List, Union

import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    from setfit import SetFitModel, TrainingArguments, Trainer, sample_dataset

In [None]:
# | export

import warnings
from abc import ABC, abstractmethod
# Model loaded by `FewShotClassifier` when no `model_id_or_path` is supplied
DEFAULT_SETFIT_MODEL = "sentence-transformers/paraphrase-mpnet-base-v2"
# Model loaded by `FewShotClassifier` when `use_smaller=True`
SMALL_SETFIT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Default column names for texts and labels in datasets built by `ClassifierBase`
DATASET_TEXT = "text"
DATASET_LABEL = "label"

class ClassifierBase(ABC):
    """
    Abstract base class shared by the text classifiers in this module.

    Subclasses must implement `train` and `save`. The helper methods defined
    here read `self.model` and call `self.predict`, so subclasses are expected
    to set both.
    """

    @abstractmethod
    def train(self,
              X:List[str],
              y:Union[List[int], List[str]],
              max_steps:int=50,
              num_epochs:int=10,
              batch_size:int=32,
              metric='accuracy',
              callbacks=None,
              **kwargs,
             ):
        """
        Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
        How additional keyword arguments are used is up to the subclass
        (e.g., `FewShotClassifier` forwards them to SetFit's `TrainingArguments`).

        **Args:**

        - *X*: List of texts
        - *y*: List of labels (integers or strings)
        - *max_steps*: If set to a positive number, the total number of training steps to perform. Overrides num_epochs.
        - *num_epochs*: Number of epochs to train
        - *batch_size*: Batch size
        - *metric*: metric to use
        - *callbacks*:  A list of callbacks to customize the training loop.

        **Returns:**

        - None
        """
        pass

    @abstractmethod
    def save(self, save_path:str):
        """
        Save model to specified folder path, `save_path`
        """
        pass

    def _labels(self):
        """
        Return the `labels` attribute of the underlying model,
        or an empty list if the model has no such attribute.
        """
        return self.model.labels if hasattr(self.model, 'labels') else []

    def sample_examples(self, X:list, y:list, num_samples:int=8,
                        text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Sample a dataset with `num_samples` per class

        **Args:**

        - *X*: List of texts
        - *y*: List of labels
        - *num_samples*: Number of examples to sample for each class
        - *text_key*: Column name used for the texts
        - *label_key*: Column name used for the labels

        **Returns:**

        - Tuple of (sampled texts, sampled labels)
        """
        full_dataset = self.arrays2dataset(X, y, text_key=text_key, label_key=label_key)
        sample = sample_dataset(full_dataset, label_column=label_key, num_samples=num_samples)
        # convert once and read both columns from the same dictionary
        columns = sample.to_dict()
        return columns[text_key], columns[label_key]

    def arrays2dataset(self, X:List[str], y:Union[List[int], List[str]],
                       text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Convert train or test examples to HF dataset

        **Args:**

        - *X*: List of texts
        - *y*: List of labels
        - *text_key*: Column name to store the texts under
        - *label_key*: Column name to store the labels under
        """
        from datasets import Dataset
        return Dataset.from_dict({text_key:X, label_key:y})

    def dataset2arrays(self, dataset, text_key:str=DATASET_TEXT, label_key:str=DATASET_LABEL):
        """
        Convert a Hugging Face dataset to X, y arrays

        **Args:**

        - *dataset*: Hugging Face dataset
        - *text_key*: Name of the column holding the texts
        - *label_key*: Name of the column holding the labels

        **Returns:**

        - Tuple of (texts, labels)
        """
        # honor the caller-supplied column names instead of hard-coding 'text'/'label'
        columns = dataset.to_dict()
        return columns[text_key], columns[label_key]

    def get_trainer(self):
        """
        Retrieves last trainer

        **Raises:**

        - ValueError: if no trainer has been created yet
        """
        # getattr: subclasses that never define `trainer` should get the same
        # informative error rather than an AttributeError
        trainer = getattr(self, 'trainer', None)
        if not trainer:
            raise ValueError('A trainer has not been created yet. You must first train a model on some labeled examples ' +\
                             'using the FewShotClassifier.train method.')
        return trainer

    def evaluate(self, X_eval:list, y_eval:list, print_report:bool=False, labels:List[str]=[]):
        """
        Evaluates labeled data using the trained model.
        If `print_report` is True, prints the plain-text classification report and returns it as a string.
        Otherwise, prints the results as YAML and returns them as a dictionary.

        **Args:**

        - *X_eval*: List of texts to classify
        - *y_eval*: Ground-truth labels for `X_eval`
        - *print_report*: Selects the plain-text report (True) or the dictionary of results (False)
        - *labels*: Optional display names for the classes. If empty, labels stored on the model (if any) are used.
        """
        from sklearn.metrics import classification_report

        y_pred = self.predict(X_eval)
        model_labels = self._labels()
        if model_labels:
            # translate label values into their positions in the model's label list
            # so that predictions and ground truth use the same encoding
            y_pred = [model_labels.index(y) for y in y_pred]
            if len(y_eval) > 0 and y_eval[0] in model_labels:
                y_eval = [model_labels.index(y) for y in y_eval]

        result = classification_report(y_eval, y_pred,
                                       output_dict=not print_report,
                                       target_names=self.get_labels(labels))
        if print_report:
            # `result` is a formatted string in this mode
            print(result)
            return result

        import yaml
        print(yaml.dump(result, allow_unicode=True, default_flow_style=False))
        return result

    def get_labels(self, labels:List[str]=[]):
        """
        Inspect model and return labels

        Returns `labels` if it is non-empty, otherwise the labels stored on the
        model; returns None when neither is available.
        """
        target_names = labels if labels else self._labels()
        return target_names if target_names else None



In [None]:
show_doc(ClassifierBase.arrays2dataset)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L82){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.arrays2dataset

>      ClassifierBase.arrays2dataset (X:List[str], y:Union[List[int],List[str]],
>                                     text_key:str='text',
>                                     label_key:str='label')

*Convert train or test examples to HF dataset*

In [None]:
show_doc(ClassifierBase.dataset2arrays)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L91){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.dataset2arrays

>      ClassifierBase.dataset2arrays (dataset, text_key:str='text',
>                                     label_key:str='label')

*Convert a Hugging Face dataset to X, y arrays*

In [None]:
show_doc(ClassifierBase.evaluate)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L107){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.evaluate

>      ClassifierBase.evaluate (X_eval:list, y_eval:list,
>                               print_report:bool=False, labels:List[str]=[])

*Evaluates labeled data using the trained model.
If `print_report` is True, returns the plain-text classification report.
Otherwise, prints the results as YAML and returns them as a dictionary.*

In [None]:
show_doc(ClassifierBase.get_trainer)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L97){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.get_trainer

>      ClassifierBase.get_trainer ()

*Retrieves last trainer*

In [None]:
show_doc(ClassifierBase.get_trainer)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L97){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.get_trainer

>      ClassifierBase.get_trainer ()

*Retrieves last trainer*

In [None]:
show_doc(ClassifierBase.sample_examples)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L70){target="_blank" style="float:right; font-size:smaller"}

### ClassifierBase.sample_examples

>      ClassifierBase.sample_examples (X:list, y:list, num_samples:int=8,
>                                      text_key:str='text',
>                                      label_key:str='label')

*Sample a dataset with `num_samples` per class*

In [None]:
# | export

class SKClassifier(ClassifierBase):
    def __init__(
        self,
        model_path=None,
        **kwargs,
    ):
        """
        `SKClassifier` is a wrapper to scikit-learn text classification models.
        Extra kwargs are fed directly to `onprem.sk.clf.Classifier.create_model`.
        If `ctype` is not supplied, an `'sgdclassifier'` model is created and the
        following defaults are applied unless overridden:
        `clf__random_state=42`, `vec__binary=True`, `vec__ngram_range=(1,3)`,
        `vec__stop_words='english'`, `vec__max_features=100000`.

        **Args:**

        - *model_path*: path to a previously saved model. If supplied, the model is loaded and `kwargs` are not used.
        - *ctype*: the classifier type. one of {'logreg', 'sgdclassifier', 'nbsvm'}
        - *vec__ngram_range*: range of ngrams extracted from text
        - *vec__binary*: If True, non-zero word counts are set to 1 in document vectors
        """

        from onprem.sk.clf import Classifier
        self.model = Classifier()

        if model_path:
            self.model.load(model_path)
        else:
            # defaults are only applied when the caller did not choose a classifier type
            if 'ctype' not in kwargs:
                kwargs['ctype'] = 'sgdclassifier'
                kwargs.setdefault('clf__random_state', 42)
                # respect a caller-supplied `vec__binary`; the legacy `binary`
                # key still suppresses the default, as it did before
                if 'binary' not in kwargs:
                    kwargs.setdefault('vec__binary', True)
                kwargs.setdefault('vec__ngram_range', (1,3))
                kwargs.setdefault('vec__stop_words', 'english')
                kwargs.setdefault('vec__max_features', 100000)
            self.model.create_model(**kwargs)
        # expose the wrapped model's prediction methods directly on this object
        self.predict = self.model.predict
        self.predict_proba = self.model.predict_proba

    def train(self,
              X:List[str],
              y:Union[List[int], List[str]],
              **kwargs,
             ):
        """
        Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
        Additional keyword arguments are accepted but are currently ignored.

        **Args:**

        - *X*: List of texts
        - *y*: List representing labels

        **Returns:**

        - None
        """

        self.model.fit(X, y)

    def save(self, filename:str):
        """
        Save model to specified `filename` (e.g., `/tmp/mymodel.gz`).
        Model saved as pickle file.
        To reload the model, supply `model_path` when instantiating `SKClassifier`.
        """
        self.model.save(filename)
        

In [None]:
show_doc(SKClassifier.train)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L214){target="_blank" style="float:right; font-size:smaller"}

### SKClassifier.train

>      SKClassifier.train (X:List[str], y:Union[List[int],List[str]], **kwargs)

*Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
Additional keyword arguments are accepted but are currently ignored.

**Args:**

- *X*: List of texts
- *y*: List representing labels

**Returns:**

- None*

In [None]:
show_doc(SKClassifier.save)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L236){target="_blank" style="float:right; font-size:smaller"}

### SKClassifier.save

>      SKClassifier.save (filename:str)

*Save model to specified `filename` (e.g., `/tmp/mymodel.gz`).
Model saved as pickle file.
To reload the model, supply `model_path` when instantiating `SKClassifier`.*

In [None]:
# | notest

from sklearn.datasets import fetch_20newsgroups

# Restrict 20 Newsgroups to four topics to keep the example small.
categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]

train_b = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset="test", categories=categories, shuffle=True, random_state=42)
x_train, y_train = train_b.data, train_b.target
x_test, y_test = test_b.data, test_b.target
classes = train_b.target_names

# Fit the default scikit-learn classifier on the training split.
clf = SKClassifier()
clf.train(x_train, y_train)

# Sanity check: this document is expected to be assigned class index 3.
test_doc = "god christ jesus mother mary church sunday lord heaven amen"
assert 3 == clf.predict(test_doc)

# Score the model on the held-out split (displayed as the cell output).
clf.model.evaluate(x_test, y_test)

0.8948069241011984

In [None]:
# | notest

clf.save('/tmp/mymodel.gz') # save

In [None]:
# | notest

clf = SKClassifier(model_path='/tmp/mymodel.gz') # reload
clf.model.evaluate(x_test, y_test)

0.8948069241011984

In [None]:
# | export

class FewShotClassifier(ClassifierBase):
    def __init__(
        self,
        model_id_or_path:str=DEFAULT_SETFIT_MODEL,
        use_smaller:bool=False,
        **kwargs,
    ):
        """
        `FewShotClassifier` can be used to train and run text classifiers. Currently based on SetFit.
        Additional keyword arguments are fed directly to `SetFitModel.from_pretrained`.

        **Args:**

        - *model_id_or_path*: The Hugging Face model_id or path to model folder (e.g., path to a previously trained and saved model).
        - *use_smaller*:  If True, `SMALL_SETFIT_MODEL` is loaded instead of `model_id_or_path`.
                          A warning is issued if a non-default `model_id_or_path` was also supplied.

        """
        if use_smaller and model_id_or_path != DEFAULT_SETFIT_MODEL:
            warnings.warn(f'Over-writing supplied model ({model_id_or_path}) with {SMALL_SETFIT_MODEL} because use_smaller=True.')
        self.model_id_or_path = SMALL_SETFIT_MODEL if use_smaller else model_id_or_path
        self.model = SetFitModel.from_pretrained(self.model_id_or_path, **kwargs)
        # expose the wrapped model's prediction methods directly on this object
        self.predict = self.model.predict
        self.predict_proba = self.model.predict_proba
        self.trainer = None # set in `FewShotClassifier.train`
        self.labels = [] # set in `FewShotClassifier.train`

    def train(self,
              X:List[str],
              y:Union[List[int], List[str]],
              num_epochs:int=10,
              batch_size:int=32,
              metric='accuracy',
              callbacks=None,
              **kwargs,
             ):
        """
        Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
        Additional keyword arguments (e.g., `max_steps`) are passed directly to
        `SetFit.TrainingArguments`.

        **Args:**

        - *X*: List of texts
        - *y*: List of labels (integers or strings)
        - *num_epochs*: Number of epochs to train
        - *batch_size*: Batch size
        - *metric*: metric to use
        - *callbacks*:  A list of callbacks to customize the training loop.

        **Returns:**

        - None
        """

        # convert to HF dataset
        train_dataset = self.arrays2dataset(X, y, text_key=DATASET_TEXT, label_key=DATASET_LABEL)

        args = TrainingArguments(
                batch_size=batch_size,
                num_epochs=num_epochs,
                **kwargs
        )

        trainer = Trainer(
                    model=self.model,
                    args=args,
                    metric=metric,
                    callbacks=callbacks,
                    train_dataset=train_dataset,
                    # map our dataset columns onto the "text"/"label" names given to the trainer
                    column_mapping={DATASET_TEXT: "text", DATASET_LABEL: "label"}
        )
        trainer.train()

        # only keep the trainer once training has completed without raising
        self.trainer = trainer

    def save(self, save_path:str):
        """
        Save model to specified folder path, `save_path`.
        To reload the model, supply path in `model_id_or_path` argument when
        instantiating `FewShotClassifier`.

        """
        self.model.save_pretrained(save_path)

    def explain(self, X:list, labels:List[str]=[]):
        """
        Explain the predictions on given examples in `X`. (Requires `shap` and `matplotlib` to be installed.)

        **Args:**

        - *X*: List of texts to explain
        - *labels*: Optional display names for the classes. If empty, labels stored on the model (if any) are used.

        **Raises:**

        - ImportError: if `shap` or `matplotlib` is not installed
        """
        try:
            import shap
        except ImportError:
            raise ImportError('Please install the shap library: pip install shap')

        try:
            # not used directly here; imported only to verify that it is installed
            import matplotlib
        except ImportError:
            raise ImportError('Please install the matplotlib library: pip install matplotlib')

        def predict_fn(x):
            return self.predict_proba(x)

        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.model_id_or_path)

        output_names = self.get_labels(labels)
        explainer = shap.Explainer(predict_fn, tokenizer, output_names=output_names)
        shap_values = explainer(X)
        shap.plots.text(shap_values)
          

In [None]:
show_doc(FewShotClassifier.save)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L323){target="_blank" style="float:right; font-size:smaller"}

### FewShotClassifier.save

>      FewShotClassifier.save (save_path:str)

*Save model to specified folder path, `save_path`.
To reload the model, supply path in `model_id_or_path` argument when
instantiating `FewShotClassifier`.*

In [None]:
show_doc(FewShotClassifier.train)

---

[source](https://github.com/amaiya/onprem/blob/master/onprem/pipelines/classifier.py#L274){target="_blank" style="float:right; font-size:smaller"}

### FewShotClassifier.train

>      FewShotClassifier.train (X:List[str], y:Union[List[int],List[str]],
>                               num_epochs:int=10, batch_size:int=32,
>                               metric='accuracy', callbacks=None, **kwargs)

*Trains the classifier on a list of texts (`X`) and a list of labels (`y`).
Additional keyword arguments are passed directly to `SetFit.TrainingArguments`

**Args:**

- *X*: List of texts
- *y*: List of integers representing labels
- *num_epochs*: Number of epochs to train
- *batch_size*: Batch size
- *metric*: metric to use
- *callbacks*:  A list of callbacks to customize the training loop.

**Returns:**

- None*

In [None]:
show_doc(FewShotClassifier.explain)

---

### FewShotClassifier.explain

>      FewShotClassifier.explain (X:list, labels:List[str]=[])

*Explain the predictions on given examples in `X`. (Requires `shap` and `matplotlib` to be installed.)*

In [None]:
# | notest

clf = FewShotClassifier()

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# | notest

from datasets import load_dataset

In [None]:
# | notest
# Download the SetFit/sst2 dataset and unpack the train/test splits into X, y arrays.
dataset = load_dataset("SetFit/sst2")
X_train, y_train = clf.dataset2arrays(
    dataset["train"], text_key="text", label_key="label"
)
X_test, y_test = clf.dataset2arrays(
    dataset["test"], text_key="text", label_key="label"
)
# Few-shot setting: keep only 8 labeled examples per class for training.
X_sample, y_sample = clf.sample_examples(
    X_train, y_train, label_key="label", num_samples=8
)

README.md:   0%|          | 0.00/378 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

dev.jsonl:   0%|          | 0.00/136k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/281k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6920 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# | notest
clf.train(X_sample,  y_sample, max_steps=50)

Applying column mapping to the training dataset


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 144
  Batch size = 32
  Num epochs = 10


Step,Training Loss
1,0.2427
50,0.0473


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [None]:
# | notest
results = clf.evaluate(X_test, y_test, labels=['negative', 'positive'])

accuracy: 0.9066447007138935
macro avg:
  f1-score: 0.9065398270698484
  precision: 0.9083346457169854
  recall: 0.9065919749869724
  support: 1821.0
negative:
  f1-score: 0.9096705632306057
  precision: 0.8824742268041237
  recall: 0.9385964912280702
  support: 912.0
positive:
  f1-score: 0.9034090909090909
  precision: 0.9341950646298472
  recall: 0.8745874587458746
  support: 909.0
weighted avg:
  f1-score: 0.9065449847900473
  precision: 0.9082920420614453
  recall: 0.9066447007138935
  support: 1821.0



In [None]:
# | notest
new_data = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]

In [None]:
# | notest
preds = clf.predict(new_data)
preds

tensor([1, 0])

In [None]:
# | notest
preds = clf.predict_proba(new_data)
preds

tensor([[0.1657, 0.8343],
        [0.8551, 0.1449]], dtype=torch.float64)

In [None]:
# | notest
clf.save('/tmp/my_fewshot_model')

In [None]:
# | notest
clf = FewShotClassifier('/tmp/my_fewshot_model')
preds = clf.predict(new_data)
preds

tensor([1, 0])

In [None]:
# | notest
clf.explain(new_data, labels=['negative', 'positive'])

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()