# Example

https://medium.com/@ycouble/explaining-spacy-models-with-shap-1ec55c33bd65

In [19]:
import spacy

textcat_spacy = spacy.load(".\model-best\model-best")
tokenizer_spacy = spacy.tokenizer.Tokenizer(textcat_spacy.vocab)
classes = list(textcat_spacy.get_pipe("textcat").labels)

# Define a function to predict
def predict(texts):

    # convert texts to bare strings
    texts = [str(text) for text in texts]

    results = []
    for doc in textcat_spacy.pipe(texts):
        # results.append([{'label': cat, 'score': doc.cats[cat]} for cat in doc.cats])
        results.append([doc.cats[cat] for cat in classes])

    print(results)
    return results

# Create a function to create a transformers-like tokenizer to match shap's expectations
def tok_wrapper(text, return_offsets_mapping=False):
    doc = tokenizer_spacy(text)
    out = {"input_ids": [tok.norm for tok in doc]}
    if return_offsets_mapping:
        out["offset_mapping"] = [(tok.idx, tok.idx + len(tok)) for tok in doc]
    return out

In [14]:
import shap
# Create the Shap Explainer
# - predict is the "model" function, adapted to a transformers-like model
# - masker is the masker used by shap, which relies on a transformers-like tokenizer
# - algorithm is set to permutation, which is the one used for transformers models
# - output_names are the classes (although it is not propagated to the permutation explainer currently, which is why plots do not have the labels)
# - max_evals is set to a high number to reduce the probability of cases where the explainer fails because there are too many tokens
explainer = shap.Explainer(
    predict,
    masker=shap.maskers.Text(tok_wrapper),
    algorithm="permutation",
    output_names=classes,
    max_evals=1500,
)

In [15]:
import pandas as pd
dataset = pd.read_csv(".\data\\reddit_data.csv")["text"].tolist()
dataset[0]

'I’m looking for datasets or api source that quantifies fan base, or preferably, bettors’ sentiment regarding a team’s performance or direction. Does anyone know of an API that tracks this? For now I’m looking specifically for NBA, but am also interested in MLB, NFL, and NCAA f-ball and b-ball.'

In [20]:
shap_values = explainer(dataset[:1])
shap_values.output_names = classes

['...' '...and ...' '...MLB, ...and ...' '...MLB, NFL, and ...'
 '...but ...MLB, NFL, and ...']
['...', '...and ...', '...MLB, ...and ...', '...MLB, NFL, and ...', '...but ...MLB, NFL, and ...']
['...but ...in MLB, NFL, and ...'
 '...but ...interested in MLB, NFL, and ...'
 '...but ...also interested in MLB, NFL, and ...'
 '...but am also interested in MLB, NFL, and ...'
 '...but am also interested in MLB, NFL, and ...and ...']
['...but ...in MLB, NFL, and ...', '...but ...interested in MLB, NFL, and ...', '...but ...also interested in MLB, NFL, and ...', '...but am also interested in MLB, NFL, and ...', '...but am also interested in MLB, NFL, and ...and ...']
['...but am also interested in MLB, NFL, and ...and b-ball.'
 '...but am also interested in MLB, NFL, and NCAA ...and b-ball.'
 '...but am also interested in MLB, NFL, and NCAA f-ball and b-ball.'
 '...NBA, but am also interested in MLB, NFL, and NCAA f-ball and b-ball.'
 '...for NBA, but am also interested in MLB, NFL, and NCAA 

In [12]:
shap.plots.text(shap_values)

# Gendered Language

In [None]:
import spacy

textcat_spacy = spacy.load(".\model-best\model-best")
tokenizer_spacy = spacy.tokenizer.Tokenizer(textcat_spacy.vocab)
classes = list(textcat_spacy.get_pipe("textcat").labels)

# Define a function to predict
def predict(texts):
    # convert texts to bare strings
    texts = [str(text) for text in texts]
    results = []
    for doc in textcat_spacy.pipe(texts):
        # results.append([{'label': cat, 'score': doc.cats[cat]} for cat in doc.cats])
        results.append([doc.cats[cat] for cat in classes])
    return results

# Create a function to create a transformers-like tokenizer to match shap's expectations
def tok_wrapper(text, return_offsets_mapping=False):
    doc = tokenizer_spacy(text)
    out = {"input_ids": [tok.norm for tok in doc]}
    if return_offsets_mapping:
        out["offset_mapping"] = [(tok.idx, tok.idx + len(tok)) for tok in doc]
    return out