In [2]:
%%capture
!pip install datasets
import datasets
import pandas as pd
import seaborn as sns
import transformers
!pip install shap
import shap
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install lime
import lime
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt


# Interpretability

So far we have used various models as black boxes, where an input goes in and an output goes out.

Interpretability allows us to understand what happens inside this black box.

<img src='https://assets.spe.org/dims4/default/f6b3d58/2147483647/strip/true/crop/696x392+0+0/resize/800x451!/quality/90/?url=http%3A%2F%2Fspe-brightspot.s3.us-east-2.amazonaws.com%2Fae%2F27%2F97e3d6195614a45bed36e7a965e2%2Fblackbox.jpg' width="500">

## Content

The goal of this walkthrough is to introduce explainability modules for Machine Learning models. Those allow us to better retrace the steps taken by the computer.


## Background <a class="anchor" id="Background"></a>

### Objective

AI is often perceived as a black box. This erodes trust and makes it harder for some industries to adopt it.

To mitigate this, XAI (explainable AI) models are being developed that allow us to retrace the steps performed by the computer.

### Examples of XAI models
User interface examples:


*   [Language level](http://deeptext.unice.fr/FLE/)
*   [Emotion classification](https://shap.readthedocs.io/en/latest/example_notebooks/text_examples/sentiment_analysis/Emotion%20classification%20multiclass%20example.html)



Various XAI libraries exist, the two most popular ones being `shap` and `lime`.


## SHAP (SHapley Additive exPlanations)

**SHAP** is a popular library used for interpreting the output of machine learning models. It is based on the concept of Shapley values from cooperative game theory, which allocate the contribution of each feature to the model's predictions.

### How SHAP Works:
1. **Feature Contribution**: For each prediction, SHAP computes the contribution of each feature to the prediction by considering all possible feature combinations.
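2. **Additive Feature Attribution**: The base value (the model's average output) plus the sum of the SHAP values of all features equals the model's prediction for that instance, so the explanation is always consistent with the model's output.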

### Example of a sentiment analysis model
Let's start with an example of `shap` over a `Sentiment Analysis` model.

For this example we will use a pretrained BERT model, and we will try to identify how each feature affects the final prediction.

In [3]:
%%capture
# load the dataset
dataset = datasets.load_dataset("emotion", split="train")
data = pd.DataFrame({"text": dataset["text"], "emotion": dataset["label"]})

In [4]:
%%capture
# load the model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "nateraw/bert-base-uncased-emotion", use_fast=True
)

model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "nateraw/bert-base-uncased-emotion"
).cuda() # if this step fails, relaunch your google colab with a T4 runtime, ensuring cuda support

# build a pipeline object to do predictions
pred = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0,
    top_k=None,
)

In [5]:
# Wrap the prediction pipeline in a SHAP explainer so that its predictions
# can be explained in the cells below.
shap_explainer = shap.Explainer(model=pred)

In [None]:
# Explain one example sentence and render the SHAP text plot for it
example_sentences = [
    "I am happy to be following the DSML course, although it is sad it is almost over"
]
shap_values = shap_explainer(example_sentences)
shap.plots.text(shap_values)

### Your turn!

Plot a `shap` barplot that represents the weights of each word in a sentence for different emotions. You can use the documentation available [here](https://shap.readthedocs.io/en/latest/example_notebooks/text_examples/sentiment_analysis/Emotion%20classification%20multiclass%20example.html).

In [None]:
# Your code here

## LIME
We will now build a similar explanation using `lime` on YouTube spam classification. To do so, we will train a model on the labeled comments of videos from PSY, Katy Perry, LMFAO, Eminem and Shakira, and then predict whether a given comment is spam or not.



### Classification
Below, we will try to identify the importance of each factor in the computer's decision making.

We will apply it on two models, a `Random Forest Classifier` and a `Decision Tree`

In [20]:
# All five YouTube spam CSVs live in the same folder of the course repository,
# so the shared prefix is written once instead of being repeated per file.
YOUTUBE_DATA_URL = (
    "https://raw.githubusercontent.com/michalis0/DataScience_and_MachineLearning/"
    "master/13-interpretability-for-ai/data/"
)

# One DataFrame of labeled comments per video
psy = pd.read_csv(YOUTUBE_DATA_URL + "Youtube01-Psy.csv")
perry = pd.read_csv(YOUTUBE_DATA_URL + "Youtube02-KatyPerry.csv")
LMFAO = pd.read_csv(YOUTUBE_DATA_URL + "Youtube03-LMFAO.csv")
Eminem = pd.read_csv(YOUTUBE_DATA_URL + "Youtube04-Eminem.csv")
Shakira = pd.read_csv(YOUTUBE_DATA_URL + "Youtube05-Shakira.csv")

# Stack the frames in the same order as before (the row order feeds the
# seeded train/test split further down). ignore_index=True renumbers the rows
# 0..n-1 directly, replacing the reset_index().drop(columns=['index']) detour.
full_train = pd.concat([psy, perry, LMFAO, Shakira, Eminem], ignore_index=True)

In [None]:
# Preview the first three rows of the combined comments table
display(full_train.iloc[:3])

In [None]:
# Load useful libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Raw comment text and its label column
X = full_train['CONTENT']
y = full_train['CLASS']

# Using default tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words="english")

# Learn the vocabulary dictionary and return document-term matrix
# NOTE(review): the vectorizer is fitted on every comment before the split
# below, so the IDF weights also see the test comments. Acceptable for this
# demo; fit on the training rows only for an unbiased evaluation.
features = tfidf.fit_transform(X)

# Visualize result in dataframe: one column per vocabulary term.
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
X = pd.DataFrame(
    features.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# Create Train & Test Data (stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X.fillna(0),
    y,
    test_size=0.3,
    stratify=y,
    random_state=13,
)

# Build the model; the seed makes the forest, and therefore the report and
# the LIME explanation below, reproducible across runs.
rf_clf = RandomForestClassifier(
    max_features=2,
    n_estimators=100,
    bootstrap=True,
    random_state=13,
)
rf_clf.fit(X_train, y_train)


def predict_fn_clf(x):
    """Return the forest's class probabilities for `x` as a float array."""
    return rf_clf.predict_proba(x).astype(float)


# Make prediction on the testing data
y_pred = rf_clf.predict(X_test)

# Classification Report: the signature is (y_true, y_pred). Passing them the
# other way round swaps precision and recall in the printed table.
print(classification_report(y_test, y_pred))

In [23]:
# Import the LimeTabularExplainer module
from lime.lime_tabular import LimeTabularExplainer

# Display names for the two classes and for every column of the training frame
class_names = ['Not spam', 'Spam']
feature_names = X_train.columns.tolist()

# Build the explainer from the training data
lime_explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=feature_names,
    class_names=class_names,
    mode='classification',
)



In [None]:
# Take the first row of the test set and explain the forest's prediction for
# it, showing the 10 most influential features.
choosen_instance = X_test.to_numpy()[0]
exp = lime_explainer.explain_instance(
    data_row=choosen_instance,
    predict_fn=predict_fn_clf,
    num_features=10,
)
exp.show_in_notebook(show_all=False)


### Second example: Logistic regression

In the context of text classification, **sincere** and **insincere** often refer to whether a question (or text) is genuine and constructive or malicious, sarcastic, or disruptive. This distinction is commonly used in datasets like Quora Insincere Questions Classification or similar platforms where user-generated content is evaluated for quality or intent.



For this example we will analyse different tags on Quora posts


In [None]:
# The Quora data is stored as three CSV parts in the course repository
file_paths = [
    "https://raw.githubusercontent.com/michalis0/DataScience_and_MachineLearning/master/13-interpretability-for-ai/data/Quora_pt1.csv",
    "https://raw.githubusercontent.com/michalis0/DataScience_and_MachineLearning/master/13-interpretability-for-ai/data/Quora_pt2.csv",
    "https://raw.githubusercontent.com/michalis0/DataScience_and_MachineLearning/master/13-interpretability-for-ai/data/Quora_pt3.csv",
]

# Read each part, stack them under a fresh 0..n-1 index, then discard any
# row that contains a missing value.
train_df = pd.concat(
    (pd.read_csv(part_url) for part_url in file_paths),
    ignore_index=True,
)
train_df = train_df.dropna()

# Report the size of the table and show its first rows
print("Train shape : ", train_df.shape)
display(train_df.head())


In [30]:

## split to train and val (values to test)
# NOTE: this rebinds `train_df` to the 90% training part, so re-running this
# cell without re-running the loading cell splits the reduced frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

# reset_index returns a new frame, so the result has to be assigned; the bare
# call used before was a no-op. After this, index labels of `val_df` equal
# row positions (0..n-1).
val_df = val_df.reset_index(drop=True)

## vectorize to tf-idf vectors
# The vocabulary and IDF weights are learned on the training questions only;
# the validation questions are transformed with that fitted vectorizer.
tfidf_vc = TfidfVectorizer(
    min_df=10,
    max_features=100000,
    analyzer="word",
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
)
train_vc = tfidf_vc.fit_transform(train_df["question_text"])
val_vc = tfidf_vc.transform(val_df["question_text"])

In [None]:
## Training our model
from sklearn.linear_model import LogisticRegression

# The "sag" solver shuffles the data, so it is seeded (same seed as the
# train/val split) to make the accuracy and the LIME probabilities below
# reproducible across runs.
# NOTE: this rebinds the global `model`, which earlier held the BERT classifier.
model = LogisticRegression(C=0.5, solver="sag", random_state=2018)
model.fit(train_vc, train_df.target)
val_pred = model.predict(val_vc)

## checking the accuracy of the model
from sklearn.metrics import accuracy_score
print("accuracy score = {}".format(accuracy_score(val_df.target, val_pred)))



Predictions and explaining one of the predictions:

Some good examples of divisive questions can be found at the following positions of the validation set: 15, 163, 226, 240, 306, and 979.

In [None]:
from sklearn.pipeline import make_pipeline

# Index label of the validation row at position 979
idx = val_df.index[979]

# Chain the fitted vectorizer and classifier so LIME can go from raw text to
# class probabilities in a single call.
c = make_pipeline(tfidf_vc, model)
class_names = ["sincere", "insincere"]
explainer = LimeTextExplainer(class_names=class_names)

# Explain the selected question using its 10 most influential features
question = val_df["question_text"][idx]
exp = explainer.explain_instance(question, c.predict_proba, num_features=10)

# Probabilities for this single question: column 0, then column 1
question_proba = c.predict_proba([question])[0]

print("Question: \n", question)
print("Probability (Insincere) =", question_proba[1])
print("Probability (Sincere) =", question_proba[0])

In [None]:
# List the (feature, weight) pairs of the explanation for label 1
exp.as_list(label=1)

In [None]:
# Render the explanation for label 1 together with the question text
exp.show_in_notebook(
    labels=(1,),
    text=val_df["question_text"].loc[idx],
)

In [None]:
import collections

# Turn the explanation's (word, weight) pairs into a two-column table,
# keeping the order in which LIME returned them.
weights = collections.OrderedDict(exp.as_list())
lime_weights = pd.DataFrame(list(weights.items()), columns=["words", "weights"])

# One bar per word; tilt the word labels so they do not overlap
fig, ax = plt.subplots()
sns.barplot(data=lime_weights, x="words", y="weights", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Sample {} features weights given by LIME".format(idx))
plt.show()