In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import transformers
import shap
import os


from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.utils import resample
import xgboost as xgb 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [2]:
def _plot_class_shares(labels, ax, title):
    """Draw a bar chart of the relative frequency of each label value on ``ax``."""
    shares = (
        pd.DataFrame({"star_rating": labels})
        .value_counts(normalize=True)
        .reset_index()
        .set_index("star_rating")
        .sort_index()
    )
    shares.plot(kind="bar", ax=ax)
    ax.set_title(title)
    ax.get_legend().remove()
    ax.set_ylim(0, 1)


def assess_on_val(clf, X_val_vectorized, y_val):
    """Predict on the validation set and summarise the result in a three-panel figure.

    Panels, left to right:
      1. accuracy restricted to the samples of each true rating class, with the
         overall accuracy drawn as a red dashed line;
      2. relative frequency of each predicted rating class;
      3. relative frequency of each actual rating class in ``y_val``.

    The overall accuracy is also printed.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``. When it has a
        ``_final_estimator`` attribute (as sklearn's ``Pipeline`` does), that
        final step is used for the figure title; otherwise ``clf`` itself is.
    X_val_vectorized : validation features, passed straight to ``clf.predict``.
    y_val : true labels; must support elementwise ``==`` and boolean-mask
        indexing (e.g. a numpy array or a pandas Series).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the three panels.
    """
    print("Start predicting")
    predicted = clf.predict(X_val_vectorized)
    print("Finished predicting")
    total_accuracy = metrics.accuracy_score(y_val, predicted)

    # Accuracy computed only over the samples whose true label is `rc`.
    per_class = {}
    for rc in np.unique(y_val):
        in_class = y_val == rc
        per_class[rc] = metrics.accuracy_score(y_val[in_class], predicted[in_class])
    accuracy_by_rating_class = pd.DataFrame.from_dict(per_class, orient="index")

    fig = plt.figure(figsize=(20, 5))

    ax = fig.add_subplot(131)
    accuracy_by_rating_class.sort_index().plot(kind="bar", ax=ax)
    ax.set_title("Accuracy per  \nrating class")
    ax.axhline(y=total_accuracy, color='r', linestyle='--')
    ax.get_legend().remove()
    ax.set_ylim(0, 1)

    _plot_class_shares(predicted, fig.add_subplot(132), "Percentage of \n predicted rating class")
    _plot_class_shares(y_val, fig.add_subplot(133), "Percentage of actual \n rating classes in val")

    # Fall back to the estimator itself so a bare (non-Pipeline) classifier
    # does not fail with AttributeError after all the plotting is done.
    fig.suptitle(str(getattr(clf, "_final_estimator", clf)), y=1.1)

    print("Global accuracy: " + str(total_accuracy))
    return fig

In [3]:
# Directory holding the project's data files. Set the SUPPLY_CHAIN_DATA_DIR
# environment variable to run on another machine; the default is the
# original hard-coded location.
DATA_DIR = os.environ.get("SUPPLY_CHAIN_DATA_DIR", r"D:\sep22_10_supply_chain\data")

# The working directory is still switched so that relative paths used
# anywhere else in the notebook keep resolving against the data directory.
os.chdir(DATA_DIR)

# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
data_en = pd.read_pickle(r"data_en.pickle")

In [4]:
# Pull the columns used downstream out of the loaded table:
#   X  <- "processed_reviews", y <- "star_rating", y2 <- "review_body"
X, y, y2 = (
    data_en[column]
    for column in ("processed_reviews", "star_rating", "review_body")
)

In [5]:
# Undersample so that every star rating contributes the same number of reviews.
RATING_CLASSES = [1, 2, 3, 4, 5]
REFERENCE_CLASS = 2     # class whose size sets the sample count (presumably the smallest - confirm)
SAMPLE_FRACTION = 0.25  # share of the reference class drawn from each class

# Loop-invariant, so computed once instead of once per class.
n_per_class = int(np.floor(X[y == REFERENCE_CLASS].shape[0] * SAMPLE_FRACTION))

# Sampling is without replacement, so every class needs at least
# `n_per_class` rows; the fixed random_state keeps the draw reproducible.
rs = [
    resample(X[y == sr], y[y == sr], replace=False, n_samples=n_per_class, random_state=123)
    for sr in RATING_CLASSES
]

# Each entry of `rs` is an (X_sample, y_sample) pair for one class.
X_list = [X_part for X_part, _ in rs]
y_list = [y_part for _, y_part in rs]

# Concatenate the per-class samples into flat arrays.
X_us = np.hstack(X_list)
y_us = np.hstack(y_list)

In [4]:
# Two-column frame pairing each raw review body with its star rating.
# NOTE(review): the star rating is stored under the key 'emotion'; the name
# looks carried over from the SHAP emotion example - confirm before relying on it.
data = pd.DataFrame(
    dict(
        text=data_en['review_body'],
        emotion=data_en['star_rating'],
    )
)

In [9]:
# Adapted from the SHAP multiclass emotion-classification example:
# https://shap.readthedocs.io/en/latest/example_notebooks/text_examples/sentiment_analysis/Emotion%20classification%20multiclass%20example.html

# Checkpoint id shared by the tokenizer and the model, kept in one place so
# the two can never point at different checkpoints.
MODEL_NAME = "nateraw/bert-base-uncased-emotion"

# NOTE(review): blanking CURL_CA_BUNDLE looks like a workaround that bypasses
# TLS certificate verification for the model download - confirm it is still
# needed and avoid it on untrusted networks.
os.environ["CURL_CA_BUNDLE"] = ""
# Hide CUDA devices; together with device="cpu" below this keeps inference on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# load the model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# build a pipeline object to do predictions
# NOTE(review): transformers warns that `return_all_scores` is deprecated and
# suggests `top_k=None`. Left unchanged because the replacement may alter the
# ordering of the returned scores - verify before migrating.
pred = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
    return_all_scores=True,
)

`return_all_scores` is now deprecated,  if want a similar funcionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.


In [14]:
# Sanity check: score a single review (index label 5) with the pipeline,
# which yields one score per emotion label.
pred(data['text'][5])

[[{'label': 'sadness', 'score': 0.06754638254642487},
  {'label': 'joy', 'score': 0.7624708414077759},
  {'label': 'love', 'score': 0.006418523844331503},
  {'label': 'anger', 'score': 0.08851144462823868},
  {'label': 'fear', 'score': 0.0695919543504715},
  {'label': 'surprise', 'score': 0.005460812244564295}]]

In [10]:
# Wrap the text-classification pipeline in a SHAP explainer.
explainer = shap.Explainer(pred)

In [11]:
# Explain only the first three reviews: the recorded run of this cell took
# about an hour and a half on CPU, so keep the slice small.
shap_values = explainer(data['text'].iloc[:3])

Partition explainer: 4it [1:33:00, 1860.04s/it]                       


In [12]:
# Render the SHAP text plot for the explained reviews.
shap.plots.text(shap_values)