<a href="https://colab.research.google.com/github/nonoumasy/Bicyclist-Collisions-in-Los-Angeles-2013-2018/blob/master/Model_Interpretation_on_Text_Classification_using_eli5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used as class labels throughout this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the train split first, then the test split, with identical settings
# (same category filter, shuffling enabled, fixed seed of 42).
twenty_train, twenty_test = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
    )
    for subset in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# `vec` and `clf` are kept as separate names because later cells pass them
# to eli5 individually, not only through the pipeline.
clf = LogisticRegressionCV()
vec = TfidfVectorizer(stop_words='english')

# fit() returns the pipeline itself, so the construction and the training
# can be chained; the trailing bare name keeps the fitted pipeline's repr as
# the cell output.
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)
pipe



Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv='warn',
                                      du

In [53]:
from sklearn import metrics

def print_report(pipe):
    """Print per-class metrics and overall accuracy of `pipe` on the test split.

    Predicts labels for `twenty_test.data` with `pipe`, prints the
    classification report (rows labelled with `twenty_test.target_names`),
    and then prints the accuracy formatted to three decimal places.

    Reads the module-level `twenty_test` and `metrics`; returns None.
    """
    expected = twenty_test.target
    predicted = pipe.predict(twenty_test.data)
    print(metrics.classification_report(
        expected, predicted, target_names=twenty_test.target_names))
    accuracy = metrics.accuracy_score(expected, predicted)
    print("accuracy: {:0.3f}".format(accuracy))

# Evaluate the pipeline fitted above (TF-IDF + logistic regression) on the
# held-out test split.
print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.96      0.83      0.89       319
         comp.graphics       0.90      0.98      0.94       389
               sci.med       0.96      0.92      0.94       396
soc.religion.christian       0.91      0.95      0.93       398

              accuracy                           0.93      1502
             macro avg       0.93      0.92      0.92      1502
          weighted avg       0.93      0.93      0.93      1502

accuracy: 0.927


In [54]:
# `eli5` was used here without ever being imported in this notebook, so the
# cell raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): eli5 is a third-party package; on Colab it presumably has to
# be installed first (e.g. `%pip install eli5` in its own cell) — confirm.
import eli5

# Show the top 10 weighted features for each class; the vectorizer is passed
# so eli5 can label the classifier's coefficients with their tokens.
eli5.show_weights(clf, vec=vec, top=10,
                  target_names=twenty_test.target_names)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+17.784,atheism,,
+16.085,keith,,
+12.271,mathew,,
+12.254,islamic,,
+11.600,okcforum,,
+11.312,enviroleague,,
+11.310,morality,,
+11.266,wingate,,
+11.202,umd,,
+10.730,writes,,

Weight?,Feature
+17.784,atheism
+16.085,keith
+12.271,mathew
+12.254,islamic
+11.600,okcforum
+11.312,enviroleague
+11.310,morality
+11.266,wingate
+11.202,umd
+10.730,writes

Weight?,Feature
+20.848,graphics
+10.415,3d
+10.069,images
+10.005,image
+9.740,files
+9.657,file
+9.244,software
+9.116,42
+9.001,code
… 9969 more positive …,… 9969 more positive …

Weight?,Feature
+15.614,doctor
+14.590,pitt
+14.495,msg
+12.984,health
+12.939,disease
+12.197,treatment
+11.784,cancer
+11.108,medical
… 12382 more positive …,… 12382 more positive …
… 23091 more negative …,… 23091 more negative …

Weight?,Feature
+21.411,christians
+19.986,church
+18.214,rutgers
+17.743,christ
+16.668,clh
+14.726,christian
+14.658,athos
… 9756 more positive …,… 9756 more positive …
… 25717 more negative …,… 25717 more negative …
-15.435,posting


In [55]:
# Explain the classifier's prediction for the first test document, one
# contribution table per class. As the last expression of the cell, the
# returned explanation is rendered automatically.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    target_names=twenty_test.target_names,
    vec=vec,
)

Contribution?,Feature
2.065,Highlighted in text (sum)
-7.756,<BIAS>

Contribution?,Feature
-1.742,<BIAS>
-3.885,Highlighted in text (sum)

Contribution?,Feature
8.507,Highlighted in text (sum)
-3.749,<BIAS>

Contribution?,Feature
-4.545,<BIAS>
-7.325,Highlighted in text (sum)


In [58]:
# Debugging a HashingVectorizer-based pipeline with eli5

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Rebuild the pipeline with hashed unigram + bigram features and an SGD-trained
# classifier. NOTE: this rebinds `vec`, `clf` and `pipe`, replacing the
# TF-IDF / logistic-regression objects created in the earlier cell.
clf = SGDClassifier(random_state=42)
vec = HashingVectorizer(ngram_range=(1, 2), stop_words='english')
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)

print_report(pipe)

# Explain the prediction for the first test document, restricted to the
# 'sci.med' class. This call sits in the middle of a cell, and Jupyter only
# auto-renders a cell's *last* expression, so the explanation has to be handed
# to display() explicitly; as a bare expression it was computed and then
# silently discarded. (`display` is provided by the IPython kernel.)
display(eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                             target_names=twenty_test.target_names,
                             targets=['sci.med']))

# `np` was used below without being imported anywhere in the notebook, which
# raised NameError on a fresh kernel.
import numpy as np
from eli5.sklearn import InvertableHashingVectorizer

# Wrap the hashing vectorizer so eli5 can report token names for the hashed
# features; the wrapper is fitted on a sample of the training documents.
ivec = InvertableHashingVectorizer(vec)

# Fit on a sample one tenth the size of the training set. The sample is drawn
# from a seeded generator so the recovered feature names (and therefore the
# table below) are the same on every run; the unseeded np.random.choice made
# this output vary between executions.
sample_size = len(twenty_train.data) // 10
X_sample = np.random.RandomState(42).choice(twenty_train.data, size=sample_size)
ivec.fit(X_sample)

# Top 20 weighted features per class, labelled through the invertable wrapper.
eli5.show_weights(clf, vec=ivec, top=20,
                  target_names=twenty_test.target_names)


                        precision    recall  f1-score   support

           alt.atheism       0.96      0.83      0.89       319
         comp.graphics       0.90      0.96      0.93       389
               sci.med       0.93      0.91      0.92       396
soc.religion.christian       0.91      0.97      0.94       398

              accuracy                           0.92      1502
             macro avg       0.93      0.92      0.92      1502
          weighted avg       0.93      0.92      0.92      1502

accuracy: 0.923


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.450,atheism,,
+1.794,writes,,
+1.721,keith,,
+1.575,atheists,,
+1.522,morality,,
+1.469,islamic,,
+1.357,mathew,,
+1.309,wingate,,
+1.228,islam,,
+1.181,moral,,

Weight?,Feature
+2.450,atheism
+1.794,writes
+1.721,keith
+1.575,atheists
+1.522,morality
+1.469,islamic
+1.357,mathew
+1.309,wingate
+1.228,islam
+1.181,moral

Weight?,Feature
+3.823,graphics
+2.048,image
+1.970,files
+1.912,3d
+1.892,images
+1.861,file
+1.805,software
+1.761,points
+1.758,code
+1.635,card

Weight?,Feature
+2.213,doctor
+1.969,disease
+1.889,health
+1.848,msg
+1.771,information
+1.724,treatment
+1.687,cancer
+1.642,medical
+1.506,pitt
+1.414,pain

Weight?,Feature
+3.094,christians
+2.820,church
+2.409,rutgers edu
+2.368,christ …
+2.320,clh
+2.247,rutgers
+2.073,christian
+1.613,athos rutgers
+1.613,athos
+1.550,love
