In [1]:
import numpy as np
import pandas as pd
import lime
from lime import lime_text


In [2]:
# Load the IMDB review dataset; the CSV is expected in the current working directory.
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [4]:
# Preview the frame: one free-text `review` column and one `sentiment` label column.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


Begin with a train/test split

In [5]:
from sklearn.model_selection import train_test_split

# Target is the sentiment label; features are everything that remains.
y = df['sentiment']
X = df.drop(columns='sentiment')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Training features: still a one-column DataFrame at this point.
X_train.shape

(40000, 1)

In [7]:
# Test features: the held-out 20% of rows.
X_test.shape

(10000, 1)

In [8]:
# Training labels: 1-D, one entry per training row.
y_train.shape

(40000,)

In [9]:
# Test labels: 1-D, one entry per test row.
y_test.shape

(10000,)

Vectorize the review text with TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Collapse the one-column frames produced by the split into 1-D Series of review strings.
X_train, X_test = X_train.squeeze(), X_test.squeeze()

# TF-IDF settings: cap the vocabulary at 5000 terms and drop English stop words.
tfidf_params = {'max_features': 5000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training reviews only, then reuse the fitted vectorizer for the test reviews.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
# Row count of the vectorized training matrix should equal the number of training labels.
print(X_train_vec.shape, len(y_train), sep='\n')

(40000, 5000)
40000


train the model

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the TF-IDF features.
# n_jobs=-1 builds the trees on all available CPU cores. The recorded run of this
# cell ended in a KeyboardInterrupt (presumably because single-threaded training on
# the 40000 x 5000 matrix took too long), which left every later cell unexecuted.
# random_state is kept fixed so the fitted forest stays reproducible.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train_vec, y_train)

KeyboardInterrupt: 

model evaluation

In [None]:
# Evaluate the fitted model on the held-out test split and display the result.
score = model.score(X_test_vec, y_test)
score

Create the explainer

In [None]:
# Build the LIME text explainer.
# The notebook imports `lime_text` (not `LimeTextExplainer` itself), so the class
# must be referenced through the module; the bare name raises NameError.
# class_names are listed in sorted label order; random_state fixes LIME's sampling.
explainer = lime_text.LimeTextExplainer(
    class_names=['negative', 'positive'],
    random_state=42,
)

create a pipeline

In [None]:
from sklearn.pipeline import make_pipeline
# Chain the already-fitted vectorizer and model so that `pipeline.predict_proba`
# can be called directly on raw review strings (which is what the explainer is given).
pipeline = make_pipeline(vectorizer, model)

select a sample

In [None]:
# Pick one review from the test split and compare the model's prediction with the truth.
idx = 80  # positional index into the test split
sample_text = X_test.iloc[idx]
data_test = vectorizer.transform([sample_text])
prediction = model.predict(data_test)[0]
y_true = y_test.iloc[idx]
# The labels are the strings 'positive' / 'negative', so they are printed as-is.
# Comparing them with the integer 1 is always False and would report 'negative'
# for every sample regardless of the real prediction or label.
print(f"Sample {idx} from the test set, predicted as {prediction}, true label is {y_true}")


In [None]:
# Explain the prediction for the selected review with LIME.
# The pipeline's predict_proba is passed so LIME can score raw text variants directly.
exp = explainer.explain_instance(
    sample_text,
    pipeline.predict_proba,
    num_features=6,  # at most six features in the explanation
)
exp.save_to_file('text_explanation1.html')
# `text` is a boolean switch for rendering the highlighted document, not the
# document itself; the explanation already carries the explained text.
exp.show_in_notebook(text=True)


Rerun the explainer with a different seed to see how stable the explanation is

In [None]:
# Repeat the explanation with a different LIME seed for comparison with the first run.
# The class is referenced through the imported `lime_text` module; the bare name
# `LimeTextExplainer` is never imported in this notebook and raises NameError.
explainer = lime_text.LimeTextExplainer(
    class_names=['negative', 'positive'],
    random_state=99,
)
exp2 = explainer.explain_instance(
    sample_text,
    pipeline.predict_proba,
    num_features=6,  # at most six features in the explanation
)
exp2.save_to_file('text_explanation2.html')
# `text` is a boolean switch for rendering the highlighted document.
exp2.show_in_notebook(text=True)