In [25]:
import pandas as pd 
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix

### Data Facts and Import 

In [26]:
# Read the train and test CSVs of the tweet-sentiment dataset into two frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/SentimentAnalysisofTweetsthroughAltmetrics/train.csv',
        'Dataset/SentimentAnalysisofTweetsthroughAltmetrics/test.csv',
    )
)

In [27]:
# Give both frames the same two column names used by every later cell.
df_train.columns = df_test.columns = ["Text", "Label"]

In [28]:
# Remove the neutral rows (Label == 0) from both frames.
# The original row index is kept, so it has gaps after this step.
df_train = df_train.loc[df_train['Label'].ne(0)]
df_test = df_test.loc[df_test['Label'].ne(0)]

In [29]:
# (rows, columns) of the training frame after the Label != 0 filter.
df_train.shape

(470, 2)

In [30]:
# (rows, columns) of the test frame after the Label != 0 filter.
df_test.shape

(201, 2)

In [31]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,Text,Label
0,good acronym copper nanotubes Definitely,-1
2,GlycemicIndex diet restricted energy effective...,1
3,higher fibre intake partic cereal fibre reduce...,1
4,next life going research copper nanotubes CuNTs,-1
6,Bean rich diet produces equivalent weight loss...,1


In [32]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,Text,Label
0,Yeah paper ebirdf,1
2,platform Bioinformatics paper advanced access ...,1
4,Duan naturally award Best Science Acronym year,-1
5,Everything Chinese turns swear word think karma,-1
6,dear difficulties finding scientific abbreviat...,-1


In [33]:
# Print index range, per-column non-null counts and dtypes for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 0 to 730
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    470 non-null    object
 1   Label   470 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.0+ KB


In [34]:
# Print index range, per-column non-null counts and dtypes for the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 313
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    201 non-null    object
 1   Label   201 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.7+ KB


In [35]:
# Summary statistics of the numeric columns, one row per column.
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,470.0,0.029787,1.000621,-1.0,-1.0,1.0,1.0,1.0


In [36]:
# Summary statistics of the numeric columns, one row per column.
df_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,201.0,0.014925,1.002385,-1.0,-1.0,1.0,1.0,1.0


### Data Cleaning / EDA

In [37]:
# ### Checking Missing values in the Data Set and printing the Percentage for Missing Values for Each Columns ###

# count = df_train.isnull().sum().sort_values(ascending=False)
# percentage = ((df_train.isnull().sum()/len(df_train)*100)).sort_values(ascending=False)
# missing_data = pd.concat([count, percentage], axis=1, keys=['Count','Percentage'])

# print('Count and percentage of missing values for the columns:')

# missing_data

In [38]:
# ### Checking for the Distribution of Default ###
# import matplotlib.pyplot as plt
# %matplotlib inline
# print('Percentage for default\n')
# print(round(df_train.Is_Response.value_counts(normalize=True)*100,2))
# round(df_train.Is_Response.value_counts(normalize=True)*100,2).plot(kind='bar')
# plt.title('Percentage Distributions by review type')
# plt.show()

In [39]:
#Removing columns
#df_train.drop(columns = ['User_ID', 'Browser_Used', 'Device_Used'], inplace = True)

In [40]:
# #This function converts to lower-case, removes square bracket, removes numbers and punctuation
# def text_clean_1(text):
#     text = text.lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text

# cleaned1 = lambda x: text_clean_1(x)

In [41]:
# # Apply first level cleaning

# # Let's take a look at the updated text
# df_train['cleaned_description'] = pd.DataFrame(df_train.Description.apply(cleaned1))
# df_train.head(10)

In [42]:
# # Apply a second round of cleaning
# def text_clean_2(text):
#     text = re.sub('[‘’“”…]', '', text)
#     text = re.sub('\n', '', text)
#     return text

# cleaned2 = lambda x: text_clean_2(x)

In [43]:
# # Let's take a look at the updated text
# df_train['cleaned_description_new'] = pd.DataFrame(df_train['cleaned_description'].apply(cleaned2))
# df_train.head(10)

### Splitting the data

In [44]:
#from sklearn.model_selection import train_test_split

#Independent_var = df_train.cleaned_description_new
#Dependent_var = df_train.Is_Response

#IV_train, IV_test, DV_train, DV_test = train_test_split(Independent_var, Dependent_var, test_size = 0.1, random_state = 225)

# The dataset already comes as separate train/test frames, so no random split
# is drawn here: IV_* are the text columns, DV_* the label columns.
IV_train, DV_train = df_train["Text"], df_train["Label"]
IV_test, DV_test = df_test["Text"], df_test["Label"]

# Report the number of rows in each of the four series.
print('IV_train :', IV_train.shape[0])
print('IV_test  :', IV_test.shape[0])
print('DV_train :', DV_train.shape[0])
print('DV_test  :', DV_test.shape[0])


IV_train : 470
IV_test  : 201
DV_train : 470
DV_test  : 201


### Model training 

In [45]:
# This model training code is directly from:
# https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py

'''Trains an LSTM model on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras_preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras_preprocessing.sequence import pad_sequences


In [46]:
# Settings reused by the padding / model cells of this notebook.
max_features = 20000  # passed to imdb.load_data as num_words and to Embedding as its input size
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
# NOTE: this loads the Keras IMDB review dataset, not the tweet frames prepared earlier.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
for n_sequences, split_label in (
    (len(x_train), 'train sequences'),
    (len(x_test), 'test sequences'),
):
    print(n_sequences, split_label)

Loading data...
25000 train sequences
25000 test sequences


In [47]:
# Pad / truncate every review to exactly `maxlen` token ids.
print('Pad sequences (samples x time)')
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Embedding -> LSTM -> single sigmoid unit for the binary sentiment label.
print('Build model...')
model = Sequential([
    Embedding(max_features, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# try using different optimizers and different optimizer configs
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# One epoch only. validation_data is the same (x_test, y_test) split that
# model.evaluate scores right after, so the two numbers are not independent.
print('Train...')
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)
Build model...
Train...
Cause: generators are not supported
Cause: generators are not supported
Test score: 0.3692709803581238
Test accuracy: 0.8380399942398071


In [48]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whichever `pip` is first on the shell PATH.
# Restart the kernel after installing so the new version is actually imported.
%pip install keras==2.3.1



In [50]:
# Use the %pip magic so the install targets the running kernel's environment.
# NOTE(review): in the recorded run pip found no distribution for this pin
# (only 2.8.0rc1 - 2.11.0 were offered), so this cell fails as written on that
# interpreter; pick a TensorFlow release that exists for the kernel's Python.
%pip install tensorflow==1.14.0

ERROR: Could not find a version that satisfies the requirement tensorflow==1.14.0 (from versions: 2.8.0rc1, 2.8.0, 2.8.1, 2.8.2, 2.8.3, 2.8.4, 2.9.0rc0, 2.9.0rc1, 2.9.0rc2, 2.9.0, 2.9.1, 2.9.2, 2.9.3, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.10.1, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2, 2.11.0)
ERROR: No matching distribution found for tensorflow==1.14.0


In [49]:
import shap

# NOTE(review): the recorded run of this cell ended in
# "TypeError: 'NoneType' object cannot be interpreted as an integer" after shap
# warned that standalone keras is unsupported and that graph support was removed
# for this TensorFlow version -- presumably a shap/keras/TF incompatibility; confirm.

# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(model, x_train[:100])

# explain the first 2 test predictions (x_test[:2])
# explaining each prediction requires 2 * background dataset size runs
shap_values = explainer.shap_values(x_test[:2])

  from .autonotebook import tqdm as notebook_tqdm
keras is no longer supported, please use tf.keras instead.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.


TypeError: 'NoneType' object cannot be interpreted as an integer

In [None]:
# init the JS visualization code
shap.initjs()

# transform the indexes to words
import numpy as np

# imdb.load_data() was called with its default id layout, so the ids stored in
# x_test are NOT the raw ranks returned by imdb.get_word_index():
#   0 -> padding (added by pad_sequences), 1 -> start-of-sequence marker,
#   2 -> out-of-vocabulary marker, and every real word rank is shifted up by 3.
# Building the reverse lookup without that shift labels every token with the
# wrong word, so the shift is applied here.
INDEX_FROM = 3
START_CHAR = 1
OOV_CHAR = 2

words = imdb.get_word_index()
num2word = {rank + INDEX_FROM: word for word, rank in words.items()}
num2word[START_CHAR] = "[START]"
num2word[OOV_CHAR] = "[OOV]"

# Ids with no entry (the 0 padding id) fall back to "NONE".
x_test_words = np.stack([
    np.array([num2word.get(token_id, "NONE") for token_id in x_test[i]])
    for i in range(10)
])

# plot the explanation of the first prediction
# Note the model is "multi-output" because it is rank-2 but only has one column
shap.force_plot(explainer.expected_value[0], shap_values[0][0], x_test_words[0])