Our goal is to create an NLP classifier that when given a paragraph from a famous classical book will be able to predict the text's author.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Creating the feature set and label set

In [2]:
path_to_text = 'books_and_authors'
data = pd.read_csv(path_to_text, names=['v1', 'v2'])

label = data['v1']
text = data['v2']

text

0      Pretty soon I wanted to smoke, and asked the w...
1      Her sister, Miss Watson, a tolerable slim old ...
2      Now she had got a start, and she went on and t...
3      Miss Watson she kept pecking at me, and it got...
4      I set down again, a-shaking all over, and got ...
                             ...                        
994    I was on the point of asking him what that wor...
995      1. Knowledge of Literature.--Nil.\r\n  2.   ...
996    I see that I have alluded above to his powers ...
997    During the first week or so we had no callers,...
998    It was upon the 4th of March, as I have good r...
Name: v2, Length: 999, dtype: object

# Text preprocessing using NLTK

In [3]:
import nltk

nltk.download('omw-1.4')
nltk.download('punkt')

from nltk.corpus import stopwords, wordnet

nltk.download('stopwords')
stopwords_ = stopwords.words('english')

nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
import string

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


We'll try two different datasets. The first (text_preprocessed) will be fully preprocessed (low letters, missing punctuation, words represented like tokens and then lemmatization. The second dataset (text_preprocessed_1) will only have low letters and will be tokenized.

#First dataset - fully preprocessed

In [4]:
text_preprocessed = []
for sentence in text:
    #sentence lower
    sentence = sentence.lower()

    #string punct
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    #tokenize
    tokens = nltk.word_tokenize(sentence)
    
    # stop-words
    tokens_stop_words = []
    
    for token in tokens:
          if token not in stopwords_:
                tokens_stop_words.append(token)
                
    # Lemmatization
    tokens_lemma = []
    for token in tokens_stop_words:
          tokens_lemma.append(wnl.lemmatize(token, get_wordnet_pos(nltk.pos_tag([token])[0][1])))
            
    final = ' '.join(tokens_lemma)
    
    text_preprocessed.append(final)

#Second dataset with lower letters and tokenisation

In [5]:
text_preprocessed_1 = []
for sentence in text:
    #sentence lower
    sentence = sentence.lower()
    #tokenize
    tokens = nltk.word_tokenize(sentence)
    
    text_preprocessed_1.append(' '.join(tokens))

In [6]:
len(text_preprocessed_1),len(text_preprocessed), len(text)

(999, 999, 999)

### Extracting Text and Train Data

In [7]:
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(
    text_preprocessed, label, test_size=0.3, random_state=42)


len(trainX), len(testX), len(trainY), len(testY)

(699, 300, 699, 300)

Label encoding

In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
trainY= le.fit_transform(trainY)
testY = le.fit_transform(testY)
trainY.shape,testY.shape
trainY[0]

3

In [9]:
np.unique(trainY, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([ 68, 104,  89, 167,  74, 108,  89]))

In [12]:
#trainY

First, we'll use CountVectorizer to process the data and to show us the number of apperances of each token (word).

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectors as features
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', ngram_range=(1, 1), max_features=5000)
count_vect.fit(text_preprocessed)

# transform the training and test data using count vectorizer object
trainX_vec = count_vect.transform(trainX)
testX_vec = count_vect.transform(testX)
trainX_vec.shape,
testX_vec.shape

(300, 5000)

In [11]:
dict_ = count_vect.vocabulary_
new_data = pd.DataFrame.from_dict(dict_, orient='index')

In [12]:
new_data

Unnamed: 0,0
pretty,3526
soon,4202
want,4817
smoke,4162
ask,274
...,...
moorgate,2852
coroner,970
stamford,4269
laboratory,2404


In [14]:
print('count_vect: ')
print(count_vect.vocabulary_)


count_vect: 
{'pretty': 3526, 'soon': 4202, 'want': 4817, 'smoke': 4162, 'ask': 274, 'widow': 4897, 'let': 2488, 'wouldnt': 4959, 'say': 3936, 'mean': 2706, 'practice': 3475, 'wasnt': 4826, 'clean': 721, 'must': 2914, 'try': 4668, 'way': 4833, 'people': 3281, 'get': 1888, 'thing': 4512, 'dont': 1371, 'know': 2399, 'nothing': 3031, 'moses': 2865, 'kin': 2377, 'use': 4745, 'anybody': 200, 'go': 1916, 'see': 3983, 'yet': 4988, 'find': 1727, 'power': 3472, 'fault': 1685, 'good': 1923, 'take': 4442, 'course': 1002, 'right': 3846, 'do': 1351, 'sister': 4116, 'miss': 2807, 'watson': 4831, 'tolerable': 4570, 'slim': 4151, 'old': 3096, 'maid': 2621, 'come': 788, 'live': 2542, 'set': 4017, 'work': 4948, 'middle': 2765, 'hard': 2022, 'hour': 2142, 'make': 2629, 'ease': 1466, 'couldnt': 986, 'stood': 4310, 'much': 2887, 'longer': 2563, 'deadly': 1118, 'dull': 1440, 'would': 4958, 'put': 3646, 'foot': 1777, 'like': 2515, 'straight': 4318, 'gap': 1860, 'stretch': 4330, 'behave': 393, 'told': 4569, '

In [15]:
count_vect.vocabulary_['crime']

1031

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


#create an instance of the model
lr_model = LogisticRegression(random_state=7, C=1, max_iter = 500) #pomalo C se poloshi, pogolemo isto poloshi
#train the model
lr_model.fit(trainX_vec, trainY)

#predict test data
pred_test = lr_model.predict(testX_vec)

#print evaluation metrics 
print(classification_report(testY,pred_test))
print(confusion_matrix(testY,pred_test))
print("Accuracy:", accuracy_score(testY, pred_test))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        30
           1       0.83      0.87      0.85        39
           2       0.70      0.74      0.72        31
           3       0.83      0.95      0.89        65
           4       0.93      0.87      0.90        46
           5       0.76      0.71      0.73        48
           6       0.89      0.76      0.82        41

    accuracy                           0.83       300
   macro avg       0.83      0.81      0.82       300
weighted avg       0.83      0.83      0.83       300

[[24  1  4  0  0  1  0]
 [ 1 34  0  3  1  0  0]
 [ 1  1 23  1  0  5  0]
 [ 0  2  0 62  0  1  0]
 [ 1  0  0  2 40  0  3]
 [ 1  1  5  5  1 34  1]
 [ 0  2  1  2  1  4 31]]
Accuracy: 0.8266666666666667


### Вториот модел - не процесиран текст

In [17]:
from sklearn.model_selection import train_test_split

trainX_1, testX_1, trainY_1, testY_1 = train_test_split(
    text_preprocessed_1, label, test_size=0.3, random_state=42)


len(trainX), len(testX), len(trainY), len(testY)


len(trainX_1), len(testX_1) ,len(trainY_1), len(testY_1)

(699, 300, 699, 300)

In [18]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
trainY_1= le.fit_transform(trainY_1)
testY_1 = le.fit_transform(testY_1)
trainY_1.shape,testY_1.shape
trainY_1[0]

3

In [19]:
count_vect_1 = CountVectorizer(max_features=5000)
count_vect_1.fit(text_preprocessed_1)

# transform the training and test data using count vectorizer object
trainX_1_vec = count_vect.transform(trainX_1)
testX_1_vec = count_vect.transform(testX_1)
trainX_1_vec.shape,
testX_1_vec.shape

(300, 5000)

In [20]:
lr_model_1 = LogisticRegression(random_state=0, C=100, max_iter=1000) #so ponisko C, polosha preciznost
lr_model_1.fit(trainX_1_vec, trainY_1)

pred_test_1 = lr_model_1.predict(testX_1_vec)

#print evaluation metrics 
print(classification_report(testY_1,pred_test_1))
print(confusion_matrix(testY_1,pred_test_1))
print("Accuracy:",accuracy_score(testY_1, pred_test_1))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84        30
           1       0.76      0.90      0.82        39
           2       0.76      0.71      0.73        31
           3       0.81      0.94      0.87        65
           4       0.83      0.76      0.80        46
           5       0.85      0.73      0.79        48
           6       0.77      0.66      0.71        41

    accuracy                           0.80       300
   macro avg       0.80      0.79      0.79       300
weighted avg       0.80      0.80      0.80       300

[[26  1  1  1  1  0  0]
 [ 1 35  0  0  2  0  1]
 [ 2  1 22  1  1  3  1]
 [ 0  2  0 61  1  1  0]
 [ 2  2  0  2 35  0  5]
 [ 0  2  4  6  0 35  1]
 [ 1  3  2  4  2  2 27]]
Accuracy: 0.8033333333333333


In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words={"english"}, ngram_range=(1, 3)) 
tfidf.fit(text)

X_train_tfidf = tfidf.transform(trainX)
X_test_tfidf = tfidf.transform(testX)

X_train_tfidf.shape, X_test_tfidf.shape

((699, 5000), (300, 5000))

In [25]:
lr_model_tf = LogisticRegression(random_state=0, C=10, max_iter=1000)
lr_model_tf.fit(X_train_tfidf, trainY)
pred_test_tf = lr_model_tf.predict(X_test_tfidf)

#print evaluation metrics 
print(classification_report(testY,pred_test_tf))
print(confusion_matrix(testY,pred_test_tf))
print("Accuracy:",accuracy_score(testY, pred_test_tf))

              precision    recall  f1-score   support

           0       0.96      0.73      0.83        30
           1       0.82      0.92      0.87        39
           2       0.75      0.77      0.76        31
           3       0.91      0.98      0.95        65
           4       0.93      0.91      0.92        46
           5       0.89      0.85      0.87        48
           6       0.85      0.83      0.84        41

    accuracy                           0.88       300
   macro avg       0.87      0.86      0.86       300
weighted avg       0.88      0.88      0.88       300

[[22  2  3  2  0  1  0]
 [ 0 36  0  0  1  0  2]
 [ 1  1 24  1  0  3  1]
 [ 0  1  0 64  0  0  0]
 [ 0  0  0  1 42  0  3]
 [ 0  1  3  2  1 41  0]
 [ 0  3  2  0  1  1 34]]
Accuracy: 0.8766666666666667


#Multinomial NB

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
nb = MultinomialNB(alpha=0.01)
nb.fit(X_train_tfidf, trainY)
pred_test_nb = nb.predict(X_test_tfidf)

#print evaluation metrics 
print(classification_report(testY,pred_test_nb))
print(confusion_matrix(testY,pred_test_nb))
print("Accuracy:",accuracy_score(testY, pred_test_nb))

              precision    recall  f1-score   support

           0       1.00      0.53      0.70        30
           1       0.85      0.90      0.88        39
           2       0.88      0.71      0.79        31
           3       0.74      0.97      0.84        65
           4       0.98      0.91      0.94        46
           5       0.75      0.81      0.78        48
           6       0.84      0.78      0.81        41

    accuracy                           0.83       300
   macro avg       0.86      0.80      0.82       300
weighted avg       0.85      0.83      0.83       300

[[16  3  2  6  0  2  1]
 [ 0 35  0  1  0  0  3]
 [ 0  1 22  3  1  4  0]
 [ 0  1  1 63  0  0  0]
 [ 0  0  0  1 42  2  1]
 [ 0  1  0  7  0 39  1]
 [ 0  0  0  4  0  5 32]]
Accuracy: 0.83


#SVM classifier

In [28]:
from sklearn import svm
svm_model = svm.SVC(kernel='linear', probability=True, C=10, gamma=0.1)
svm_model.fit(trainX_vec, trainY)

pred_test_svm = svm_model.predict(testX_vec)
print(classification_report(testY,pred_test_svm))
print(confusion_matrix(testY,pred_test_svm))
print("Accuracy:", accuracy_score(testY, pred_test_svm))


              precision    recall  f1-score   support

           0       0.81      0.73      0.77        30
           1       0.74      0.87      0.80        39
           2       0.70      0.74      0.72        31
           3       0.84      0.88      0.86        65
           4       0.93      0.80      0.86        46
           5       0.63      0.71      0.67        48
           6       0.91      0.71      0.79        41

    accuracy                           0.79       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.80      0.79      0.79       300

[[22  1  2  0  0  5  0]
 [ 1 34  1  2  1  0  0]
 [ 1  1 23  1  0  5  0]
 [ 0  3  1 57  0  3  1]
 [ 1  2  0  2 37  3  1]
 [ 2  3  3  4  1 34  1]
 [ 0  2  3  2  1  4 29]]
Accuracy: 0.7866666666666666


In [30]:
words= nltk.word_tokenize(text[2])
length= len(words) 
print(length) 
print(text[2]) 

99
Now she had got a start, and she went on and told me all about the good
place. She said all a body would have to do there was to go around all
day long with a harp and sing, forever and ever. So I didn't think
much of it. But I never said so. I asked her if she reckoned Tom Sawyer
would go there, and she said not by a considerable sight. I was glad
about that, because I wanted him and me to be together.


In [31]:
paragraph = text[25]
sentences = nltk.sent_tokenize(paragraph) 
length= len(sentences) 
print(text[25]) 
print(length) 

“Yo' ole father doan' know yit what he's a-gwyne to do. Sometimes he
spec he'll go 'way, en den agin he spec he'll stay. De bes' way is to
res' easy en let de ole man take his own way. Dey's two angels hoverin'
roun' 'bout him. One uv 'em is white en shiny, en t'other one is black.
De white one gits him to go right a little while, den de black one sail
in en bust it all up. A body can't tell yit which one gwyne to fetch
him at de las'. But you is all right. You gwyne to have considable
trouble in yo' life, en considable joy. Sometimes you gwyne to git
hurt, en sometimes you gwyne to git sick; but every time you's gwyne
to git well agin. Dey's two gals flyin' 'bout you in yo' life. One
uv 'em's light en t'other one is dark. One is rich en t'other is po'.
You's gwyne to marry de po' one fust en de rich one by en by. You
wants to keep 'way fum de water as much as you kin, en don't run no
resk, 'kase it's down in de bills dat you's gwyne to git hung.”
15


In [32]:
# # embeddings 
# from gensim.models.word2vec import Word2Vec
# from gensim.models import Phrases

# bigramer = Phrases(sentences)
# model = Word2Vec(bigramer[sentences], window=5, min_count=10, workers=4)

# # unload memory
# model.init_sims(replace=True) 

# # Storing a model
# model.save("author")
# # new_model = gensim.models.Word2Vec.load('author')

# # Switch to KeyedVectors instance  
# # w2v = {w: vec for w,vec in text_preprocessed}

In [33]:
# model.most_similar()

In [34]:
class EmbeddingVectorizer(object):
    # If word2vec were passed in during initialization, use those
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 100
    
    # learning word2weight
    def fit(self, X, y):
        vect = TfidfVectorizer(min_df=5, ngram_range=(1,3))
        vect.fit(X)
        max_idf = max(vect.idf_)
        self.word2weight = defaultdict(
        lambda: max_idf, [(w, vect.idf_[i]) for w, i in vect.vocabulary_.items()]
        )
        return self
    
    # Use learned word2weight
    def transform(self, X):
        return np.array([
            np.mean([
                self.word2vec[w]*self.word2weight[w] 
                for w in words if w in self.word2vec] or 
                [np.zeros(self.dim)], axis=0) 
            for words in X
        ])
        

In [35]:
from keras.preprocessing.text import Tokenizer #similar to the CountVectorizer and TfIDF from sci-kit

#The word embedding layer expects input sequences to be comprised of integers.
# integer encode sequences of words
tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_preprocessed)

sequences = tokenizer.texts_to_sequences(text_preprocessed)

In [36]:
tokenizer.word_index

{'’': 1,
 '“': 2,
 '”': 3,
 'say': 4,
 'would': 5,
 'one': 6,
 'go': 7,
 'mr': 8,
 'make': 9,
 'man': 10,
 'could': 11,
 'like': 12,
 'come': 13,
 'take': 14,
 'time': 15,
 'get': 16,
 'see': 17,
 'upon': 18,
 'little': 19,
 'know': 20,
 'look': 21,
 'well': 22,
 'great': 23,
 'hand': 24,
 'good': 25,
 'give': 26,
 'much': 27,
 'thing': 28,
 'seem': 29,
 'way': 30,
 'old': 31,
 '‘': 32,
 'might': 33,
 'day': 34,
 'never': 35,
 'even': 36,
 'two': 37,
 'every': 38,
 'eye': 39,
 'men': 40,
 'turn': 41,
 'life': 42,
 'head': 43,
 'thought': 44,
 'house': 45,
 'sir': 46,
 'back': 47,
 'young': 48,
 'people': 49,
 'woman': 50,
 'first': 51,
 'work': 52,
 'tom': 53,
 'must': 54,
 'think': 55,
 'may': 56,
 'mind': 57,
 'always': 58,
 'many': 59,
 'call': 60,
 'face': 61,
 'saw': 62,
 'away': 63,
 'long': 64,
 'though': 65,
 'u': 66,
 'last': 67,
 'year': 68,
 'without': 69,
 'want': 70,
 'nothing': 71,
 'ever': 72,
 'casaubon': 73,
 'boy': 74,
 'night': 75,
 'put': 76,
 'place': 77,
 'world':

In [37]:
sequences = np.array(sequences)

trainX, testX, trainY, testY = train_test_split(
    sequences, label, test_size=0.3, random_state=42)
trainX = np.array(trainX)
testX = np.array(testX)
trainY = np.array(trainY)
testY = np.array(testY)


len(trainX), len(testX), len(trainY), len(testY)

# trainX.shape, testX.shape, trainY.shape, testY.shape

  """Entry point for launching an IPython kernel.


(699, 300, 699, 300)

In [38]:
from keras.preprocessing.sequence import pad_sequences

max_len = 100
#transforms a list (of length num_samples) of sequences (lists of integers) 
#into a 2D Numpy array of shape (num_samples, num_timesteps) num_timesteps is the maxlen argument.

train_X_pad = pad_sequences(trainX, maxlen = max_len, dtype='int32')
test_X_pad = pad_sequences(testX, maxlen = max_len, dtype='int32')

print(train_X_pad.shape)
print(test_X_pad.shape)

(699, 100)
(300, 100)


In [39]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size
#tokenizer.word_index

11996

In [40]:
# create the model
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

def generate_model(vocab_size, max_len, embedding_size): #dropout):

    _input = Input(max_len)

    x = Embedding(input_dim = vocab_size, output_dim = embedding_size) (_input)

    x = LSTM(50)(x)

    output = Dense(7, activation='softmax')(x)

    model = Model(inputs= [_input], outputs = [output])  
    #dropout = layers(Dropout(0.5))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [41]:
#trainY
#dropout sloj posle LSTM, moze da se smeni embedding size

In [42]:
model = generate_model(vocab_size , max_len , embedding_size=80) #dropout=dropout
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 80)           959680    
                                                                 
 lstm (LSTM)                 (None, 50)                26200     
                                                                 
 dense (Dense)               (None, 7)                 357       
                                                                 
Total params: 986,237
Trainable params: 986,237
Non-trainable params: 0
_________________________________________________________________


In [82]:
trainX= np.array(trainX)
trainY= np.array(trainY)
testX= np.array(testX)
testY= np.array(testY)

In [83]:
#testY
le = preprocessing.LabelEncoder()
trainY= le.fit_transform(trainY)
testY = le.fit_transform(testY)
trainY.shape,testY.shape
trainY[0]

3

In [55]:
history = model.fit(train_X_pad, trainY, epochs=10, batch_size=60, validation_data=(test_X_pad, testY))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
pred_test = model.predict(test_X_pad)
pred_test = np.argmax(pred_test,axis=1)
#pred_test = pred_test.round()

#print evaluation metrics 
print(classification_report(testY,pred_test))
print(confusion_matrix(testY,pred_test))
print("Accuracy:",accuracy_score(testY, pred_test))

              precision    recall  f1-score   support

           0       0.71      0.17      0.27        30
           1       0.43      0.67      0.52        39
           2       0.64      0.45      0.53        31
           3       0.82      0.69      0.75        65
           4       0.70      0.67      0.69        46
           5       0.46      0.73      0.56        48
           6       0.86      0.73      0.79        41

    accuracy                           0.62       300
   macro avg       0.66      0.59      0.59       300
weighted avg       0.67      0.62      0.62       300

[[ 5 10  6  0  2  7  0]
 [ 0 26  1  1  5  4  2]
 [ 0  1 14  2  2 11  1]
 [ 0  5  0 45  0 15  0]
 [ 0  9  0  2 31  2  2]
 [ 2  5  0  3  3 35  0]
 [ 0  5  1  2  1  2 30]]
Accuracy: 0.62


In [60]:
text = data['v2']
label = data['v1']

from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(text, label, test_size = 0.20)

In [61]:
from tensorflow.keras.preprocessing.text import one_hot

vocab_size = 11996

X_train = [one_hot(d, vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ') for d in X_train]
X_test = [one_hot(d, vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ') for d in X_test]

In [62]:
from keras.preprocessing.sequence import pad_sequences
max_length = 100
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')


In [100]:
from keras import layers
from keras import models
from tensorflow.keras.layers import Dense, Embedding,GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D
from tensorflow.keras.layers import Embedding

model_conv1 = Sequential([
    Embedding(vocab_size, 7, input_length=max_length),
  Conv1D(32, 3, activation='relu'),
    GlobalMaxPooling1D(),
  Dense(10, activation='relu'),
  Dense(1, activation='sigmoid')
])


In [101]:
model_conv1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [78]:
le = preprocessing.LabelEncoder()
y_train= le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
y_train.shape,y_test.shape


((799,), (200,))

In [102]:
history = model_conv1.fit(train_X_pad, trainY, epochs=20, validation_data=(test_X_pad, testY))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [96]:
pred_test_conv1 = model_conv1.predict(test_X_pad)
pred_test_conv1 = np.argmax(pred_test_conv1,axis=1)
#pred_test = pred_test.round()

#print evaluation metrics 
print(classification_report(testY,pred_test_conv1))
print(confusion_matrix(testY,pred_test_conv1))
print("Accuracy:",accuracy_score(testY, pred_test_conv1))

              precision    recall  f1-score   support

           0       0.10      1.00      0.18        30
           1       0.00      0.00      0.00        39
           2       0.00      0.00      0.00        31
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00        46
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        41

    accuracy                           0.10       300
   macro avg       0.01      0.14      0.03       300
weighted avg       0.01      0.10      0.02       300

[[30  0  0  0  0  0  0]
 [39  0  0  0  0  0  0]
 [31  0  0  0  0  0  0]
 [65  0  0  0  0  0  0]
 [46  0  0  0  0  0  0]
 [48  0  0  0  0  0  0]
 [41  0  0  0  0  0  0]]
Accuracy: 0.1


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


#ExplainerDashboard

In [109]:
df_columns = list({k: v for k, v in sorted(count_vect.vocabulary_.items(), key=lambda item: item[1])}.keys())
X_train_df = pd.DataFrame(trainX_vec.toarray(), columns = df_columns)
X_test_df = pd.DataFrame(testX_vec.toarray(), columns = df_columns)

In [110]:
X_train_df

Unnamed: 0,10,11,1862,1882,20,21st,25,30,abandon,abeyance,...,yield,yo,yonder,york,you,young,yous,youth,youthful,zeal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [111]:
#X_train_df.value_counts()

In [112]:
model_ex = LogisticRegression()
model_ex.fit(X_train_df,trainY)

LogisticRegression()

In [104]:
#!! pip install explainerdashboard

In [113]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

explainer = ClassifierExplainer(model_ex, X_test_df, testY,
  #labels=['v1'], # defaults to ['0', '1', etc]
)

db = ExplainerDashboard(explainer,title="Author Classifier Explainer",
    shap_interaction=False,
)
db.run(port=8050)

Note: model_output='probability' is currently not supported for linear classifiers models with shap. So defaulting to model_output='logodds' If you really need probability outputs use shap='kernel' instead.
Note: shap values for shap='linear' get calculated against X_background, but paramater X_background=None, so using X instead...
Generating self.shap_explainer = shap.LinearExplainer(model, X)...
Building ExplainerDashboard..
Detected google colab environment, setting mode='external'
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating predictions...
Calculating pred_per

<IPython.core.display.Javascript object>

#Bidirectional LSTM

In [None]:
max_features = 20000
maxlen = 200

In [None]:
from tensorflow import keras
inputs = keras.Input(shape=(None,), dtype="int32")
x = layers.Embedding(max_features, 128)(inputs)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


In [None]:
x_train = keras.preprocessing.sequence.pad_sequences(trainX, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(testX, maxlen=maxlen)

In [None]:
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val))