In [1]:
import os
import random
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import pickle
import numpy as np

In [2]:
stop_words = stopwords.words('english')

In [3]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
translator = str.maketrans('', '', punctuation)
a = 'salam$%^'
a.translate(translator)

'salam'

In [5]:
negative_documents = []
max_len_negative = 0
for file in os.listdir('data/pos'):
    with open('data/pos/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        negative_documents.append(' '.join(tokens))
        if len(tokens) > max_len_negative:
            max_len_negative = len(tokens)
len(negative_documents)

1000

In [6]:
max_len_negative

1693

In [7]:
positive_documents = []
max_len_positive = 0
for file in os.listdir('data/neg'):
    with open('data/neg/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        positive_documents.append(' '.join(tokens))
        if len(tokens) > max_len_positive:
            max_len_positive = len(tokens)
len(positive_documents)

1000

In [8]:
max_len_positive

1400

In [9]:
max_len = max(max_len_negative, max_len_positive)
max_len

1693

In [10]:
random.shuffle(negative_documents)

In [11]:
random.shuffle(positive_documents)

In [12]:
X_train = negative_documents[:800] + positive_documents[:800]

In [13]:
len(X_train)

1600

In [14]:
y_train = [0 for _ in range(800)] + [1 for _ in range(800)]

In [15]:
len(y_train)

1600

In [16]:
X_test = negative_documents[800:] + positive_documents[800:]

In [17]:
len(X_test)

400

In [18]:
y_test = [0 for _ in range(200)] + [1 for _ in range(200)]

In [19]:
len(y_test)

400

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Conv1D, MaxPool1D, Embedding, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import concatenate

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [22]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x2377d7ae400>

In [23]:
with open('tokenizer.h5', 'wb') as f:
    pickle.dump(tokenizer, f)

In [24]:
tokenizer.word_index

{'film': 1,
 'nt': 2,
 'movie': 3,
 'one': 4,
 'like': 5,
 'even': 6,
 'good': 7,
 'time': 8,
 'would': 9,
 'story': 10,
 'much': 11,
 'also': 12,
 'character': 13,
 'get': 14,
 'two': 15,
 'characters': 16,
 'first': 17,
 'see': 18,
 'way': 19,
 'well': 20,
 'make': 21,
 'could': 22,
 'really': 23,
 'films': 24,
 'little': 25,
 'people': 26,
 'life': 27,
 'plot': 28,
 'scene': 29,
 'man': 30,
 'bad': 31,
 'never': 32,
 'best': 33,
 'new': 34,
 'scenes': 35,
 'many': 36,
 'know': 37,
 'movies': 38,
 'great': 39,
 'love': 40,
 'another': 41,
 'us': 42,
 'action': 43,
 'director': 44,
 'seems': 45,
 'go': 46,
 'end': 47,
 'still': 48,
 'something': 49,
 'made': 50,
 'world': 51,
 'back': 52,
 'work': 53,
 'big': 54,
 'makes': 55,
 'however': 56,
 'every': 57,
 'though': 58,
 'audience': 59,
 'better': 60,
 'enough': 61,
 'around': 62,
 'seen': 63,
 'performance': 64,
 'take': 65,
 'role': 66,
 'real': 67,
 'gets': 68,
 'may': 69,
 'going': 70,
 'last': 71,
 'almost': 72,
 'look': 73,
 't

In [25]:
vocab_len = len(tokenizer.word_index) + 1

In [26]:
vocab_len

40607

In [27]:
encoded = tokenizer.texts_to_sequences(X_train)

In [28]:
encoded[0]

[3098,
 18196,
 15045,
 1791,
 1670,
 3304,
 5517,
 382,
 2651,
 23889,
 2386,
 15046,
 7859,
 4052,
 3910,
 135,
 12860,
 15047,
 1947,
 41,
 121,
 23890,
 593,
 2073,
 108,
 2502,
 3911,
 121,
 276,
 897,
 293,
 2228,
 401,
 5518,
 135,
 15,
 299,
 23891,
 6134,
 3304,
 5811,
 6488,
 18197,
 264,
 11399,
 23892,
 34,
 1502,
 4585,
 928,
 2387,
 4585,
 928,
 2387,
 3304,
 217,
 498,
 7860,
 2570,
 1791,
 1695,
 9274,
 8479,
 558,
 1462,
 1326,
 239,
 494,
 12861,
 4,
 19,
 929,
 1643,
 18198,
 5000,
 3304,
 380,
 8,
 335,
 4,
 128,
 425,
 83,
 1135,
 3,
 4053,
 73,
 15045,
 67,
 27,
 1064,
 262,
 382,
 2651,
 1148,
 44,
 653,
 217,
 105,
 4585,
 1124,
 499,
 64,
 3304,
 1919,
 11400,
 64,
 836,
 898,
 4795,
 4,
 1978,
 104,
 192,
 1,
 2651,
 1370,
 231,
 8480,
 3305,
 1260,
 1395,
 808,
 59,
 198,
 1846,
 73,
 3306,
 2652,
 20,
 3099,
 18199,
 2004,
 30,
 2653,
 5228,
 5229,
 67,
 27,
 2651,
 2653,
 218,
 630,
 10203,
 95,
 336,
 139,
 809,
 758,
 2280,
 23893,
 836,
 3,
 15045,
 4585

In [29]:
padded = pad_sequences(encoded, maxlen=max_len, padding='post')

In [30]:
padded.shape

(1600, 1693)

In [31]:
input1 = Input(shape=(max_len,))
embedding1 = Embedding(vocab_len, 100)(input1)
conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = MaxPool1D(pool_size=2)(drop1)
flat1 = Flatten()(pool1)
dense1 = Dense(10, activation='relu')(flat1)
output = Dense(1, activation='sigmoid')(dense1)


In [32]:
model = Model(inputs=[input1], outputs=output)

In [33]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1693)]            0         
                                                                 
 embedding (Embedding)       (None, 1693, 100)         4060700   
                                                                 
 conv1d (Conv1D)             (None, 1690, 32)          12832     
                                                                 
 dropout (Dropout)           (None, 1690, 32)          0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 845, 32)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 27040)             0         
                                                             

In [35]:
encoded_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(encoded_test, maxlen=max_len, padding='post')

In [36]:
padded_test.shape

(400, 1693)

In [37]:
model.fit([padded], np.array(y_train), epochs=10, batch_size=20, validation_data=([padded_test], np.array(y_test)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23708bad970>

In [38]:
model.save('textcnn.h5')

In [39]:
plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [40]:
from tensorflow.keras.models import load_model

In [41]:
model = load_model('textcnn.h5')

In [42]:
with open('tokenizer.h5', 'rb') as f:
    tokenizer = pickle.load(f)

In [43]:
tokenizer.word_index

{'film': 1,
 'nt': 2,
 'movie': 3,
 'one': 4,
 'like': 5,
 'even': 6,
 'good': 7,
 'time': 8,
 'would': 9,
 'story': 10,
 'much': 11,
 'also': 12,
 'character': 13,
 'get': 14,
 'two': 15,
 'characters': 16,
 'first': 17,
 'see': 18,
 'way': 19,
 'well': 20,
 'make': 21,
 'could': 22,
 'really': 23,
 'films': 24,
 'little': 25,
 'people': 26,
 'life': 27,
 'plot': 28,
 'scene': 29,
 'man': 30,
 'bad': 31,
 'never': 32,
 'best': 33,
 'new': 34,
 'scenes': 35,
 'many': 36,
 'know': 37,
 'movies': 38,
 'great': 39,
 'love': 40,
 'another': 41,
 'us': 42,
 'action': 43,
 'director': 44,
 'seems': 45,
 'go': 46,
 'end': 47,
 'still': 48,
 'something': 49,
 'made': 50,
 'world': 51,
 'back': 52,
 'work': 53,
 'big': 54,
 'makes': 55,
 'however': 56,
 'every': 57,
 'though': 58,
 'audience': 59,
 'better': 60,
 'enough': 61,
 'around': 62,
 'seen': 63,
 'performance': 64,
 'take': 65,
 'role': 66,
 'real': 67,
 'gets': 68,
 'may': 69,
 'going': 70,
 'last': 71,
 'almost': 72,
 'look': 73,
 't

In [None]:
model.predict()