In [1]:
import os
import random
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import pickle
import numpy as np

In [2]:
stop_words = stopwords.words('english')

In [3]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
translator = str.maketrans('', '', punctuation)
a = 'salam$%^'
a.translate(translator)

'salam'

In [5]:
negative_documents = []
max_len_negative = 0
for file in os.listdir('data/neg'):
    with open('data/neg/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        negative_documents.append(' '.join(tokens))
        if len(tokens) > max_len_negative:
            max_len_negative = len(tokens)
len(negative_documents)

1000

In [6]:
max_len_negative

1400

In [7]:
positive_documents = []
max_len_positive = 0
for file in os.listdir('data/pos'):
    with open('data/pos/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        positive_documents.append(' '.join(tokens))
        if len(tokens) > max_len_positive:
            max_len_positive = len(tokens)
len(positive_documents)

1000

In [8]:
max_len_positive

1693

In [9]:
max_len = max(max_len_negative, max_len_positive)
max_len

1693

In [10]:
random.shuffle(negative_documents)

In [11]:
random.shuffle(positive_documents)

In [12]:
X_train = negative_documents[:800] + positive_documents[:800]

In [13]:
len(X_train)

1600

In [14]:
y_train = [0 for _ in range(800)] + [1 for _ in range(800)]

In [15]:
len(y_train)

1600

In [16]:
X_test = negative_documents[800:] + positive_documents[800:]

In [17]:
len(X_test)

400

In [18]:
y_test = [0 for _ in range(200)] + [1 for _ in range(200)]

In [19]:
len(y_test)

400

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Conv1D, MaxPool1D, Embedding, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import concatenate

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [22]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x285b6f44280>

In [23]:
with open('tokenizer.h5', 'wb') as f:
    pickle.dump(tokenizer, f)

In [24]:
tokenizer.word_index

{'film': 1,
 'nt': 2,
 'movie': 3,
 'one': 4,
 'like': 5,
 'even': 6,
 'good': 7,
 'time': 8,
 'would': 9,
 'story': 10,
 'much': 11,
 'character': 12,
 'also': 13,
 'get': 14,
 'characters': 15,
 'two': 16,
 'first': 17,
 'see': 18,
 'well': 19,
 'way': 20,
 'could': 21,
 'make': 22,
 'really': 23,
 'films': 24,
 'people': 25,
 'little': 26,
 'plot': 27,
 'life': 28,
 'scene': 29,
 'bad': 30,
 'never': 31,
 'man': 32,
 'new': 33,
 'best': 34,
 'many': 35,
 'scenes': 36,
 'know': 37,
 'movies': 38,
 'love': 39,
 'another': 40,
 'great': 41,
 'director': 42,
 'something': 43,
 'action': 44,
 'end': 45,
 'us': 46,
 'go': 47,
 'seems': 48,
 'made': 49,
 'back': 50,
 'world': 51,
 'however': 52,
 'still': 53,
 'work': 54,
 'big': 55,
 'makes': 56,
 'every': 57,
 'audience': 58,
 'though': 59,
 'better': 60,
 'around': 61,
 'enough': 62,
 'real': 63,
 'seen': 64,
 'take': 65,
 'performance': 66,
 'role': 67,
 'going': 68,
 'gets': 69,
 'may': 70,
 'say': 71,
 'almost': 72,
 'think': 73,
 'l

In [25]:
vocab_len = len(tokenizer.word_index) + 1

In [26]:
vocab_len

40616

In [27]:
encoded = tokenizer.texts_to_sequences(X_train)

In [28]:
encoded[0]

[1338,
 103,
 434,
 133,
 57,
 104,
 2605,
 24,
 49,
 11424,
 58,
 787,
 3520,
 211,
 1151,
 3058,
 277,
 3769,
 2355,
 1515,
 18197,
 270,
 1304,
 350,
 22,
 1,
 55,
 249,
 93,
 144,
 690,
 183,
 1239,
 674,
 163,
 264,
 3145,
 507,
 168,
 4598,
 8504,
 6160,
 905,
 8505,
 350,
 266,
 1540,
 986,
 1371,
 117,
 798,
 1151,
 480,
 5546,
 2825,
 2606,
 108,
 3058,
 856,
 15038,
 172,
 72,
 1,
 149,
 3145,
 2826,
 2177,
 15039,
 334,
 1806,
 23837,
 579,
 447,
 2178,
 646,
 31,
 4376,
 675,
 1,
 42,
 11425,
 18198,
 2178,
 4794,
 11426,
 2533,
 2261,
 6892,
 2306,
 2827,
 481,
 2125,
 3146,
 140,
 595,
 1807,
 12891,
 7897,
 657,
 11,
 12892,
 1780,
 28,
 81,
 178,
 2534,
 47,
 1321,
 191,
 7898,
 67,
 2125,
 1440,
 3647,
 109,
 1099,
 1024,
 1322,
 3335,
 23838,
 780,
 23839,
 4794,
 54,
 124,
 225,
 8506,
 2975,
 3521,
 18199,
 1262,
 4042,
 88,
 1516,
 7368,
 16,
 12893,
 193,
 343,
 2356,
 1113,
 17,
 3647,
 15040,
 657,
 2976,
 2534,
 52,
 87,
 1,
 3147,
 1151,
 58,
 26,
 5032,
 12,


In [29]:
padded = pad_sequences(encoded, maxlen=max_len, padding='post')

In [30]:
padded.shape

(1600, 1693)

In [31]:
input1 = Input(shape=(max_len,))
embedding1 = Embedding(vocab_len, 100)(input1)
conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = MaxPool1D(pool_size=2)(drop1)
flat1 = Flatten()(pool1)

input2 = Input(shape=(max_len,))
embedding2 = Embedding(vocab_len, 100)(input1)
conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
drop2 = Dropout(0.5)(conv2)
pool2 = MaxPool1D(pool_size=2)(drop2)
flat2 = Flatten()(pool2)

input3 = Input(shape=(max_len,))
embedding3 = Embedding(vocab_len, 100)(input3)
conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
drop3 = Dropout(0.5)(conv3)
pool3 = MaxPool1D(pool_size=2)(drop3)
flat3 = Flatten()(pool3)

input4 = Input(shape=(max_len,))
embedding4 = Embedding(vocab_len, 100)(input4)
conv4 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding4)
drop4 = Dropout(0.5)(conv4)
pool4 = MaxPool1D(pool_size=2)(drop4)
flat4 = Flatten()(pool4)

conc = concatenate([flat1, flat2, flat3, flat4])

dens1 = Dense(10, activation='relu')(conc)
output = Dense(1, activation='sigmoid')(dens1)

In [32]:
model = Model(inputs=[input1, input2, input3, input4], outputs=output)

In [33]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1693)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 1693)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1693)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1693, 100)    4061600     ['input_1[0][0]']                
                                                                                              

In [35]:
encoded_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(encoded_test, maxlen=max_len, padding='post')

In [36]:
padded_test.shape

(400, 1693)

In [37]:
model.fit([padded, padded, padded, padded], np.array(y_train), epochs=10, batch_size=20, validation_data=([padded_test, padded_test, padded_test, padded_test], np.array(y_test)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x285c1e2f340>

In [38]:
model.save('textcnnMultiChannel.h5')

In [39]:
plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [40]:
from tensorflow.keras.models import load_model

In [41]:
model = load_model('textcnnMultiChannel.h5')

In [42]:
with open('tokenizer.h5', 'rb') as f:
    tokenizer = pickle.load(f)

In [43]:
tokenizer.word_index

{'film': 1,
 'nt': 2,
 'movie': 3,
 'one': 4,
 'like': 5,
 'even': 6,
 'good': 7,
 'time': 8,
 'would': 9,
 'story': 10,
 'much': 11,
 'character': 12,
 'also': 13,
 'get': 14,
 'characters': 15,
 'two': 16,
 'first': 17,
 'see': 18,
 'well': 19,
 'way': 20,
 'could': 21,
 'make': 22,
 'really': 23,
 'films': 24,
 'people': 25,
 'little': 26,
 'plot': 27,
 'life': 28,
 'scene': 29,
 'bad': 30,
 'never': 31,
 'man': 32,
 'new': 33,
 'best': 34,
 'many': 35,
 'scenes': 36,
 'know': 37,
 'movies': 38,
 'love': 39,
 'another': 40,
 'great': 41,
 'director': 42,
 'something': 43,
 'action': 44,
 'end': 45,
 'us': 46,
 'go': 47,
 'seems': 48,
 'made': 49,
 'back': 50,
 'world': 51,
 'however': 52,
 'still': 53,
 'work': 54,
 'big': 55,
 'makes': 56,
 'every': 57,
 'audience': 58,
 'though': 59,
 'better': 60,
 'around': 61,
 'enough': 62,
 'real': 63,
 'seen': 64,
 'take': 65,
 'performance': 66,
 'role': 67,
 'going': 68,
 'gets': 69,
 'may': 70,
 'say': 71,
 'almost': 72,
 'think': 73,
 'l

In [44]:
txt = '''
boy , what a great movie ! ! 
keanu reeves and morgan freeman acting together , the director of the fugitive ( andrew davis ) back again to give us another thriller , and the beautiful rachel stealing beauty weisz thrown in to boot . 
how could this not be a blockbuster ? 
all die-hard keanu reeves fans , read on . 
ol " much ado about nothing " plays eddie kasalivich , a machinist studying at the university of chicago . 
to help pay for the rent , he takes on this job making the machinery for a hydrogen project being conducted by the university . 
by happy coincidence , he also happens to stumble on the solution to the final problem and thus is the only one who knows the key to performing this feat of miracle physics . 
this project holds great promise : taking hydrogen from water and giving out more energy than is put in . 
a potential solution to the earth's energy problems without the pollution cost . 
surely nobody could have any problems with that ? 
morgan freeman is paul shannon , the project's sponsor . 
he works for a very powerful organization that disagrees with the paternal project leader as to how quickly technology should be released to the public . 
he figures the world will disintegrate into anarchy if the results of the project are released too quickly . 
so he murders the project leader , blows up the project ( great but short scene here , sort of like a mini id4 city-devastation thing ) and tries to simulate the experiment at some other hi-tech hush-hush location . 
unfortunately , our intrepid machinist and an english physicist ( weisz ) manage to get away and now follows a fugitive-like chase using a not too dissimilar rehash of that movie script . 
this movie is just dying for a comparison with " the fugitive " . 
both movies use chicago as the main city and since the place doesn't change much , i guess we can't really blame the setting for the paucity of atmosphere . 
the trouble here is that where the fugitive had harrison ford and tommy lee jones , chain reaction only has keanu reeves and fred ward . 
credit to them , but we aren't really given much of a chance to empathise with the characters . 
where ford was able to work within the confines of the movie to evoke sympathy , eddie kasalivich just doesn9t seem very believeable and reeves' character is never given the time to develop . 
it might have been a better investment in film to give the characters more depth and spend less time on the chase sequences , which frankly , get quite boring after a while . 
harrison ford was " the man against the world . " 
he was alone in a world where he didn't know who to trust and it came across real well . 
in chain reaction , keanu reeves isn't alone . 
now that would be fine if the fleeing couple had some chemistry and could really portray some paranoia , vulnerability and confusion . 
we don't get this . 
we get him thinking he's still in speed , only now our sandra bullock has an english accent , probably doesn't drive a bus , and hardly contributes anything to the movie . 
the producers here probably thought , " hey , what if keanu and rachel don't hit it off too well ? 
let's rope in that morgan to help us out . " 
well , keanu and rachel didn't hit it off well on the screen , and most unfortunately , morgan freeman doesn't help much either . 
the only thing we come to really know of paul shannon is that he always has a full load of cigars in his cigar holder . 
it's not really his fault . 
once again , andrew davis just doesn't take the time to build his characters . 
someone must have convinced him that this time round , cinema dollars are best earned by making the movie run like a headless chicken . 
the flying inkpot rating system : * wait for the tv2 broadcast . 
 * * a little creaky , but still better than staying at home with gotcha ! 
 * * * pretty good , bring a friend . 
 * * * * amazing , potent stuff . 
 * * * * * perfection . 
see it twice . 
'''

In [46]:
encoded_doc = tokenizer.texts_to_sequences([txt])

In [47]:
padded_doc = pad_sequences(encoded_doc, maxlen=max_len, padding='post')

In [48]:
padded_doc

array([[313,  41,   3, ...,   0,   0,   0]])

In [49]:
model.predict([padded_doc, padded_doc, padded_doc, padded_doc])[0][0]



0.01420637

In [50]:
if model.predict([padded_doc, padded_doc, padded_doc, padded_doc])[0][0] < 0.5:
    print('Negative')
else:
    print('Positive')

Negative
