## How to Develop an N-gram Multichannel Convolution neural Network for Sentiment Analysis
## (Text classification)

A standard deep learning model for text classifiction and sentiment analysis used a word embedding layer and one-dimensional convolution
neural network

This model can be expanded by using multiple parallel convolution neural network that read the source document using different kernel sizes.
This , in effect creates a multichannel convolution neural network for text that reads with different n-gram sizes (group of words)

first we will remove stop words from the data 

In [1]:
from nltk.corpus import stopwords
import string

first we will clean on movie review

In [34]:
def load_doc(filename):
    with open(filename,'r') as f:
        text = f.read()
    return text

In [37]:
def clean_doc(doc):
    tokens = doc.split()
    
    table = str.maketrans('','',string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    tokens = [w for w in tokens if w.isalpha()]
    
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if not w in stop_words]
    
    tokens = [w for w in tokens if len(w)>1]
#     tokens = ''.join(tokens)
    return tokens

In [39]:
filename = "txt_sentoken/pos/cv000_29590.txt"
text = load_doc(filename)
tokens = clean_doc(text)
# print(tokens)

In [40]:
from os import listdir
from pickle import dump

def process_docs(directory,is_train):
    documents = list()
    
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + "/" + filename
        doc = load_doc(path)
        tokens = clean_doc(doc)
        documents.append(tokens)
    return documents

In [41]:
def save_data(dataset,filename):
    dump(dataset,open(filename,'wb'))
    print('Saved: {}'.format(filename))

In [42]:
negative_docs = process_docs('txt_sentoken/neg',True)
positive_docs = process_docs('txt_sentoken/pos',True)
trainX = negative_docs + positive_docs
trainy = [0 for _ in range(900)] + [1 for _ in range(900)]
save_data([trainX,trainy],'train.pkl')

negative_docs = process_docs('txt_sentoken/neg',False)
positive_docs = process_docs('txt_sentoken/pos',False)

testX = negative_docs + positive_docs
testy = [0 for _ in range(100)] + [1 for _ in range(100)]

save_data([testX,testy],'test.pkl')



Saved: train.pkl
Saved: test.pkl



## Now let us develop a multichannel model 

this section is divided into two parts

1. Encode Data
2. Define Model


In [43]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate


In [44]:
def load_dataset(filename):
    return load(open(filename,'rb'))

In [45]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [50]:
def max_length(lines):
    return max([len(s) for s in lines])


In [47]:
def encode_text(tokenizer,lines,length):
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

**embedding** : keras provides an special layer called embedding layer which gives each word an unique number to identify the word



In [57]:
def define_model(length, vocab_size):
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size,100)(inputs1)
    conv1 = Conv1D(filters=32,kernel_size=4,activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size,100)(inputs2)
    conv2 = Conv1D(filters=32,kernel_size=6,activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size,100)(inputs3)
    conv3 = Conv1D(filters=32,kernel_size=8,activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    merged = concatenate([flat1, flat2, flat3])
    
    dense1 = Dense(10,activation='relu')(merged)
    output = Dense(1, activation='sigmoid')(dense1)
    
    model = Model(inputs=[inputs1,inputs2,inputs3],outputs=output)
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    print(model.summary())
    plot_model(model,show_shapes=True,to_file='multichannel.png')
    
    return model

In [65]:
trainlines, trainlabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')
tokenizer = create_tokenizer(trainlines)

length = max_length(trainlines)
vocab_size = len(tokenizer.word_index) + 1

trainX = encode_text(tokenizer,trainlines,length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape)

(1800, 1380)


In [66]:

# define model
model = define_model(length, vocab_size)
# fit model
model.fit([trainX,trainX,trainX], array(trainLabels), epochs=10, batch_size=16)
# save the model
model.save('model.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 1380, 100)    4427700     input_20[0][0]                   
__________________________________________________________________________________________________
embedding_

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.