In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# used by the cells below are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# create a vocabulary


In [None]:
import nltk
#nltk.download()

In [None]:
# Fetch the NLTK stop-word corpus that clean_doc reads through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


In [None]:
#functions
# load doc into memory
def load_doc(filename, encoding=None):
  """Read a whole text file into memory.

  Args:
    filename: path of the file to read.
    encoding: text encoding handed to open(); None keeps the platform default.

  Returns:
    The complete contents of the file as one string.
  """
  # the context manager closes the handle even when read() raises
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
  """Tokenise a raw document and return the words worth counting.

  The text is split on whitespace and punctuation characters are stripped
  from every token. A token is kept only if it is purely alphabetic, is not
  an NLTK English stop word, and is longer than one character.

  Returns:
    list of surviving tokens in document order.
  """
  strip_punc = str.maketrans('', '', string.punctuation)
  english_stops = set(stopwords.words('english'))
  kept = []
  for piece in doc.split():
    word = piece.translate(strip_punc)
    # alphabetic, not a stop word, and not a single letter
    if word.isalpha() and word not in english_stops and len(word) > 1:
      kept.append(word)
  return kept

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
  """Count the cleaned tokens of one file into `vocab` (updated in place)."""
  vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab):
  """Add every training file found in `directory` to `vocab`.

  Files whose name starts with 'cv9' are skipped: they form the test set.
  """
  training_files = [name for name in listdir(directory) if not name.startswith('cv9')]
  for name in training_files:
    add_doc_to_vocab(directory + '/' + name, vocab)

def save_list(lines, filename, encoding=None):
  """Write `lines` to `filename`, one item per line.

  Args:
    lines: iterable of strings.
    filename: destination path; an existing file is overwritten.
    encoding: text encoding handed to open(); None keeps the platform default.
  """
  # the context manager flushes and closes the file even when write() raises
  with open(filename, 'w', encoding=encoding) as file:
    file.write('\n'.join(lines))



In [None]:
# define vocab: a Counter mapping each cleaned token to its number of occurrences
vocab = Counter()
# add all docs to vocab (process_docs skips file names starting with 'cv9')
#txt_sentoken = '/content/drive/MyDrive/txt_sentoken'
process_docs('/content/drive/MyDrive/txt_sentoken/pos', vocab)
process_docs('/content/drive/MyDrive/txt_sentoken/neg', vocab)
# print the size of the vocab
print(len(vocab))

44276


In [None]:
#vocab


In [None]:
# keep only tokens that occur at least `min_occurane` times (drops tokens seen once)
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
# save tokens to a vocabulary file, one token per line
save_list(tokens, '/content/drive/MyDrive/txt_sentoken/vocab.txt')


25767


# Train a CNN


In [None]:
import string
import re
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D


In [None]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
  """Reduce a document to the words present in `vocab`.

  Splits on whitespace, strips punctuation characters from each token and
  returns the in-vocabulary tokens joined by single spaces.
  """
  strip_punc = str.maketrans('', '', string.punctuation)
  stripped = (piece.translate(strip_punc) for piece in doc.split())
  return ' '.join(word for word in stripped if word in vocab)


# load all docs in a directory
def process_docs(directory, vocab, is_train):
  """Load and clean the train or test files of one directory.

  Args:
    directory: folder holding one document per file.
    vocab: collection of allowed tokens, forwarded to clean_doc.
    is_train: when true, every file except 'cv9*' is used; otherwise only
      the 'cv9*' files are used.

  Returns:
    list with one cleaned, space-joined string per selected file.
  """
  cleaned = []
  for name in listdir(directory):
    held_out = name.startswith('cv9')
    # skip held-out files when training and non-held-out files when testing
    if held_out == bool(is_train):
      continue
    cleaned.append(clean_doc(load_doc(directory + '/' + name), vocab))
  return cleaned

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
  """Return (docs, labels) for the train or test split.

  Negative reviews come first with label 0, positive reviews follow with label 1.
  """
  negatives = process_docs('/content/drive/MyDrive/txt_sentoken/neg', vocab, is_train)
  positives = process_docs('/content/drive/MyDrive/txt_sentoken/pos', vocab, is_train)
  labels = array([0] * len(negatives) + [1] * len(positives))
  return negatives + positives, labels

# fit a tokenizer
def create_tokenizer(lines):
  """Return a Keras Tokenizer fitted on the given texts."""
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
  """Integer-encode `docs` with `tokenizer` and pad each sequence at the end to `max_length`."""
  return pad_sequences(tokenizer.texts_to_sequences(docs), maxlen=max_length, padding='post')


In [None]:
# define the model
def define_model(vocab_size, max_length):
  """Build, compile and summarise the single-channel CNN classifier.

  Args:
    vocab_size: number of rows in the embedding table.
    max_length: length of every padded input sequence.

  Returns:
    The compiled Sequential model. As side effects the summary is printed
    and a diagram is written to 'model.png'.
  """
  stack = [
    Embedding(vocab_size, 100, input_length=max_length),
    Conv1D(filters=32, kernel_size=12, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    # single sigmoid unit: output is read as the probability of the positive class
    Dense(1, activation='sigmoid'),
  ]
  model = Sequential()
  for layer in stack:
    model.add(layer)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model


In [None]:
# load the vocabulary
vocab_filename = '/content/drive/MyDrive/txt_sentoken/vocab.txt'
vocab = load_doc(vocab_filename)
# a set gives constant-time membership tests inside clean_doc
vocab = set(vocab.split())

# training split: every file except those starting with 'cv9'
train_docs, ytrain = load_clean_dataset(vocab, True)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# define vocabulary size
# +1: presumably because Keras word indices start at 1 and 0 is used for padding; confirm
vocab_size = len(tokenizer.word_index) + 1

print('Vocabulary size: %d' % vocab_size)
# calculate the maximum sequence length
# NOTE(review): a later cell defines a function that is also named `max_length`;
# once that cell runs, this integer is no longer reachable under this name.
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)
# encode data
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# define model
model = define_model(vocab_size, max_length)
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# save the model
#model.save('model.h5')


Vocabulary size: 25768
Maximum length: 1317
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1317, 100)         2576800   
                                                                 
 conv1d_1 (Conv1D)           (None, 1306, 32)          38432     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 653, 32)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 20896)             0         
                                                                 
 dense_3 (Dense)             (None, 10)                208970    
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
          

<keras.callbacks.History at 0x7f1fcc5678e0>

array([  30, 2806,  325, ...,    0,    0,    0], dtype=int32)

In [None]:
# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, max_length, model):
  """Classify one review as negative or positive.

  Args:
    review: raw review text.
    vocab: allowed tokens, forwarded to clean_doc.
    tokenizer: fitted tokenizer used to integer-encode the review.
    max_length: padded sequence length expected by `model`.
    model: trained classifier whose single output is the positive-class score.

  Returns:
    (score, label): the score of the winning class and 'NEGATIVE' or 'POSITIVE'.
  """
  cleaned = clean_doc(review, vocab)
  batch = encode_docs(tokenizer, max_length, [cleaned])
  prob_positive = model.predict(batch, verbose=0)[0,0]
  if round(prob_positive) != 0:
    return prob_positive, 'POSITIVE'
  # report the complement so the number always refers to the returned label
  return (1-prob_positive), 'NEGATIVE'


In [None]:
# test split: only the files whose name starts with 'cv9'
test_docs, ytest = load_clean_dataset(vocab, False)

# encode with the tokenizer and length fitted on the training data
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

Test Accuracy: 85.00


In [None]:
# test positive text
text = 'Everyone will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = 'averge . you can watch all by yourself.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Everyone will enjoy this film. I love it, recommended!]
Sentiment: NEGATIVE (51.207%)
Review: [averge . you can watch all by yourself.]
Sentiment: NEGATIVE (51.860%)


# N-gram CNN model


In [None]:
# turn a doc into clean tokens
def clean_doc(doc):
  """Clean a raw document and return it as one space-joined string.

  The text is split on whitespace and punctuation characters are stripped
  from every token. A token is kept only if it is purely alphabetic, is not
  an NLTK English stop word, and is longer than one character.
  """
  strip_punc = str.maketrans('', '', string.punctuation)
  english_stops = set(stopwords.words('english'))
  words = (piece.translate(strip_punc) for piece in doc.split())
  return ' '.join(
    w for w in words
    if w.isalpha() and w not in english_stops and len(w) > 1
  )


# load all docs in a directory
def process_docs(directory, is_train):
  """Load and clean the train or test files of one directory.

  When `is_train` is true every file except 'cv9*' is used; otherwise only
  the 'cv9*' files are used. Returns one cleaned string per selected file.
  """
  documents = []
  for name in listdir(directory):
    in_test_split = name.startswith('cv9')
    wanted = (not in_test_split) if is_train else in_test_split
    if wanted:
      documents.append(clean_doc(load_doc(directory + '/' + name)))
  return documents

# load and clean a dataset
def load_clean_dataset(is_train):
  """Return (docs, labels) for the train or test split.

  Negative reviews come first with label 0, positive reviews follow with label 1.
  """
  negatives = process_docs('/content/drive/MyDrive/txt_sentoken/neg',  is_train)
  positives = process_docs('/content/drive/MyDrive/txt_sentoken/pos',  is_train)
  labels = array([0] * len(negatives) + [1] * len(positives))
  return negatives + positives, labels

# fit a tokenizer
def create_tokenizer(lines):
  """Fit a fresh Keras Tokenizer on `lines` and return it."""
  text_tokenizer = Tokenizer()
  text_tokenizer.fit_on_texts(lines)
  return text_tokenizer

# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
  """Turn `docs` into integer sequences padded at the end to `max_length`."""
  sequences = tokenizer.texts_to_sequences(docs)
  return pad_sequences(sequences, maxlen=max_length, padding='post')


In [None]:
from pickle import dump
def save_dataset(dataset, filename):
  """Pickle `dataset` to `filename` and print a confirmation line.

  Args:
    dataset: any picklable object.
    filename: destination path; an existing file is overwritten.
  """
  # close the handle deterministically instead of leaving it to the garbage collector
  with open(filename, 'wb') as handle:
    dump(dataset, handle)
  print('Saved: %s' % filename)
# load and clean all reviews
train_docs, ytrain = load_clean_dataset(is_train=True)
test_docs, ytest = load_clean_dataset(is_train = False)
# save training and test datasets (relative paths: written to the working directory)
save_dataset([train_docs, ytrain], 'train.pkl')
save_dataset([test_docs, ytest], 'test.pkl')


Saved: train.pkl
Saved: test.pkl


In [None]:
from pickle import load
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import concatenate


In [None]:

# load a clean dataset
def load_dataset(filename):
  """Unpickle and return the object stored in `filename`.

  NOTE: unpickling can execute arbitrary code, so only load files that this
  notebook (or another trusted source) produced.
  """
  # the context manager closes the handle even when unpickling fails
  with open(filename, 'rb') as handle:
    return load(handle)
# fit a tokenizer
def create_tokenizer(lines):
  """Build a Keras Tokenizer and fit its word index on `lines`."""
  word_tokenizer = Tokenizer()
  word_tokenizer.fit_on_texts(lines)
  return word_tokenizer
# calculate the maximum document length
def max_length(lines):
  """Return the largest whitespace-separated token count among `lines`."""
  return max(map(lambda text: len(text.split()), lines))
# encode a list of lines
def encode_text(tokenizer, lines, length):
  """Integer-encode `lines` with `tokenizer` and pad each sequence at the end to `length`."""
  return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

In [None]:
# define the model
def _ngram_channel(length, vocab_size, kernel_size):
  """Build one input -> embedding -> conv -> dropout -> pooling -> flatten branch."""
  inputs = Input(shape=(length,))
  embedded = Embedding(vocab_size, 100)(inputs)
  conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
  dropped = Dropout(0.5)(conv)
  pooled = MaxPooling1D(pool_size=2)(dropped)
  return inputs, Flatten()(pooled)

def define_model(length, vocab_size, kernel_sizes=(4, 6, 8)):
  """Build, compile and summarise the multi-channel CNN classifier.

  Each channel has its own input and embedding and differs only in the
  width of its convolution kernel.

  Args:
    length: length of every padded input sequence.
    vocab_size: number of rows in each embedding table.
    kernel_sizes: one convolution kernel width per channel; the default
      gives three channels with kernels 4, 6 and 8.

  Returns:
    The compiled model. It expects one input array per channel, in the
    order of `kernel_sizes`. As side effects the summary is printed and a
    diagram is written to 'model.png'.
  """
  # channels are built one after another
  channels = [_ngram_channel(length, vocab_size, size) for size in kernel_sizes]
  channel_inputs = [inputs for inputs, _ in channels]
  channel_outputs = [flat for _, flat in channels]
# merge
  merged = concatenate(channel_outputs)
# interpretation
  dense1 = Dense(10, activation='relu')(merged)
  outputs = Dense(1, activation='sigmoid')(dense1)
  model = Model(inputs=channel_inputs, outputs=outputs)
# compile
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize
  model.summary()
  plot_model(model, show_shapes=True, to_file='model.png')
  return model


In [None]:
# load training dataset
trainLines, trainLabels = load_dataset('train.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print('Max document length: %d' % length)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
# define model
model = define_model(length, vocab_size)
# fit model: the same encoded text is fed to each of the three input channels.
# Training must run here; otherwise the evaluation cell scores randomly
# initialised weights.
model.fit([trainX,trainX,trainX], trainLabels, epochs=7, batch_size=16)
# save the model
#model.save('model.h5')

Max document length: 1380
Vocabulary size: 44277
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 1380, 100)    4427700     ['input_7[0][0]']                
                                           

In [None]:
# load test dataset
testLines, testLabels = load_dataset('test.pkl')
# reuse the tokenizer and length computed from the training data

testX = encode_text(tokenizer, testLines, length)

# the model has three inputs, so the same encoded text is passed three times
_, acc = model.evaluate([testX,testX,testX], testLabels, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))


Test Accuracy: 49.50


In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# TF-Hub text-embedding module (nnlm-en-dim128) wrapped as a Keras layer:
# scalar string inputs, embed_size-dimensional outputs, weights frozen.
# NOTE(review): hub_layer is not referenced by any later cell visible here; confirm it is still needed.
module_url = 'https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1'
embed_size = 128
trainable = False
hub_layer = hub.KerasLayer(module_url,input_shape=[],output_shape=[embed_size],dtype = tf.string,trainable = trainable)

# Character based LSTM (LM) (character based neural language modeling)


In [None]:
!pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [None]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras_preprocessing.sequence import pad_sequences


In [None]:

#functions that we are gonna use in our later code

def load_doc(filename, encoding=None):
  """Read a whole text file and return its contents as one string.

  Args:
    filename: path of the file to read.
    encoding: text encoding handed to open(); None keeps the platform default.
  """
  # the context manager closes the handle even when read() raises
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()

def save_doc(lines, filename, encoding=None):
  """Write `lines` to `filename`, one item per line.

  Args:
    lines: iterable of strings.
    filename: destination path; an existing file is overwritten.
    encoding: text encoding handed to open(); None keeps the platform default.
  """
  # the context manager flushes and closes the file even when write() raises
  with open(filename, 'w', encoding=encoding) as file:
    file.write('\n'.join(lines))



def build_model(X, n_classes=None):
  """Build and compile the character-level LSTM language model.

  Args:
    X: one-hot encoded input array; X.shape[1] is the sequence length and
      X.shape[2] the width of the one-hot encoding.
    n_classes: size of the softmax output. Defaults to X.shape[2], so the
      model no longer depends on whatever a global `vocab_size` happens to
      hold when this function is called.

  Returns:
    The compiled Sequential model; its summary is printed as a side effect.
  """
  if n_classes is None:
    n_classes = X.shape[2]
  model = Sequential()
  model.add(LSTM(75,input_shape= (X.shape[1],X.shape[2])))
  model.add(Dense(n_classes,activation = 'softmax'))

  model.compile(loss = 'categorical_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])
  model.summary()

  return model


In [None]:
# read the raw training text for the character-level model
text = load_doc('/content/New Text Document.txt')
#print(text)

In [None]:
# collapse every run of whitespace (including newlines) into a single space
tokens = text.split()
raw_text = ' '.join(tokens)
print(raw_text)

তবুও আমি ভাল থাকি , আকাশ থাকে যেমন, মেঘ ডাকে অজগুবি কে ডাকে অমনে


In [None]:
# slide a window over raw_text: each sequence holds `length` input characters
# followed by the one character to predict (length + 1 characters in total)
length = 10
sequences = []
for i in range(length,len(raw_text)):
  seq = raw_text[i-length:i+1]
  sequences.append(seq)


In [None]:
# persist the sequences, one per line (relative path: written to the working directory)
outfile = 'saved_text'
save_doc(sequences,outfile)

In [None]:
#encode input sequence

# NOTE(review): the sequences were written to the relative path 'saved_text' but are
# read back from '/content/saved_text'; this assumes the working directory is /content. Confirm.
inp_file = '/content/saved_text'
data = load_doc(inp_file)
lines = data.split('\n')



In [None]:
# character -> integer index, assigned in sorted order of the distinct characters in `data`
# NOTE(review): `data` is the newline-joined file, so whenever more than one sequence
# was saved the '\n' separator receives an index as well.
chars = sorted(list(set(data)))
mapping = dict((c,i) for i, c in enumerate(chars))



In [None]:
# integer-encode every sequence, character by character
encoded_sequence = []

for line in lines:
  en_sq = [mapping[char] for char in line]
  encoded_sequence.append(en_sq)

print('length of vacab is ' + str(len(mapping)) )





length of vacab is 24


In [None]:
# number of distinct characters; used below as the width of the one-hot encoding
vocab_size = len(mapping)


In [None]:
import numpy as np
# convert to an array so it can be sliced by column below
# (assumes every encoded line has the same length; TODO confirm)
encoded_sequence = np.array(encoded_sequence)

In [None]:
# inputs: every index of a sequence except the last; target: the last index
X = encoded_sequence[:,:-1]
y = encoded_sequence[:,-1]

In [None]:


# one-hot encode inputs and targets over vocab_size classes
tr_x = np.array([to_categorical(x, num_classes = vocab_size ) for x in X])
tr_y = to_categorical(y,num_classes = vocab_size)

In [None]:
# build the LSTM; build_model takes its input shape from tr_x
model = build_model(tr_x)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 75)                30000     
                                                                 
 dense (Dense)               (None, 24)                1824      
                                                                 
Total params: 31,824
Trainable params: 31,824
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train the character model on the one-hot encoded sequences
model.fit(tr_x,tr_y,epochs = 150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x79dabcbf1a50>

In [None]:
def generate_seq(model, mapping, seq_length, seed, n_chars):
  """Extend `seed` with `n_chars` characters predicted one at a time.

  Args:
    model: object whose predict() returns one score per character index
      for the one-hot encoded window.
    mapping: dict from character to integer index.
    seq_length: number of characters fed to the model at each step.
    seed: starting text; every character must be a key of `mapping`.
    n_chars: number of characters to append.

  Returns:
    The seed followed by the generated characters.
  """
  import numpy as np
  generated = seed
  for _step in range(n_chars):
    # integer-encode everything produced so far
    indices = [mapping[ch] for ch in generated]
    # pad_sequences brings the encoding to seq_length, truncating from the front
    window = pad_sequences([indices], maxlen=seq_length, truncating='pre')
    one_hot = to_categorical(window, num_classes=len(mapping))
    scores = model.predict(one_hot)
    best = np.argmax(scores, axis=1)
    # first character whose index equals the arg-max; nothing is appended if none matches
    generated += next((ch for ch, idx in mapping.items() if idx == best), '')
  return generated


In [None]:
# generate 3 characters after a seed word
print(generate_seq(model, mapping, 10, 'যেমন', 3))
# test mid-line
#print(generate_seq(model, mapping, 10, 'king was i', 20))
# test not in original
#print(generate_seq(model, mapping, 10, 'hello worl', 20))

যেমনে  


In [None]:
# NOTE(review): generate_seq looks every seed character up in `mapping` and raises
# KeyError for unknown ones. The text printed earlier in this section is Bengali,
# so this English seed presumably belongs to a run on a different training text; confirm.
print(generate_seq(model, mapping, 10, 'sing a son', 20))


sing a song of sixpence, A poc


In [None]:
# seed of exactly 10 characters, equal to the seq_length passed in
print(generate_seq(model, mapping, 10, 'king was i', 20))


king was in his counting house


In [None]:
# seed shorter than seq_length (5 characters); generate_seq pads it via pad_sequences
print(generate_seq(model, mapping, 10, 'i lo ', 20))

i lo ke pe. Foffo  theepi
