In [19]:
import os
import re
import nltk
import gensim
import string
import random
import tarfile
import numpy as np
from nltk.corpus import stopwords
from keras.models import Sequential
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score
from keras.initializers import Constant
from nltk.tokenize import word_tokenize
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
path = "/content/drive/MyDrive/Files/aclImdb_v1.tar.gz"
my_tar = tarfile.open(path)
my_tar.extractall('./')
my_tar.close()

In [3]:
pos_dst = '/content/aclImdb/train/pos'
neg_dst = '/content/aclImdb/train/neg'

# No of positive samples
no_ps = len(os.listdir(pos_dst))

# No of negative samples
no_ns = len(os.listdir(neg_dst))

print("No of positive samples:", no_ps)
print("No of negative samples:", no_ns)

No of positive samples: 12500
No of negative samples: 12500


In [4]:
train_dict = {"pos":[],"neg":[]}

list_pos = os.listdir(pos_dst)
list_neg = os.listdir(neg_dst)

for pos,neg in zip(list_pos,list_neg):
    
    pos_f = open(os.path.join(pos_dst,pos),"r")
    neg_f = open(os.path.join(neg_dst,neg),"r")
    
    # Read the text
    pos_txt = pos_f.read()
    neg_txt = neg_f.read()
    
    pos_f.close()
    neg_f.close()
    
    train_dict["pos"].append(pos_txt)
    train_dict["neg"].append(neg_txt) 

In [None]:
pos_dst_test = "/content/aclImdb/test/pos"
neg_dst_test = "/content/aclImdb/test/neg"

list_pos = os.listdir(pos_dst_test)
list_neg = os.listdir(neg_dst_test)

for pos,neg in zip(list_pos,list_neg):
    
    pos_f = open(os.path.join(pos_dst_test,pos),"r")
    neg_f = open(os.path.join(neg_dst_test,neg),"r")
    
    # Read the text
    pos_txt = pos_f.read()
    neg_txt = neg_f.read()
    
    pos_f.close()
    neg_f.close()
    
    train_dict["pos"].append(pos_txt)
    train_dict["neg"].append(neg_txt)

In [5]:
X = train_dict["pos"]+train_dict["neg"]
Y = np.append(np.ones((len(train_dict["pos"]),1)),
                       np.zeros((len(train_dict["neg"]), 1)),
                       axis=0)

In [6]:
def process(txt):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # Removing hyperlinks
    txt = re.sub(r'http\S+', '', txt)
    
    # Removing Line breaks
    txt = re.sub(r'<br />', ' ', txt)
    
    # Only removing the hash # sign from the word
    txt = re.sub(r'#','', txt)
    
    # Removing text.text types
    match = re.search(r'[a-zA-Z]*\.[a-zA-Z]*', txt)
    while(match != None):
        replace = " ".join((txt[match.start():match.end()].split(".")))
        txt = re.sub(r'[a-zA-Z]*\.[a-zA-Z]*',replace, txt, 1)
        match = re.search(r'[a-zA-Z]*\.[a-zA-Z]*', txt)
    
    # Removing special characters and numbers
    pattern = r'[^a-zA-z\s]'
    txt = re.sub(pattern, ' ', txt)
    
    txt_tokens = nltk.word_tokenize(txt)  
    
    clean_txt = []
    
    for word in txt_tokens:
        word = word.lower()
        word = word.strip(" ")
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            
            stem_word = stemmer.stem(word.strip('_'))# stemming word
            clean_txt.append(stem_word)
    
    
    
    return clean_txt

def Process_List(List):
  P_list = []
  for item in List:
    P_list.append(process(item))
  return P_list

In [7]:
X_processed = Process_List(X)

In [8]:
max_length = max(len(s) for s in X_processed)
max_length

1422

In [9]:
EMBEDDING_DIM = 100
model = gensim.models.Word2Vec(sentences=X_processed, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)

# vocab size
words = list(model.wv.vocab)
print( "Vocabulary size: %d" % len(words))

In [None]:
stemmer = PorterStemmer()
model.wv.most_similar(stemmer.stem("horrible"))

  if np.issubdtype(vec.dtype, np.int):


[('terribl', 0.9086008667945862),
 ('atroci', 0.798699140548706),
 ('horrend', 0.7883786559104919),
 ('aw', 0.7870608568191528),
 ('horrid', 0.7560380697250366),
 ('pathet', 0.7318055629730225),
 ('lousi', 0.7211521863937378),
 ('dread', 0.7113169431686401),
 ('bad', 0.6869966983795166),
 ('laughabl', 0.6846331357955933)]

In [None]:
model.wv.most_similar_cosmul(positive=['woman','king'],negative=['man'])

[('princess', 0.8513698577880859),
 ('queen', 0.8470419049263),
 ('agatha', 0.8260659575462341),
 ('chatterley', 0.8145774006843567),
 ('princ', 0.8107573390007019),
 ('carmen', 0.8095245957374573),
 ('elleri', 0.8093214631080627),
 ('changxin', 0.8087121844291687),
 ('godmoth', 0.8040111064910889),
 ('jerol', 0.8036038279533386)]

In [None]:
filename = '/content/drive/MyDrive/Files/IMDB_50k_Embedding_Word2Vec.txt'
model.wv.save_word2vec_format(filename,binary=False)

In [None]:
embeddings_index = {} # Word:Embedding Dictionary

f = open(os.path.join('', filename), encoding = "utf-8")

for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:])
  embeddings_index[word] = coefs
f.close()

In [10]:
# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(X_processed)
sequences = tokenizer_obj.texts_to_sequences(X_processed)

# pad sequences
word_index = tokenizer_obj.word_index
print("Found %s unique tokens." % len(word_index))

review_pad = pad_sequences(sequences, maxlen=max_length)
print('Shape of review tensor:', review_pad.shape)

Found 50106 unique tokens.
Shape of review tensor: (25000, 1422)


In [11]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM) )
for word, i in word_index.items():
  if i > num_words:
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    # words not found in embedding index will be all-zeros.
    embedding_matrix[i] = embedding_vector

In [None]:
# define model
model = Sequential()

embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM, 
                            embeddings_initializer=Constant(embedding_matrix), 
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model .compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(review_pad, Y, test_size=0.25, shuffle=True,random_state=0)

y_train = y_train.ravel()
y_test = y_test.ravel()

print("Shape of X_train:", X_train.shape)
print("Shape of y_train", y_train.shape)
print("Shape of X_test", X_test.shape)
print("Shape of y_test", y_test.shape)

Shape of X_train: (18750, 1422)
Shape of y_train (18750,)
Shape of X_test (6250, 1422)
Shape of y_test (6250,)


In [None]:
model.fit(X_train,y_train,batch_size=64,epochs=100,validation_data=(X_test,y_test),verbose=1)

In [None]:
model.save('/content/drive/MyDrive/Files/W2V_Model.h5')

In [12]:
from keras.models import load_model
m = load_model('/content/drive/MyDrive/Files/W2V_Model.h5')



In [18]:
print("Training Accuracy: {}%".format(m.evaluate(X_train,y_train,verbose=0)[1]*100))
print("Testing Accuracy: {}%".format(m.evaluate(X_test,y_test,verbose=0)[1]*100))

Training Accuracy: 85.31200289726257%
Testing Accuracy: 85.43999791145325%


In [22]:
print("Training F1 Score: ",f1_score(y_train,m.predict_classes(X_train),average='binary'))
print("Testing F1 Score: ",f1_score(y_test,m.predict_classes(X_test),average='binary'))

Training F1 Score:  0.8509578958761772
Testing F1 Score:  0.8529411764705883
