In [3]:
import numpy as np 
import pandas as pd 
import re
import sklearn
from decimal import Decimal
import nltk
from sklearn.feature_extraction import DictVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D,Bidirectional
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob, Word
from langdetect import detect
from gensim.models import Word2Vec
from sentic import SenticPhrase
import warnings
warnings.filterwarnings("ignore")

#Reading and cleaning the data
data = pd.read_csv(r"C:\Users\jayap\Desktop\Revs\reviews.csv",keep_default_na=False,nrows=7000)
data = data["comments"]
data = data.apply(lambda x: x.lower())
# data = data.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
data_new = []
#Tokenizing the data 
tokens = []
polarity_set = []
sentic_vecs= {}
for review in data:
    try:
        if (detect(review) == 'en'):
            data_new.append(review)
            sentence = TextBlob(review)
            value = sentence.polarity
            if value > 0.5:
                polarity_set.append("positive")
            elif value <= 0.5:
                polarity_set.append("negative")
            tokens.append(nltk.word_tokenize(review))
    except:
        continue 
    
pos_tags = []
tags = []
for token in tokens:
    pos_tags.append(nltk.pos_tag(token))

for list in pos_tags:
    tag = []
    for word,pos in list:
        tag.append(pos)
    tags.append(tag)

# Creating the model and setting values for the various parameters
num_features = 100  # Word vector dimensionality
min_word_count = 1 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 5        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

model = Word2Vec(tokens,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling,negative=1)
tag_model = Word2Vec(tags,workers=num_workers,min_count=min_word_count,size=45,negative=1)

print("Vocabulary")
print(model)

print("POS tag vocab")
print(tag_model)

Vocabulary
Word2Vec(vocab=9608, size=100, alpha=0.025)
POS tag vocab
Word2Vec(vocab=43, size=45, alpha=0.025)


In [4]:
train_vecs = []
sentic_dict = DictVectorizer(sparse=False)
for review in data_new:
    sp = SenticPhrase(review)
    token = nltk.word_tokenize(review)
    token = nltk.pos_tag(token)            
    for word,pos in token:
        sentic = {}
        sentic.update(sp.get_sentics(word))
        polarity = {'polarity': sp.get_polarity(word)}
        sentic.update(polarity)
        sent = sentic_dict.fit_transform(sentic)
        senti = []
        if sent.size > 1:
            for i in sent[0]:
                senti.append('%.2E' % Decimal(i))
        else:
            for i in range(5):
                senti.append(0)
        senti = np.array(senti)
        x = np.concatenate([model[word],tag_model[pos],senti])
    train_vecs.append(x)


       
polarity_set = pd.get_dummies(polarity_set).values

tokens = [w for s in tokens for w in s ]
tokens = set(tokens)
max_features = len(tokens)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,max_features - 1))
scaler.fit(train_vecs)
train_vecs = scaler.transform(train_vecs)

#converting to array type
train_vecs = np.asarray(train_vecs)

embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = train_vecs.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(polarity_set.shape[1],activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

print("Training on " + str(int(0.75*len(train_vecs))) + " out of " + str(len(train_vecs)) + " samples")
X_train, X_test,Y_train, Y_test = train_test_split(train_vecs,polarity_set, test_size = 0.25, random_state = 42)
batch_size = 32
model.fit(X_train,Y_train, epochs = 8, batch_size=batch_size, verbose = 1)

validation_size = 500
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]

score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("\n")
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))


Training on 5201 out of 6935 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


score: 0.57
acc: 0.75
