In [79]:
import gensim

# Load Google's pre-trained Word2Vec model.
w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [2]:
import glob
def getcontent(path):
# path = 'txt_sentoken/pos/*.txt'   
    files = glob.glob(path)   
    listofreviews=[]
    for name in files: # 'file' is a builtin type, 'name' is a less-ambiguous variable name
        with open(name) as f: # No need to specify 'r': this is the default.
            #sys.stdout.write(f.read())
            contents=f.read()
            contents=contents.replace(',','')
            contents=contents.replace('\n','')
            listofreviews.append([''.join(contents)])
    return listofreviews

In [3]:
posreviews = getcontent('txt_sentoken/pos/*.txt')
negreviews = getcontent('txt_sentoken/neg/*.txt')

In [4]:
import pandas as pd
posdf = pd.DataFrame({'review':posreviews})
posdf['label'] = 1
negdf = pd.DataFrame({'review':negreviews})
negdf['label'] = -1 


In [5]:
alldata = pd.concat([posdf,negdf],ignore_index=True)
alldata['review'] = alldata['review'].apply(', '.join)


In [6]:
import re
def getwordlist(text):
    # Convert words to lower case and split them
    text = ''.join(text)
    text = text.lower().split()    
    text = " ".join(text)    
    #Remove Special Characters
    text=re.sub(r'[^a-z\d ]','',text)    
    #Replace Numbers
    text=re.sub(r'\d+','n',text)
    # Return a list of words
    return(text)

In [7]:
alldata['review'] = alldata.review.apply(getwordlist)

In [8]:
alldata.head()

Unnamed: 0,review,label
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,youve got mail works alot better than it deser...,1
3,jaws is a rare film that grabs your attentio...,1
4,moviemaking is a lot like being the general ma...,1


In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(alldata['review'], alldata['label'], test_size=0.20,shuffle=True)

In [112]:
print(len(X_train))
print(len(X_test))

1600
400


In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

sgd = Pipeline([
                ('vect', CountVectorizer()),
#                 ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

# %%time

y_pred = sgd.predict(X_test)
my_tags = ['positive','negative']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))



accuracy 0.7725
              precision    recall  f1-score   support

    positive       0.94      0.55      0.69       187
    negative       0.71      0.97      0.82       213

   micro avg       0.77      0.77      0.77       400
   macro avg       0.82      0.76      0.76       400
weighted avg       0.82      0.77      0.76       400



In [17]:
from sklearn.svm import SVC   
svm = Pipeline([
                ('vect', CountVectorizer()),
#                 ('tfidf', TfidfTransformer()),
                ('clf', SVC(kernel='linear')),
               ])
svm.fit(X_train, y_train)

# %%time

y_pred = svm.predict(X_test)

my_tags = ['positive','negative']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.825
              precision    recall  f1-score   support

    positive       0.82      0.80      0.81       187
    negative       0.83      0.85      0.84       213

   micro avg       0.82      0.82      0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [18]:
from sklearn.svm import SVC   
svm_rbf = Pipeline([
                ('vect', CountVectorizer()),
                ('clf', SVC(kernel='rbf')),
               ])
svm_rbf.fit(X_train, y_train)

# %%time

y_pred = svm_rbf.predict(X_test)

my_tags = ['positive','negative']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))



accuracy 0.69
              precision    recall  f1-score   support

    positive       0.62      0.86      0.72       187
    negative       0.81      0.54      0.65       213

   micro avg       0.69      0.69      0.69       400
   macro avg       0.72      0.70      0.69       400
weighted avg       0.72      0.69      0.68       400



In [109]:
import numpy as np
def word2vectoizer(x_data):
    x_data_embeddings = []
    wordVector = np.zeros(300)
    count = 0
    x_data = x_data.split(" ")
    for words in x_data:
        count += 1
        if words in w2v.vocab:
            wordVector += w2v.word_vec(words)
    if(count != 0):
        wordVector /= count
    x_data_embeddings.append(wordVector)
    return wordVector


In [104]:
from sklearn.svm import SVC   
svm_embedding = Pipeline([
#                 "word2vec vectorizer", MeanEmbeddingVectorizer(model)
#                 ('tfidf', TfidfTransformer()),
                ('clf', SVC(kernel='linear')),
               ])

test_tokenized = X_test.apply(word2vectoizer)
test_tokenized = pd.DataFrame(test_tokenized.values.tolist())
train_tokenized = X_train.apply(word2vectoizer)
train_tokenized = pd.DataFrame(train_tokenized.values.tolist())
svm_embedding.fit(train_tokenized, y_train)
                              
# %%time

y_pred = svm_embedding.predict(test_tokenized)

my_tags = ['positive','negative']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.7825
              precision    recall  f1-score   support

    positive       0.79      0.81      0.80       213
    negative       0.78      0.75      0.76       187

   micro avg       0.78      0.78      0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400



In [113]:
from sklearn.svm import SVC   
svm_embedding_rbf = Pipeline([
#                 "word2vec vectorizer", MeanEmbeddingVectorizer(model)
#                 ('tfidf', TfidfTransformer()),
                ('clf', SVC(kernel='rbf')),
               ])

test_tokenized = X_test.apply(word2vectoizer)
test_tokenized = pd.DataFrame(test_tokenized.values.tolist())
train_tokenized = X_train.apply(word2vectoizer)
train_tokenized = pd.DataFrame(train_tokenized.values.tolist())
svm_embedding_rbf.fit(train_tokenized, y_train)
                              
# %%time

y_pred = svm_embedding_rbf.predict(test_tokenized)

my_tags = ['positive','negative']
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))



accuracy 0.4925
              precision    recall  f1-score   support

    positive       0.00      0.00      0.00       203
    negative       0.49      1.00      0.66       197

   micro avg       0.49      0.49      0.49       400
   macro avg       0.25      0.50      0.33       400
weighted avg       0.24      0.49      0.33       400



  'precision', 'predicted', average, warn_for)


In [106]:
# import nltk
# import numpy as np

# def w2v_tokenize_text(text):
#     tokens = []
# #     print(text)
#     for sent in nltk.sent_tokenize(text):
#         for word in nltk.word_tokenize(sent):
#             if len(word) < 2:
#                 continue
#             tokens.append(word)
#     return tokens
    
# # train, test = train_test_split(df, test_size=0.3, random_state = 42)

# test_tokenized = X_test.apply(w2v_tokenize_text)#X_test.apply(lambda r: w2v_tokenize_text(r['comments'])).values
# train_tokenized = X_train.apply(w2v_tokenize_text)#X_train.apply(lambda r: w2v_tokenize_text(r['comments'])).values

# X_train_word_average = word_averaging_list(model,train_tokenized)
# X_test_word_average = word_averaging_list(model,test_tokenized)

In [107]:
# def word_averaging(wv, words):
#     all_words, mean = set(), []
    
#     for word in words:
#         if isinstance(word, np.ndarray):
#             mean.append(word)
#         elif word in wv.vocab:
#             mean.append(wv.vectors_norm[wv.vocab[word].index])
#             all_words.add(wv.vocab[word].index)

#     if not mean:
# #         logging.warning("cannot compute similarity with no input %s", words)
#         # FIXME: remove these examples in pre-processing
#         return np.zeros(wv.vector_size,)

#     mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
#     return mean

# def  word_averaging_list(model, text_list):
#     return np.vstack([word_averaging(model, post) for post in text_list ])

In [108]:
# train_reviews = []
# for text in X_train:
#     train_reviews.append(getwordlist(text))
# test_reviews=[]
# for text in X_test:
#     test_reviews.append(getwordlist(text))


In [59]:
#GRU
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_reviews + test_reviews) 
sequences = tokenizer.texts_to_sequences(train_reviews)
test_sequences = tokenizer.texts_to_sequences(test_reviews)

word_index = tokenizer.word_index
print('unique words %s' % len(word_index))

train_data = pad_sequences(sequences, maxlen=2365, padding='post')
print('Train data shape:', train_data.shape)

test_data = pad_sequences(test_sequences, maxlen=2365, padding='post')
print('Test data shape:', test_data.shape)

unique words 46908
Train data shape: (1600, 2365)
Test data shape: (400, 2365)


In [62]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in model.vocab:
        embedding_matrix[i] = model.word_vec(word)
import keras
embedding_layer = keras.layers.Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix],input_length=2365,trainable=False)


In [63]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN, GRU 
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten
from keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(embedding_layer)
#model.add(GRU(32))
model.add(SimpleRNN(300, return_sequences=True))
model.add(SimpleRNN(300))
#model.add(Flatten())
#model.add(SimpleRNN(32, return_sequences=True))
#model.add(SimpleRNN(64))
#model.add(SimpleRNN(64))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_data, y_train, epochs=5, batch_size=256)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2365, 300)         14072700  
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 2365, 300)         180300    
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 300)               180300    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 14,433,601
Trainable params: 360,901
Non-trainable params: 14,072,700
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
loss, accuracy = model.evaluate(test_data, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 1.250000


In [67]:
from keras.layers import Flatten, GlobalMaxPooling1D, MaxPool1D, Conv1D, Concatenate
from keras.layers import Input
text_seq_input = Input(shape=(2365,), dtype='int32')
text_embedding = keras.layers.Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix],input_length=2365,trainable=False)(text_seq_input)

filter_sizes = [3,4,5]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    l_pool = MaxPool1D(filter_size)(l_conv)
    convs.append(l_pool)

l_merge = Concatenate(axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='tanh')(l_merge)
# since the text is too long we are maxpooling over 100
# and not GlobalMaxPool1D
l_pool1 = MaxPool1D(100)(l_cov1)
l_flat = Flatten()(l_pool1)
l_dense = Dense(128, activation='tanh')(l_flat)
l_out = Dense(1, activation='softmax')(l_dense)
model_1 = Model(inputs=[text_seq_input], outputs=l_out)

In [70]:
model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1.summary()
model_1.fit(train_data, y_train, epochs=5, batch_size=256)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 2365)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 2365, 300)    14072700    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 2365, 128)    115328      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 2365, 128)    153728      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_7 (

  % delta_t_median)


Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2d8bbc59940>

In [73]:
loss, accuracy = model_1.evaluate(test_data, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 53.250000
