In [1]:
#Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Scikit learn 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import re

In [5]:
#Misc
from six.moves import cPickle
import pickle
import itertools
from collections import Counter
import glob

In [6]:
from my_functions import plot_history
from my_functions import clean_text
from my_functions import avg_word_len
#from my_functions import perf_results

## IBC data

In [7]:
[lib,con,neutral]= pickle.load(open('ibcData.pkl','rb'))

In [8]:
liberal = []
for tree in lib:
    liberal.append(tree.get_words())
conservative = []
for tree in con:
    conservative.append(tree.get_words())
neu = []
for tree in neutral:
    neu.append(tree.get_words())

In [9]:
liberals = pd.DataFrame(liberal,columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative,columns=['text'])
conservatives['label'] = 1
neutrals = pd.DataFrame(neu,columns=['text'])
neutrals['label'] = 2

In [10]:
frames = [liberals,conservatives]
result = pd.concat(frames)

In [11]:
result['text'] = result['text'].map(lambda x: clean_text(x))
result = result.sample(frac=1).reset_index(drop=True)
my_ibc_data = result

In [12]:
my_ibc_data['word_count'] = my_ibc_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_ibc_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 85487)


In [13]:
my_ibc_data['char_count'] = my_ibc_data['text'].str.len()
overall_char_count = np.sum(my_ibc_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 628655)


In [14]:
my_ibc_data['avg_word_length'] = my_ibc_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_ibc_data['avg_word_length'].values)/len(my_ibc_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_ibc_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,label,word_count,char_count,avg_word_length
0,nation destroyed war also destroyed political ...,1,17,128,6
1,efficiency term tax system limit risk corporat...,0,18,136,6


In [15]:
word_count_each_sentence = np.array(my_ibc_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  22.943370907139023
median:  22.0


## Convote data

In [16]:
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'

In [17]:
convote_train_files = glob.glob(convote_train_files_path)
convote_test_files = glob.glob(convote_test_files_path)

In [18]:
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [19]:
convote_data = []

for data_type, filenames in filepath_dict.items():
    for i in range(len(filenames)):
        f = open(filenames[i], 'r')
        f_text = f.read()
        f.close()
        party = filenames[i].split('_')[-1][0]
        sample_group = data_type.split('_')[-1]
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])
      

In [20]:
convote_data = pd.DataFrame(convote_data)

In [21]:
convote_data = convote_data.rename(index=str, columns={0: 'text', 1: 'party', 2: 'group', 3: 'party_label'})

In [22]:
convote_data['text'] = convote_data['text'].map(lambda x: clean_text(x))

In [23]:
my_convote_data = pd.DataFrame(convote_data.iloc[:,[0,3]].values)
my_convote_data = my_convote_data.rename(index=str, columns={0: 'text', 1: 'party_label'})

In [24]:
my_convote_data['word_count'] = my_convote_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_convote_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 1016800)


In [25]:
my_convote_data['char_count'] = my_convote_data['text'].str.len()
overall_char_count = np.sum(my_convote_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 7330698)


In [26]:
my_convote_data['avg_word_length'] = my_convote_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_convote_data['avg_word_length'].values)/len(my_convote_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_convote_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,party_label,word_count,char_count,avg_word_length
0,mr speaker rise join many colleague strongly o...,0,540,3901,6
1,mr chairman rise support amendment two ground ...,0,114,811,6


In [27]:
word_count_each_sentence = np.array(my_convote_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  137.0535112548861
median:  42.0


## Overall data

In [28]:
ibc = pd.DataFrame(my_ibc_data.iloc[:,[0,1]].values)
convote = pd.DataFrame(my_convote_data.iloc[:,[0,1]].values)

In [29]:
overall_data = [ibc, convote]
overall_data = pd.concat(overall_data)

In [30]:
ibc_text = np.array(my_ibc_data.iloc[:,0].values)
ibc_labels = np.array(my_ibc_data.iloc[:,1].values)

convote_text = np.array(my_convote_data.iloc[:,0].values)
convote_labels = np.array(my_convote_data.iloc[:,1].values)

overall_text = np.array(overall_data.iloc[:,0].values)
overall_labels = np.array(overall_data.iloc[:,1].values)

## Feature extraction

In [31]:
total_word_count = 50000
seq_length = 20 #Number of items in each sequence

tokenizer = Tokenizer(num_words=total_word_count)
tokenizer.fit_on_texts(convote_text)

In [32]:
ibc_sequences = tokenizer.texts_to_sequences(ibc_text)
ibc_sequences = pad_sequences(ibc_sequences, maxlen=seq_length)

In [33]:
convote_sequences = tokenizer.texts_to_sequences(convote_text)
convote_sequences = pad_sequences(convote_sequences, maxlen=seq_length)

In [34]:
exp_5_data = ibc_sequences
exp_5_labels = ibc_labels

add_data = convote_sequences

# Experiments for different models: 7 - semi-supervised learning

In [35]:
x_train, x_test, y_train, y_test = train_test_split(exp_5_data, exp_5_labels, test_size=0.2, random_state=42)

## LSTM

In [36]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [37]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_7_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 6s - loss: 0.7604 - acc: 0.4560 - val_loss: 0.6931 - val_acc: 0.5034
Epoch 2/10
 - 4s - loss: 0.6907 - acc: 0.5321 - val_loss: 0.6958 - val_acc: 0.4966
Epoch 3/10
 - 3s - loss: 0.6890 - acc: 0.5444 - val_loss: 0.6978 - val_acc: 0.4966
Epoch 4/10
 - 3s - loss: 0.6682 - acc: 0.6204 - val_loss: 0.6720 - val_acc: 0.5940
Epoch 5/10
 - 3s - loss: 0.5684 - acc: 0.8031 - val_loss: 0.6763 - val_acc: 0.5805
Epoch 6/10
 - 3s - loss: 0.4610 - acc: 0.8788 - val_loss: 0.6975 - val_acc: 0.5906
Epoch 7/10
 - 3s - loss: 0.3659 - acc: 0.9157 - val_loss: 0.7277 - val_acc: 0.5839


In [38]:
best_model = load_model('exp_7_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6287


In [39]:
add_psuedo_labels = best_model.predict(add_data)

In [40]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [41]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [42]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 9359 samples, validate on 1040 samples
Epoch 1/10
 - 14s - loss: 0.4597 - acc: 0.8538 - val_loss: 0.3188 - val_acc: 0.9163
Epoch 2/10
 - 12s - loss: 0.3034 - acc: 0.9133 - val_loss: 0.3109 - val_acc: 0.8990
Epoch 3/10
 - 12s - loss: 0.2343 - acc: 0.9350 - val_loss: 0.2876 - val_acc: 0.8952
Epoch 4/10
 - 12s - loss: 0.1976 - acc: 0.9442 - val_loss: 0.3147 - val_acc: 0.8875
Epoch 5/10
 - 12s - loss: 0.1649 - acc: 0.9560 - val_loss: 0.3113 - val_acc: 0.8875
Epoch 6/10
 - 12s - loss: 0.1433 - acc: 0.9625 - val_loss: 0.3478 - val_acc: 0.8740


In [43]:
best_model = load_model('exp_7_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6501


In [44]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [45]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_7_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 5s - loss: 0.6893 - acc: 0.5440 - val_loss: 0.6982 - val_acc: 0.4966
Epoch 2/10
 - 3s - loss: 0.6743 - acc: 0.5697 - val_loss: 0.6727 - val_acc: 0.5839
Epoch 3/10
 - 3s - loss: 0.5492 - acc: 0.7625 - val_loss: 0.6809 - val_acc: 0.5772
Epoch 4/10
 - 3s - loss: 0.4087 - acc: 0.8482 - val_loss: 0.7410 - val_acc: 0.5973
Epoch 5/10
 - 3s - loss: 0.2995 - acc: 0.9057 - val_loss: 0.7914 - val_acc: 0.5839


In [47]:
best_model = load_model('exp_7_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6153


In [48]:
add_psuedo_labels = best_model.predict(add_data)

In [49]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [50]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [51]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 9359 samples, validate on 1040 samples
Epoch 1/10
 - 12s - loss: 0.4341 - acc: 0.8216 - val_loss: 0.3030 - val_acc: 0.8837
Epoch 2/10
 - 11s - loss: 0.2979 - acc: 0.8855 - val_loss: 0.2959 - val_acc: 0.8635
Epoch 3/10
 - 11s - loss: 0.2262 - acc: 0.9214 - val_loss: 0.2916 - val_acc: 0.8721
Epoch 4/10
 - 11s - loss: 0.1854 - acc: 0.9355 - val_loss: 0.2719 - val_acc: 0.8846
Epoch 5/10
 - 11s - loss: 0.1535 - acc: 0.9499 - val_loss: 0.2939 - val_acc: 0.8865
Epoch 6/10
 - 11s - loss: 0.1324 - acc: 0.9603 - val_loss: 0.3361 - val_acc: 0.8683
Epoch 7/10
 - 11s - loss: 0.1117 - acc: 0.9668 - val_loss: 0.3611 - val_acc: 0.8692


In [52]:
best_model = load_model('exp_7_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6113


In [53]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [54]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_7_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/20
 - 4s - loss: 0.7651 - acc: 0.4571 - val_loss: 0.6994 - val_acc: 0.5034
Epoch 2/20
 - 2s - loss: 0.7019 - acc: 0.4948 - val_loss: 0.6947 - val_acc: 0.4899
Epoch 3/20
 - 2s - loss: 0.6936 - acc: 0.5153 - val_loss: 0.6982 - val_acc: 0.4966
Epoch 4/20
 - 2s - loss: 0.6812 - acc: 0.5652 - val_loss: 0.6989 - val_acc: 0.4966
Epoch 5/20
 - 2s - loss: 0.6769 - acc: 0.5824 - val_loss: 0.6976 - val_acc: 0.4933
Epoch 6/20
 - 2s - loss: 0.6676 - acc: 0.5984 - val_loss: 0.7001 - val_acc: 0.5101
Epoch 7/20
 - 2s - loss: 0.6500 - acc: 0.6421 - val_loss: 0.7001 - val_acc: 0.4966


In [56]:
best_model = load_model('exp_7_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5576


In [57]:
add_psuedo_labels = best_model.predict(add_data)

In [58]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [59]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [60]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 9359 samples, validate on 1040 samples
Epoch 1/10
 - 7s - loss: 0.4588 - acc: 0.8332 - val_loss: 0.1528 - val_acc: 0.9952
Epoch 2/10
 - 6s - loss: 0.4211 - acc: 0.8489 - val_loss: 0.1487 - val_acc: 0.9952
Epoch 3/10
 - 5s - loss: 0.3975 - acc: 0.8493 - val_loss: 0.1475 - val_acc: 0.9952
Epoch 4/10
 - 6s - loss: 0.2919 - acc: 0.8724 - val_loss: 0.0919 - val_acc: 0.9769
Epoch 5/10
 - 6s - loss: 0.2201 - acc: 0.9067 - val_loss: 0.0739 - val_acc: 0.9760
Epoch 6/10
 - 5s - loss: 0.1716 - acc: 0.9359 - val_loss: 0.0769 - val_acc: 0.9740
Epoch 7/10
 - 6s - loss: 0.1358 - acc: 0.9514 - val_loss: 0.0790 - val_acc: 0.9683
Epoch 8/10
 - 6s - loss: 0.1071 - acc: 0.9637 - val_loss: 0.0738 - val_acc: 0.9692
Epoch 9/10
 - 6s - loss: 0.0886 - acc: 0.9704 - val_loss: 0.0793 - val_acc: 0.9615
Epoch 10/10
 - 6s - loss: 0.0758 - acc: 0.9760 - val_loss: 0.0896 - val_acc: 0.9625


In [61]:
best_model = load_model('exp_7_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5764


In [62]:
y_pred_rnn = best_model.predict(x_test)

In [63]:
y_t = y_test.reshape(len(y_test),1)

In [97]:
exp_3_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_3_predictions_df = pd.DataFrame(exp_3_predictions)
exp_3_predictions_df.to_csv('exp_7_predictions.csv')

In [65]:
exp_6_data = convote_sequences
exp_6_labels = convote_labels

add_data = ibc_sequences

# Experiments for different models: 8 - semi-supervised learning

In [66]:
x_train, x_test, y_train, y_test = train_test_split(exp_6_data, exp_6_labels, test_size=0.2, random_state=42)

## LSTM

In [67]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_8_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 11s - loss: 0.7180 - acc: 0.4982 - val_loss: 0.6919 - val_acc: 0.5303
Epoch 2/10
 - 7s - loss: 0.6843 - acc: 0.5465 - val_loss: 0.6643 - val_acc: 0.6145
Epoch 3/10
 - 7s - loss: 0.6100 - acc: 0.7280 - val_loss: 0.6021 - val_acc: 0.6835
Epoch 4/10
 - 7s - loss: 0.5050 - acc: 0.8010 - val_loss: 0.5917 - val_acc: 0.6886
Epoch 5/10
 - 7s - loss: 0.4299 - acc: 0.8379 - val_loss: 0.5801 - val_acc: 0.7138
Epoch 6/10
 - 7s - loss: 0.3765 - acc: 0.8596 - val_loss: 0.6064 - val_acc: 0.7054
Epoch 7/10
 - 7s - loss: 0.3360 - acc: 0.8792 - val_loss: 0.6558 - val_acc: 0.6987
Epoch 8/10
 - 7s - loss: 0.3096 - acc: 0.8865 - val_loss: 0.6695 - val_acc: 0.7003


In [69]:
best_model = load_model('exp_8_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6961


In [70]:
add_psuedo_labels = best_model.predict(add_data)

In [71]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [72]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [73]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 8694 samples, validate on 967 samples
Epoch 1/10
 - 13s - loss: 0.3673 - acc: 0.8642 - val_loss: 0.2826 - val_acc: 0.9049
Epoch 2/10
 - 11s - loss: 0.2980 - acc: 0.8952 - val_loss: 0.3159 - val_acc: 0.8873
Epoch 3/10
 - 12s - loss: 0.2603 - acc: 0.9081 - val_loss: 0.3289 - val_acc: 0.8645
Epoch 4/10
 - 11s - loss: 0.2405 - acc: 0.9138 - val_loss: 0.3814 - val_acc: 0.8480


In [74]:
best_model = load_model('exp_8_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6894


In [75]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [76]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [77]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_8_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 10s - loss: 0.6940 - acc: 0.5420 - val_loss: 0.6676 - val_acc: 0.6279
Epoch 2/10
 - 6s - loss: 0.5965 - acc: 0.7040 - val_loss: 0.5896 - val_acc: 0.6785
Epoch 3/10
 - 6s - loss: 0.4879 - acc: 0.7768 - val_loss: 0.5516 - val_acc: 0.7222
Epoch 4/10
 - 6s - loss: 0.4133 - acc: 0.8262 - val_loss: 0.5874 - val_acc: 0.6970
Epoch 5/10
 - 6s - loss: 0.3685 - acc: 0.8431 - val_loss: 0.5819 - val_acc: 0.7088
Epoch 6/10
 - 6s - loss: 0.3197 - acc: 0.8669 - val_loss: 0.6114 - val_acc: 0.7088


In [78]:
best_model = load_model('exp_8_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6867


In [79]:
add_psuedo_labels = best_model.predict(add_data)

In [80]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [81]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [82]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 8694 samples, validate on 967 samples
Epoch 1/10
 - 12s - loss: 0.4031 - acc: 0.8317 - val_loss: 0.2999 - val_acc: 0.8956
Epoch 2/10
 - 10s - loss: 0.3167 - acc: 0.8742 - val_loss: 0.2940 - val_acc: 0.8780
Epoch 3/10
 - 10s - loss: 0.2811 - acc: 0.8905 - val_loss: 0.3029 - val_acc: 0.8811
Epoch 4/10
 - 10s - loss: 0.2459 - acc: 0.9019 - val_loss: 0.3166 - val_acc: 0.8687
Epoch 5/10
 - 10s - loss: 0.2226 - acc: 0.9109 - val_loss: 0.3613 - val_acc: 0.8542


In [83]:
best_model = load_model('exp_8_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6813


In [84]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [85]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [86]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_8_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/20
 - 8s - loss: 0.7458 - acc: 0.5025 - val_loss: 0.6901 - val_acc: 0.5320
Epoch 2/20
 - 3s - loss: 0.6969 - acc: 0.5149 - val_loss: 0.6883 - val_acc: 0.5556
Epoch 3/20
 - 3s - loss: 0.6852 - acc: 0.5462 - val_loss: 0.6816 - val_acc: 0.5606
Epoch 4/20
 - 3s - loss: 0.6636 - acc: 0.6087 - val_loss: 0.6627 - val_acc: 0.6145
Epoch 5/20
 - 3s - loss: 0.5989 - acc: 0.6896 - val_loss: 0.6215 - val_acc: 0.6734
Epoch 6/20
 - 3s - loss: 0.5249 - acc: 0.7422 - val_loss: 0.6218 - val_acc: 0.6650
Epoch 7/20
 - 3s - loss: 0.4667 - acc: 0.7781 - val_loss: 0.6298 - val_acc: 0.6532
Epoch 8/20
 - 3s - loss: 0.4314 - acc: 0.8021 - val_loss: 0.6489 - val_acc: 0.6414
Epoch 9/20
 - 3s - loss: 0.3990 - acc: 0.8214 - val_loss: 0.6508 - val_acc: 0.6549
Epoch 10/20
 - 3s - loss: 0.3532 - acc: 0.8446 - val_loss: 0.6658 - val_acc: 0.6734


In [87]:
best_model = load_model('exp_8_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6637


In [88]:
add_psuedo_labels = best_model.predict(add_data)

In [89]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [90]:
new_x = np.concatenate((x_train, add_data), axis = 0)
new_y = np.concatenate((y_train, unlab_labels), axis = 0)

In [91]:
history = best_model.fit(new_x, new_y, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2, shuffle = True)

Train on 8694 samples, validate on 967 samples
Epoch 1/10
 - 6s - loss: 0.5258 - acc: 0.7476 - val_loss: 0.4205 - val_acc: 0.8428
Epoch 2/10
 - 5s - loss: 0.4328 - acc: 0.8077 - val_loss: 0.4048 - val_acc: 0.8108
Epoch 3/10
 - 5s - loss: 0.3707 - acc: 0.8379 - val_loss: 0.4020 - val_acc: 0.8211
Epoch 4/10
 - 5s - loss: 0.3357 - acc: 0.8568 - val_loss: 0.4380 - val_acc: 0.7952
Epoch 5/10
 - 5s - loss: 0.2995 - acc: 0.8768 - val_loss: 0.4751 - val_acc: 0.7725
Epoch 6/10
 - 5s - loss: 0.2731 - acc: 0.8862 - val_loss: 0.4844 - val_acc: 0.7746
Epoch 7/10
 - 5s - loss: 0.2498 - acc: 0.8968 - val_loss: 0.5196 - val_acc: 0.7580
Epoch 8/10
 - 5s - loss: 0.2331 - acc: 0.9018 - val_loss: 0.5389 - val_acc: 0.7549


In [92]:
best_model = load_model('exp_8_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6678


In [93]:
y_pred_rnn = best_model.predict(x_test)

In [94]:
y_t = y_test.reshape(len(y_test),1)

In [96]:
exp_4_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_4_predictions_df = pd.DataFrame(exp_4_predictions)
exp_4_predictions_df.to_csv('exp_8_predictions.csv')