In [1]:
#Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Scikit learn 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import re

In [5]:
#Misc
from six.moves import cPickle
import pickle
import itertools
from collections import Counter
import glob

In [6]:
from my_functions import plot_history
from my_functions import clean_text
from my_functions import avg_word_len
#from my_functions import perf_results

## IBC data

In [7]:
[lib,con,neutral]= pickle.load(open('ibcData.pkl','rb'))

In [8]:
liberal = []
for tree in lib:
    liberal.append(tree.get_words())
conservative = []
for tree in con:
    conservative.append(tree.get_words())
neu = []
for tree in neutral:
    neu.append(tree.get_words())

In [9]:
liberals = pd.DataFrame(liberal,columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative,columns=['text'])
conservatives['label'] = 1
neutrals = pd.DataFrame(neu,columns=['text'])
neutrals['label'] = 2

In [10]:
frames = [liberals,conservatives]
result = pd.concat(frames)

In [11]:
result['text'] = result['text'].map(lambda x: clean_text(x))
result = result.sample(frac=1).reset_index(drop=True)
my_ibc_data = result

In [12]:
my_ibc_data['word_count'] = my_ibc_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_ibc_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 85487)


In [13]:
my_ibc_data['char_count'] = my_ibc_data['text'].str.len()
overall_char_count = np.sum(my_ibc_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 628655)


In [14]:
my_ibc_data['avg_word_length'] = my_ibc_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_ibc_data['avg_word_length'].values)/len(my_ibc_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_ibc_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,label,word_count,char_count,avg_word_length
0,since brutal genocide 1994 20 percent populati...,1,25,200,7
1,need end cartel - like character higher educat...,1,32,250,6


In [15]:
word_count_each_sentence = np.array(my_ibc_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  22.943370907139023
median:  22.0


## Convote data

In [16]:
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'

In [17]:
convote_train_files = glob.glob(convote_train_files_path)
convote_test_files = glob.glob(convote_test_files_path)

In [18]:
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [19]:
convote_data = []

for data_type, filenames in filepath_dict.items():
    for i in range(len(filenames)):
        f = open(filenames[i], 'r')
        f_text = f.read()
        f.close()
        party = filenames[i].split('_')[-1][0]
        sample_group = data_type.split('_')[-1]
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])
      

In [20]:
convote_data = pd.DataFrame(convote_data)

In [21]:
convote_data = convote_data.rename(index=str, columns={0: 'text', 1: 'party', 2: 'group', 3: 'party_label'})

In [22]:
convote_data['text'] = convote_data['text'].map(lambda x: clean_text(x))

In [23]:
my_convote_data = pd.DataFrame(convote_data.iloc[:,[0,3]].values)
my_convote_data = my_convote_data.rename(index=str, columns={0: 'text', 1: 'party_label'})

In [24]:
my_convote_data['word_count'] = my_convote_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_convote_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 1016800)


In [25]:
my_convote_data['char_count'] = my_convote_data['text'].str.len()
overall_char_count = np.sum(my_convote_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 7330698)


In [26]:
my_convote_data['avg_word_length'] = my_convote_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_convote_data['avg_word_length'].values)/len(my_convote_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_convote_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,party_label,word_count,char_count,avg_word_length
0,mr speaker rise join many colleague strongly o...,0,540,3901,6
1,mr chairman rise support amendment two ground ...,0,114,811,6


In [27]:
word_count_each_sentence = np.array(my_convote_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  137.0535112548861
median:  42.0


## Overall data

In [28]:
ibc = pd.DataFrame(my_ibc_data.iloc[:,[0,1]].values)
convote = pd.DataFrame(my_convote_data.iloc[:,[0,1]].values)

In [29]:
overall_data = [ibc, convote]
overall_data = pd.concat(overall_data)

In [30]:
ibc_text = np.array(my_ibc_data.iloc[:,0].values)
ibc_labels = np.array(my_ibc_data.iloc[:,1].values)

convote_text = np.array(my_convote_data.iloc[:,0].values)
convote_labels = np.array(my_convote_data.iloc[:,1].values)

overall_text = np.array(overall_data.iloc[:,0].values)
overall_labels = np.array(overall_data.iloc[:,1].values)

## Feature extraction

In [31]:
total_word_count = 50000
seq_length = 20 #Number of items in each sequence

tokenizer = Tokenizer(num_words=total_word_count)
tokenizer.fit_on_texts(convote_text)

In [32]:
ibc_sequences = tokenizer.texts_to_sequences(ibc_text)
ibc_sequences = pad_sequences(ibc_sequences, maxlen=seq_length)

In [33]:
convote_sequences = tokenizer.texts_to_sequences(convote_text)
convote_sequences = pad_sequences(convote_sequences, maxlen=seq_length)

In [34]:
exp_5_data = ibc_sequences
exp_5_labels = ibc_labels

add_data = convote_sequences

# Experiments for different models: 5 - semi-supervised learning

In [35]:
x_train, x_test, y_train, y_test = train_test_split(exp_5_data, exp_5_labels, test_size=0.2, random_state=42)

## LSTM

In [36]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [37]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_5_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 6s - loss: 0.6998 - acc: 0.5477 - val_loss: 0.6923 - val_acc: 0.5201
Epoch 2/10
 - 3s - loss: 0.6781 - acc: 0.5574 - val_loss: 0.6664 - val_acc: 0.6275
Epoch 3/10
 - 3s - loss: 0.5969 - acc: 0.7472 - val_loss: 0.6431 - val_acc: 0.6409
Epoch 4/10
 - 3s - loss: 0.4903 - acc: 0.8527 - val_loss: 0.6515 - val_acc: 0.6376
Epoch 5/10
 - 3s - loss: 0.3895 - acc: 0.9027 - val_loss: 0.6802 - val_acc: 0.6544
Epoch 6/10
 - 4s - loss: 0.3130 - acc: 0.9329 - val_loss: 0.7342 - val_acc: 0.6275


In [38]:
best_model = load_model('exp_5_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5952


In [39]:
add_psuedo_labels = best_model.predict(add_data)

In [40]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [41]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/10
 - 10s - loss: 0.4425 - acc: 0.8604 - val_loss: 0.2906 - val_acc: 0.9394
Epoch 2/10
 - 8s - loss: 0.2766 - acc: 0.9340 - val_loss: 0.2200 - val_acc: 0.9515
Epoch 3/10
 - 9s - loss: 0.1953 - acc: 0.9551 - val_loss: 0.2051 - val_acc: 0.9394
Epoch 4/10
 - 8s - loss: 0.1489 - acc: 0.9665 - val_loss: 0.2148 - val_acc: 0.9367
Epoch 5/10
 - 9s - loss: 0.1185 - acc: 0.9729 - val_loss: 0.2103 - val_acc: 0.9313
Epoch 6/10
 - 9s - loss: 0.0994 - acc: 0.9763 - val_loss: 0.2334 - val_acc: 0.9232


In [42]:
best_model = load_model('exp_5_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6019


In [43]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [44]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [45]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_5_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 6s - loss: 0.7170 - acc: 0.4754 - val_loss: 0.6909 - val_acc: 0.5201
Epoch 2/10
 - 3s - loss: 0.6706 - acc: 0.6018 - val_loss: 0.6789 - val_acc: 0.6107
Epoch 3/10
 - 3s - loss: 0.5478 - acc: 0.7629 - val_loss: 0.6674 - val_acc: 0.6074
Epoch 4/10
 - 3s - loss: 0.3853 - acc: 0.8635 - val_loss: 0.7366 - val_acc: 0.6040
Epoch 5/10
 - 3s - loss: 0.2871 - acc: 0.9150 - val_loss: 0.8123 - val_acc: 0.5906
Epoch 6/10
 - 3s - loss: 0.2171 - acc: 0.9422 - val_loss: 0.8726 - val_acc: 0.5973


In [46]:
best_model = load_model('exp_5_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6113


In [47]:
add_psuedo_labels = best_model.predict(add_data)

In [48]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [49]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/10
 - 9s - loss: 0.3055 - acc: 0.8979 - val_loss: 0.2266 - val_acc: 0.9299
Epoch 2/10
 - 8s - loss: 0.1815 - acc: 0.9434 - val_loss: 0.2112 - val_acc: 0.9218
Epoch 3/10
 - 8s - loss: 0.1315 - acc: 0.9617 - val_loss: 0.2297 - val_acc: 0.9124
Epoch 4/10
 - 8s - loss: 0.1060 - acc: 0.9669 - val_loss: 0.2482 - val_acc: 0.9030
Epoch 5/10
 - 8s - loss: 0.0759 - acc: 0.9787 - val_loss: 0.2341 - val_acc: 0.9097


In [50]:
best_model = load_model('exp_5_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6099


In [51]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [52]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_5_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/20
 - 4s - loss: 0.7505 - acc: 0.5477 - val_loss: 0.7053 - val_acc: 0.5201
Epoch 2/20
 - 2s - loss: 0.6972 - acc: 0.5474 - val_loss: 0.6929 - val_acc: 0.5201
Epoch 3/20
 - 2s - loss: 0.6912 - acc: 0.5474 - val_loss: 0.6928 - val_acc: 0.5201
Epoch 4/20
 - 1s - loss: 0.6893 - acc: 0.5470 - val_loss: 0.6928 - val_acc: 0.5201
Epoch 5/20
 - 2s - loss: 0.6854 - acc: 0.5477 - val_loss: 0.6932 - val_acc: 0.5201
Epoch 6/20
 - 2s - loss: 0.6795 - acc: 0.5544 - val_loss: 0.6929 - val_acc: 0.5201
Epoch 7/20
 - 2s - loss: 0.6667 - acc: 0.6156 - val_loss: 0.6946 - val_acc: 0.5201
Epoch 8/20
 - 2s - loss: 0.6333 - acc: 0.6394 - val_loss: 0.6956 - val_acc: 0.5034
Epoch 9/20
 - 2s - loss: 0.5941 - acc: 0.7196 - val_loss: 0.7000 - val_acc: 0.5436


In [54]:
best_model = load_model('exp_5_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5375


In [55]:
add_psuedo_labels = best_model.predict(add_data)

In [56]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [57]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/10
 - 5s - loss: 0.1617 - acc: 1.0000 - val_loss: 0.0280 - val_acc: 1.0000
Epoch 2/10
 - 4s - loss: 0.0217 - acc: 1.0000 - val_loss: 0.0127 - val_acc: 1.0000
Epoch 3/10
 - 4s - loss: 0.0113 - acc: 1.0000 - val_loss: 0.0080 - val_acc: 1.0000
Epoch 4/10
 - 4s - loss: 0.0074 - acc: 1.0000 - val_loss: 0.0057 - val_acc: 1.0000
Epoch 5/10
 - 4s - loss: 0.0054 - acc: 1.0000 - val_loss: 0.0043 - val_acc: 1.0000
Epoch 6/10
 - 4s - loss: 0.0041 - acc: 1.0000 - val_loss: 0.0034 - val_acc: 1.0000
Epoch 7/10
 - 4s - loss: 0.0032 - acc: 1.0000 - val_loss: 0.0027 - val_acc: 1.0000
Epoch 8/10
 - 4s - loss: 0.0026 - acc: 1.0000 - val_loss: 0.0022 - val_acc: 1.0000
Epoch 9/10
 - 4s - loss: 0.0022 - acc: 1.0000 - val_loss: 0.0019 - val_acc: 1.0000
Epoch 10/10
 - 4s - loss: 0.0018 - acc: 1.0000 - val_loss: 0.0016 - val_acc: 1.0000


In [58]:
best_model = load_model('exp_5_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5375


In [59]:
y_pred_rnn = best_model.predict(x_test)

In [60]:
y_t = y_test.reshape(len(y_test),1)

In [61]:
exp_3_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_3_predictions_df = pd.DataFrame(exp_3_predictions)
exp_3_predictions_df.to_csv('exp_5_predictions.csv')

In [62]:
exp_6_data = convote_sequences
exp_6_labels = convote_labels

add_data = ibc_sequences

# Experiments for different models: 6 - semi-supervised learning

In [63]:
x_train, x_test, y_train, y_test = train_test_split(exp_6_data, exp_6_labels, test_size=0.2, random_state=42)

## LSTM

In [64]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [65]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_6_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 11s - loss: 0.6926 - acc: 0.5261 - val_loss: 0.6804 - val_acc: 0.5993
Epoch 2/10
 - 7s - loss: 0.6405 - acc: 0.6832 - val_loss: 0.6256 - val_acc: 0.6801
Epoch 3/10
 - 7s - loss: 0.5432 - acc: 0.7727 - val_loss: 0.6048 - val_acc: 0.6818
Epoch 4/10
 - 7s - loss: 0.4623 - acc: 0.8141 - val_loss: 0.5985 - val_acc: 0.7020
Epoch 5/10
 - 7s - loss: 0.3936 - acc: 0.8506 - val_loss: 0.6038 - val_acc: 0.6970
Epoch 6/10
 - 7s - loss: 0.3520 - acc: 0.8620 - val_loss: 0.6066 - val_acc: 0.7037
Epoch 7/10
 - 7s - loss: 0.3112 - acc: 0.8817 - val_loss: 0.6229 - val_acc: 0.7054


In [66]:
best_model = load_model('exp_6_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6927


In [67]:
add_psuedo_labels = best_model.predict(add_data)

In [68]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [69]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/10
 - 6s - loss: 0.3507 - acc: 0.8888 - val_loss: 0.3100 - val_acc: 0.9035
Epoch 2/10
 - 5s - loss: 0.1904 - acc: 0.9687 - val_loss: 0.3116 - val_acc: 0.8928
Epoch 3/10
 - 4s - loss: 0.1323 - acc: 0.9860 - val_loss: 0.3428 - val_acc: 0.8767
Epoch 4/10
 - 4s - loss: 0.0998 - acc: 0.9928 - val_loss: 0.3486 - val_acc: 0.8686


In [70]:
best_model = load_model('exp_6_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6786


In [71]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [72]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [73]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_6_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 10s - loss: 0.6897 - acc: 0.5488 - val_loss: 0.6623 - val_acc: 0.6044
Epoch 2/10
 - 6s - loss: 0.5860 - acc: 0.6986 - val_loss: 0.5765 - val_acc: 0.6852
Epoch 3/10
 - 6s - loss: 0.4802 - acc: 0.7847 - val_loss: 0.5887 - val_acc: 0.6801
Epoch 4/10
 - 6s - loss: 0.4118 - acc: 0.8203 - val_loss: 0.5735 - val_acc: 0.7290
Epoch 5/10
 - 6s - loss: 0.3595 - acc: 0.8491 - val_loss: 0.6038 - val_acc: 0.7121
Epoch 6/10
 - 6s - loss: 0.3241 - acc: 0.8654 - val_loss: 0.6354 - val_acc: 0.6953
Epoch 7/10
 - 6s - loss: 0.2888 - acc: 0.8789 - val_loss: 0.6475 - val_acc: 0.7037


In [74]:
best_model = load_model('exp_6_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6907


In [75]:
add_psuedo_labels = best_model.predict(add_data)

In [76]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [77]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/10
 - 6s - loss: 0.2750 - acc: 0.9105 - val_loss: 0.2363 - val_acc: 0.9196
Epoch 2/10
 - 4s - loss: 0.1191 - acc: 0.9770 - val_loss: 0.2040 - val_acc: 0.9276
Epoch 3/10
 - 4s - loss: 0.0777 - acc: 0.9881 - val_loss: 0.2258 - val_acc: 0.9115
Epoch 4/10
 - 4s - loss: 0.0528 - acc: 0.9940 - val_loss: 0.2344 - val_acc: 0.9142
Epoch 5/10
 - 4s - loss: 0.0471 - acc: 0.9925 - val_loss: 0.2487 - val_acc: 0.9115


In [78]:
best_model = load_model('exp_6_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6846


In [79]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [80]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [81]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_6_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/20
 - 7s - loss: 0.6931 - acc: 0.5106 - val_loss: 0.6932 - val_acc: 0.4882
Epoch 2/20
 - 3s - loss: 0.6895 - acc: 0.5432 - val_loss: 0.6888 - val_acc: 0.5438
Epoch 3/20
 - 3s - loss: 0.6829 - acc: 0.5806 - val_loss: 0.6821 - val_acc: 0.5606
Epoch 4/20
 - 3s - loss: 0.6672 - acc: 0.6177 - val_loss: 0.6746 - val_acc: 0.5539
Epoch 5/20
 - 3s - loss: 0.6267 - acc: 0.6826 - val_loss: 0.6604 - val_acc: 0.5606
Epoch 6/20
 - 3s - loss: 0.5756 - acc: 0.7130 - val_loss: 0.6664 - val_acc: 0.5724
Epoch 7/20
 - 3s - loss: 0.5213 - acc: 0.7420 - val_loss: 0.6894 - val_acc: 0.5808
Epoch 8/20
 - 3s - loss: 0.4796 - acc: 0.7705 - val_loss: 0.7045 - val_acc: 0.5859
Epoch 9/20
 - 3s - loss: 0.4403 - acc: 0.7931 - val_loss: 0.7204 - val_acc: 0.5926
Epoch 10/20
 - 3s - loss: 0.4083 - acc: 0.8107 - val_loss: 0.7580 - val_acc: 0.5690


In [82]:
best_model = load_model('exp_6_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6058


In [83]:
add_psuedo_labels = best_model.predict(add_data)

In [84]:
unlab_labels = []

for pred in add_psuedo_labels:
    if pred > 0.5:
        unlab_labels.append(1)
    else:
        unlab_labels.append(0)
        
unlab_labels = np.array(unlab_labels)

In [85]:
history = best_model.fit(add_data, unlab_labels, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/10
 - 3s - loss: 0.5765 - acc: 0.7295 - val_loss: 0.4886 - val_acc: 0.8097
Epoch 2/10
 - 2s - loss: 0.4060 - acc: 0.8583 - val_loss: 0.4150 - val_acc: 0.8338
Epoch 3/10
 - 2s - loss: 0.2834 - acc: 0.9129 - val_loss: 0.4138 - val_acc: 0.8043
Epoch 4/10
 - 2s - loss: 0.2041 - acc: 0.9398 - val_loss: 0.4208 - val_acc: 0.8150
Epoch 5/10
 - 2s - loss: 0.1516 - acc: 0.9574 - val_loss: 0.4433 - val_acc: 0.8043
Epoch 6/10
 - 2s - loss: 0.1227 - acc: 0.9660 - val_loss: 0.4561 - val_acc: 0.8016
Epoch 7/10
 - 2s - loss: 0.1038 - acc: 0.9675 - val_loss: 0.4777 - val_acc: 0.7962
Epoch 8/10
 - 2s - loss: 0.0873 - acc: 0.9746 - val_loss: 0.5088 - val_acc: 0.8016


In [86]:
best_model = load_model('exp_6_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6098


In [87]:
y_pred_rnn = best_model.predict(x_test)

In [88]:
y_t = y_test.reshape(len(y_test),1)

In [89]:
exp_4_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_4_predictions_df = pd.DataFrame(exp_4_predictions)
exp_4_predictions_df.to_csv('exp_6_predictions.csv')