In [1]:
#Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Scikit learn 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import re

In [5]:
#Misc
from six.moves import cPickle
import pickle
import itertools
from collections import Counter
import glob

In [6]:
from my_functions import plot_history
from my_functions import clean_text
from my_functions import avg_word_len
#from my_functions import perf_results

## IBC data

In [7]:
[lib,con,neutral]= pickle.load(open('ibcData.pkl','rb'))

In [8]:
liberal = []
for tree in lib:
    liberal.append(tree.get_words())
conservative = []
for tree in con:
    conservative.append(tree.get_words())
neu = []
for tree in neutral:
    neu.append(tree.get_words())

In [9]:
liberals = pd.DataFrame(liberal,columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative,columns=['text'])
conservatives['label'] = 1
neutrals = pd.DataFrame(neu,columns=['text'])
neutrals['label'] = 2

In [10]:
frames = [liberals,conservatives]
result = pd.concat(frames)

In [11]:
result['text'] = result['text'].map(lambda x: clean_text(x))
result = result.sample(frac=1).reset_index(drop=True)
my_ibc_data = result

In [12]:
my_ibc_data['word_count'] = my_ibc_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_ibc_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 85487)


In [13]:
my_ibc_data['char_count'] = my_ibc_data['text'].str.len()
overall_char_count = np.sum(my_ibc_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 628655)


In [14]:
my_ibc_data['avg_word_length'] = my_ibc_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_ibc_data['avg_word_length'].values)/len(my_ibc_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_ibc_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,label,word_count,char_count,avg_word_length
0,oddly within framework argument made earlier c...,0,15,114,6
1,walker played public suppressed class consciou...,0,39,246,5


In [15]:
word_count_each_sentence = np.array(my_ibc_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  22.943370907139023
median:  22.0


## Convote data

In [16]:
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'

In [17]:
convote_train_files = glob.glob(convote_train_files_path)
convote_test_files = glob.glob(convote_test_files_path)

In [18]:
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [19]:
convote_data = []

for data_type, filenames in filepath_dict.items():
    for i in range(len(filenames)):
        f = open(filenames[i], 'r')
        f_text = f.read()
        f.close()
        party = filenames[i].split('_')[-1][0]
        sample_group = data_type.split('_')[-1]
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])
      

In [20]:
convote_data = pd.DataFrame(convote_data)

In [21]:
convote_data = convote_data.rename(index=str, columns={0: 'text', 1: 'party', 2: 'group', 3: 'party_label'})

In [22]:
convote_data['text'] = convote_data['text'].map(lambda x: clean_text(x))

In [23]:
my_convote_data = pd.DataFrame(convote_data.iloc[:,[0,3]].values)
my_convote_data = my_convote_data.rename(index=str, columns={0: 'text', 1: 'party_label'})

In [24]:
my_convote_data['word_count'] = my_convote_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_convote_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 1016800)


In [25]:
my_convote_data['char_count'] = my_convote_data['text'].str.len()
overall_char_count = np.sum(my_convote_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 7330698)


In [26]:
my_convote_data['avg_word_length'] = my_convote_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_convote_data['avg_word_length'].values)/len(my_convote_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_convote_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,party_label,word_count,char_count,avg_word_length
0,mr speaker rise join many colleague strongly o...,0,540,3901,6
1,mr chairman rise support amendment two ground ...,0,114,811,6


In [27]:
word_count_each_sentence = np.array(my_convote_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  137.0535112548861
median:  42.0


## Overall data

In [28]:
ibc = pd.DataFrame(my_ibc_data.iloc[:,[0,1]].values)
convote = pd.DataFrame(my_convote_data.iloc[:,[0,1]].values)

In [29]:
overall_data = [ibc, convote]
overall_data = pd.concat(overall_data)

In [30]:
ibc_text = np.array(my_ibc_data.iloc[:,0].values)
ibc_labels = np.array(my_ibc_data.iloc[:,1].values)

convote_text = np.array(my_convote_data.iloc[:,0].values)
convote_labels = np.array(my_convote_data.iloc[:,1].values)

overall_text = np.array(overall_data.iloc[:,0].values)
overall_labels = np.array(overall_data.iloc[:,1].values)

## Feature extraction

In [31]:
total_word_count = 50000
seq_length = 20 #Number of items in each sequence

tokenizer = Tokenizer(num_words=total_word_count)
tokenizer.fit_on_texts(ibc_text)

In [32]:
ibc_sequences = tokenizer.texts_to_sequences(ibc_text)
ibc_sequences = pad_sequences(ibc_sequences, maxlen=seq_length)

In [33]:
convote_sequences = tokenizer.texts_to_sequences(convote_text)
convote_sequences = pad_sequences(convote_sequences, maxlen=seq_length)

In [34]:
exp_2_data = ibc_sequences
exp_2_labels = ibc_labels

# Experiments for different models: 2(a) - IBC

In [35]:
x_train, x_test, y_train, y_test = train_test_split(exp_2_data, exp_2_labels, test_size=0.2, random_state=42)

## LSTM

In [36]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [37]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_2a_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 6s - loss: 0.7513 - acc: 0.4612 - val_loss: 0.7076 - val_acc: 0.4362
Epoch 2/10
 - 4s - loss: 0.6956 - acc: 0.4884 - val_loss: 0.6914 - val_acc: 0.5638
Epoch 3/10
 - 4s - loss: 0.6909 - acc: 0.5388 - val_loss: 0.6875 - val_acc: 0.5638
Epoch 4/10
 - 4s - loss: 0.6902 - acc: 0.5388 - val_loss: 0.6864 - val_acc: 0.5638
Epoch 5/10
 - 4s - loss: 0.6897 - acc: 0.5388 - val_loss: 0.6863 - val_acc: 0.5638
Epoch 6/10
 - 4s - loss: 0.6736 - acc: 0.6208 - val_loss: 0.6665 - val_acc: 0.6409
Epoch 7/10
 - 4s - loss: 0.5952 - acc: 0.8039 - val_loss: 0.6587 - val_acc: 0.6443
Epoch 8/10
 - 4s - loss: 0.4869 - acc: 0.8908 - val_loss: 0.6853 - val_acc: 0.6141
Epoch 9/10
 - 4s - loss: 0.3892 - acc: 0.9247 - val_loss: 0.6899 - val_acc: 0.5973
Epoch 10/10
 - 4s - loss: 0.3030 - acc: 0.9549 - val_loss: 0.7367 - val_acc: 0.6174


In [38]:
best_model = load_model('exp_2a_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5777


In [39]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [40]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_2a_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/10
 - 6s - loss: 0.6945 - acc: 0.5037 - val_loss: 0.6861 - val_acc: 0.5638
Epoch 2/10
 - 3s - loss: 0.6790 - acc: 0.5466 - val_loss: 0.6707 - val_acc: 0.6174
Epoch 3/10
 - 3s - loss: 0.5551 - acc: 0.7603 - val_loss: 0.6584 - val_acc: 0.6007
Epoch 4/10
 - 3s - loss: 0.3958 - acc: 0.8598 - val_loss: 0.6981 - val_acc: 0.6040
Epoch 5/10
 - 4s - loss: 0.2772 - acc: 0.9221 - val_loss: 0.7839 - val_acc: 0.6074
Epoch 6/10
 - 3s - loss: 0.1977 - acc: 0.9512 - val_loss: 0.8444 - val_acc: 0.6242


In [42]:
best_model = load_model('exp_2a_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6019


In [43]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [44]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [45]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_2a_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 2682 samples, validate on 298 samples
Epoch 1/20
 - 4s - loss: 0.7432 - acc: 0.4586 - val_loss: 0.6882 - val_acc: 0.5638
Epoch 2/20
 - 2s - loss: 0.6964 - acc: 0.5108 - val_loss: 0.6850 - val_acc: 0.5638
Epoch 3/20
 - 2s - loss: 0.6930 - acc: 0.5272 - val_loss: 0.6849 - val_acc: 0.5638
Epoch 4/20
 - 2s - loss: 0.6884 - acc: 0.5406 - val_loss: 0.6848 - val_acc: 0.5638
Epoch 5/20
 - 2s - loss: 0.6853 - acc: 0.5544 - val_loss: 0.6842 - val_acc: 0.5638
Epoch 6/20
 - 2s - loss: 0.6811 - acc: 0.5563 - val_loss: 0.6833 - val_acc: 0.5638
Epoch 7/20
 - 2s - loss: 0.6612 - acc: 0.6223 - val_loss: 0.6723 - val_acc: 0.5940
Epoch 8/20
 - 2s - loss: 0.6109 - acc: 0.7263 - val_loss: 0.6689 - val_acc: 0.5705
Epoch 9/20
 - 2s - loss: 0.5167 - acc: 0.8110 - val_loss: 0.6624 - val_acc: 0.6107
Epoch 10/20
 - 2s - loss: 0.3984 - acc: 0.8661 - val_loss: 0.6880 - val_acc: 0.6376
Epoch 11/20
 - 2s - loss: 0.3168 - acc: 0.8963 - val_loss: 0.7435 - val_acc: 0.6007
Epoch 12/20
 - 2s - loss: 0.2341 - acc

In [46]:
best_model = load_model('exp_2a_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5617


In [47]:
y_pred_rnn = best_model.predict(x_test)

In [48]:
y_t = y_test.reshape(len(y_test),1)

In [49]:
exp_2a_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_2a_predictions_df = pd.DataFrame(exp_2a_predictions)
exp_2a_predictions_df.to_csv('exp_2a_predictions.csv')

# Experiments for different models: 2(b) - Convote

In [50]:
exp_2b_data = convote_sequences
exp_2b_labels = convote_labels

In [51]:
x_train, x_test, y_train, y_test = train_test_split(exp_2b_data, exp_2b_labels, test_size=0.2, random_state=42)

## LSTM

In [52]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_2b_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 12s - loss: 0.6938 - acc: 0.5145 - val_loss: 0.6882 - val_acc: 0.5741
Epoch 2/10
 - 8s - loss: 0.6693 - acc: 0.6360 - val_loss: 0.6479 - val_acc: 0.6717
Epoch 3/10
 - 8s - loss: 0.5967 - acc: 0.7278 - val_loss: 0.6011 - val_acc: 0.7003
Epoch 4/10
 - 8s - loss: 0.5251 - acc: 0.7813 - val_loss: 0.5890 - val_acc: 0.7037
Epoch 5/10
 - 8s - loss: 0.4696 - acc: 0.8057 - val_loss: 0.5912 - val_acc: 0.6919
Epoch 6/10
 - 8s - loss: 0.4337 - acc: 0.8188 - val_loss: 0.6217 - val_acc: 0.6987
Epoch 7/10
 - 8s - loss: 0.3978 - acc: 0.8371 - val_loss: 0.6200 - val_acc: 0.7037


In [54]:
best_model = load_model('exp_2b_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6779


In [55]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [56]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [57]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_2b_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/10
 - 11s - loss: 0.6900 - acc: 0.5385 - val_loss: 0.6754 - val_acc: 0.6178
Epoch 2/10
 - 7s - loss: 0.6186 - acc: 0.6692 - val_loss: 0.5931 - val_acc: 0.6633
Epoch 3/10
 - 7s - loss: 0.5295 - acc: 0.7349 - val_loss: 0.5773 - val_acc: 0.6987
Epoch 4/10
 - 7s - loss: 0.4757 - acc: 0.7808 - val_loss: 0.5823 - val_acc: 0.7054
Epoch 5/10
 - 7s - loss: 0.4372 - acc: 0.8040 - val_loss: 0.5968 - val_acc: 0.6987
Epoch 6/10
 - 7s - loss: 0.4031 - acc: 0.8253 - val_loss: 0.6051 - val_acc: 0.6936


In [58]:
best_model = load_model('exp_2b_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6725


In [59]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [60]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_2b_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 5341 samples, validate on 594 samples
Epoch 1/20
 - 7s - loss: 0.7020 - acc: 0.5033 - val_loss: 0.6912 - val_acc: 0.5387
Epoch 2/20
 - 4s - loss: 0.6910 - acc: 0.5280 - val_loss: 0.6877 - val_acc: 0.5673
Epoch 3/20
 - 4s - loss: 0.6866 - acc: 0.5553 - val_loss: 0.6853 - val_acc: 0.5774
Epoch 4/20
 - 4s - loss: 0.6785 - acc: 0.5896 - val_loss: 0.6789 - val_acc: 0.6010
Epoch 5/20
 - 4s - loss: 0.6598 - acc: 0.6227 - val_loss: 0.6686 - val_acc: 0.5909
Epoch 6/20
 - 4s - loss: 0.6255 - acc: 0.6688 - val_loss: 0.6442 - val_acc: 0.6195
Epoch 7/20
 - 4s - loss: 0.5762 - acc: 0.7092 - val_loss: 0.6254 - val_acc: 0.6582
Epoch 8/20
 - 4s - loss: 0.5390 - acc: 0.7309 - val_loss: 0.6410 - val_acc: 0.6162
Epoch 9/20
 - 4s - loss: 0.4923 - acc: 0.7682 - val_loss: 0.6259 - val_acc: 0.6599
Epoch 10/20
 - 4s - loss: 0.4640 - acc: 0.7834 - val_loss: 0.6344 - val_acc: 0.6616
Epoch 11/20
 - 4s - loss: 0.4413 - acc: 0.7969 - val_loss: 0.6521 - val_acc: 0.6448
Epoch 12/20
 - 4s - loss: 0.4213 - acc

In [62]:
best_model = load_model('exp_2b_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.6348


In [63]:
y_pred_rnn = best_model.predict(x_test)

In [64]:
y_t = y_test.reshape(len(y_test),1)

In [65]:
exp_2b_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_2b_predictions_df = pd.DataFrame(exp_2b_predictions)
exp_2b_predictions_df.to_csv('exp_2b_predictions.csv')

In [66]:
y_pred_labels = []

for pred in y_pred_lstm:
    if pred > 0.5:
        y_pred_labels.append(1)
    else:
        y_pred_labels.append(0)

y_pred_labels = np.array(y_pred_labels, dtype=object)

tp = 0
fp = 0
tn = 0
fn = 0

for t, p in zip(y_test, y_pred_labels):
    if t == 0 and p == 0:
        tp += 1
    elif t == 0 and p == 1:
        fp += 1
    elif t == 1 and p == 0:
        fn += 1
    else:
        tn += 1 

print 'LSTM IBC'       
print tp, fp, fn, tn

LSTM IBC
461 274 204 545


In [67]:
y_pred_labels = []

for pred in y_pred_gru:
    if pred > 0.5:
        y_pred_labels.append(1)
    else:
        y_pred_labels.append(0)

y_pred_labels = np.array(y_pred_labels, dtype=object)

tp = 0
fp = 0
tn = 0
fn = 0

for t, p in zip(y_test, y_pred_labels):
    if t == 0 and p == 0:
        tp += 1
    elif t == 0 and p == 1:
        fp += 1
    elif t == 1 and p == 0:
        fn += 1
    else:
        tn += 1 

print 'GRU IBC'       
print tp, fp, fn, tn
        

GRU IBC
493 242 244 505


In [68]:
y_pred_labels = []

for pred in y_pred_rnn:
    if pred > 0.5:
        y_pred_labels.append(1)
    else:
        y_pred_labels.append(0)

y_pred_labels = np.array(y_pred_labels, dtype=object)

tp = 0
fp = 0
tn = 0
fn = 0

for t, p in zip(y_test, y_pred_labels):
    if t == 0 and p == 0:
        tp += 1
    elif t == 0 and p == 1:
        fp += 1
    elif t == 1 and p == 0:
        fn += 1
    else:
        tn += 1 

print 'SimpleRNN IBC'       
print tp, fp, fn, tn

SimpleRNN IBC
476 259 283 466
