In [1]:
#Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Scikit learn 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# NLP 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import re

In [5]:
#Misc
from six.moves import cPickle
import pickle
import itertools
from collections import Counter
import glob

In [6]:
from my_functions import plot_history
from my_functions import clean_text
from my_functions import avg_word_len
#from my_functions import perf_results

## IBC data

In [7]:
[lib,con,neutral]= pickle.load(open('ibcData.pkl','rb'))

In [8]:
liberal = []
for tree in lib:
    liberal.append(tree.get_words())
conservative = []
for tree in con:
    conservative.append(tree.get_words())
neu = []
for tree in neutral:
    neu.append(tree.get_words())

In [9]:
liberals = pd.DataFrame(liberal,columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative,columns=['text'])
conservatives['label'] = 1
neutrals = pd.DataFrame(neu,columns=['text'])
neutrals['label'] = 2

In [10]:
frames = [liberals,conservatives]
result = pd.concat(frames)

In [11]:
result['text'] = result['text'].map(lambda x: clean_text(x))
result = result.sample(frac=1).reset_index(drop=True)
my_ibc_data = result

In [12]:
my_ibc_data['word_count'] = my_ibc_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_ibc_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 85487)


In [13]:
my_ibc_data['char_count'] = my_ibc_data['text'].str.len()
overall_char_count = np.sum(my_ibc_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 628655)


In [14]:
my_ibc_data['avg_word_length'] = my_ibc_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_ibc_data['avg_word_length'].values)/len(my_ibc_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_ibc_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,label,word_count,char_count,avg_word_length
0,individual city including boston chicago los a...,0,23,172,6
1,1967 1972 13 state adopted reform mostly permi...,0,24,168,6


In [15]:
word_count_each_sentence = np.array(my_ibc_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  22.943370907139023
median:  22.0


## Convote data

In [16]:
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'

In [17]:
convote_train_files = glob.glob(convote_train_files_path)
convote_test_files = glob.glob(convote_test_files_path)

In [18]:
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [19]:
convote_data = []

for data_type, filenames in filepath_dict.items():
    for i in range(len(filenames)):
        f = open(filenames[i], 'r')
        f_text = f.read()
        f.close()
        party = filenames[i].split('_')[-1][0]
        sample_group = data_type.split('_')[-1]
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])
      

In [20]:
convote_data = pd.DataFrame(convote_data)

In [21]:
convote_data = convote_data.rename(index=str, columns={0: 'text', 1: 'party', 2: 'group', 3: 'party_label'})

In [22]:
convote_data['text'] = convote_data['text'].map(lambda x: clean_text(x))

In [23]:
my_convote_data = pd.DataFrame(convote_data.iloc[:,[0,3]].values)
my_convote_data = my_convote_data.rename(index=str, columns={0: 'text', 1: 'party_label'})

In [24]:
my_convote_data['word_count'] = my_convote_data['text'].apply(lambda x: len(str(x).split(" ")))
overall_word_count = np.sum(my_convote_data['word_count'].values)
print("Overall word count", overall_word_count)

('Overall word count', 1016800)


In [25]:
my_convote_data['char_count'] = my_convote_data['text'].str.len()
overall_char_count = np.sum(my_convote_data['char_count'].values)
print("Overall char count", overall_char_count)

('Overall char count', 7330698)


In [26]:
my_convote_data['avg_word_length'] = my_convote_data['text'].apply(lambda x: avg_word_len(x))
overall_word_avg_len = np.sum(my_convote_data['avg_word_length'].values)/len(my_convote_data['avg_word_length'].values)
print("Overall average word length", overall_word_avg_len)
my_convote_data.head(2)

('Overall average word length', 5)


Unnamed: 0,text,party_label,word_count,char_count,avg_word_length
0,mr speaker rise join many colleague strongly o...,0,540,3901,6
1,mr chairman rise support amendment two ground ...,0,114,811,6


In [27]:
word_count_each_sentence = np.array(my_convote_data['word_count'].values)
print 'word count per sentence'
print 'mean: ', np.mean(word_count_each_sentence)
print 'median: ', np.median(word_count_each_sentence)

word count per sentence
mean:  137.0535112548861
median:  42.0


## Overall data

In [28]:
ibc = pd.DataFrame(my_ibc_data.iloc[:,[0,1]].values)
convote = pd.DataFrame(my_convote_data.iloc[:,[0,1]].values)

In [29]:
overall_data = [ibc, convote]
overall_data = pd.concat(overall_data)

In [30]:
ibc_text = np.array(my_ibc_data.iloc[:,0].values)
ibc_labels = np.array(my_ibc_data.iloc[:,1].values)

convote_text = np.array(my_convote_data.iloc[:,0].values)
convote_labels = np.array(my_convote_data.iloc[:,1].values)

overall_text = np.array(overall_data.iloc[:,0].values)
overall_labels = np.array(overall_data.iloc[:,1].values)

## Feature extraction

In [31]:
total_word_count = 50000
seq_length = 20 #Number of items in each sequence

tokenizer = Tokenizer(num_words=total_word_count)
tokenizer.fit_on_texts(convote_text)

In [32]:
ibc_sequences = tokenizer.texts_to_sequences(ibc_text)
ibc_sequences = pad_sequences(ibc_sequences, maxlen=seq_length)

In [33]:
convote_sequences = tokenizer.texts_to_sequences(convote_text)
convote_sequences = pad_sequences(convote_sequences, maxlen=seq_length)

In [34]:
x_train = convote_sequences
y_train = convote_labels

x_test = ibc_sequences
y_test = ibc_labels

# Experiments for different models: 3 - Training on convote and testing on IBC

## LSTM

In [35]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_3_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/10
 - 12s - loss: 0.7031 - acc: 0.5064 - val_loss: 0.6857 - val_acc: 0.5216
Epoch 2/10
 - 10s - loss: 0.6512 - acc: 0.6722 - val_loss: 0.6331 - val_acc: 0.6765
Epoch 3/10
 - 10s - loss: 0.5435 - acc: 0.7797 - val_loss: 0.6246 - val_acc: 0.6563
Epoch 4/10
 - 10s - loss: 0.4518 - acc: 0.8215 - val_loss: 0.6149 - val_acc: 0.6806
Epoch 5/10
 - 10s - loss: 0.3922 - acc: 0.8492 - val_loss: 0.6581 - val_acc: 0.6712
Epoch 6/10
 - 10s - loss: 0.3451 - acc: 0.8676 - val_loss: 0.6855 - val_acc: 0.6698
Epoch 7/10
 - 10s - loss: 0.3155 - acc: 0.8788 - val_loss: 0.6926 - val_acc: 0.6873


In [37]:
best_model = load_model('exp_3_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5448


In [38]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [39]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_3_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/10
 - 11s - loss: 0.6844 - acc: 0.5480 - val_loss: 0.6455 - val_acc: 0.6375
Epoch 2/10
 - 9s - loss: 0.5737 - acc: 0.7181 - val_loss: 0.5939 - val_acc: 0.6765
Epoch 3/10
 - 8s - loss: 0.4741 - acc: 0.7836 - val_loss: 0.5859 - val_acc: 0.6941
Epoch 4/10
 - 9s - loss: 0.4109 - acc: 0.8270 - val_loss: 0.5962 - val_acc: 0.7008
Epoch 5/10
 - 9s - loss: 0.3679 - acc: 0.8462 - val_loss: 0.6252 - val_acc: 0.6900
Epoch 6/10
 - 8s - loss: 0.3254 - acc: 0.8651 - val_loss: 0.6678 - val_acc: 0.6712


In [41]:
best_model = load_model('exp_3_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5437


In [42]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [43]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_3_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 6677 samples, validate on 742 samples
Epoch 1/20
 - 6s - loss: 0.6944 - acc: 0.5139 - val_loss: 0.6900 - val_acc: 0.5404
Epoch 2/20
 - 4s - loss: 0.6863 - acc: 0.5558 - val_loss: 0.6792 - val_acc: 0.5876
Epoch 3/20
 - 5s - loss: 0.6701 - acc: 0.6054 - val_loss: 0.6649 - val_acc: 0.5916
Epoch 4/20
 - 4s - loss: 0.6285 - acc: 0.6614 - val_loss: 0.6461 - val_acc: 0.6078
Epoch 5/20
 - 4s - loss: 0.5790 - acc: 0.6972 - val_loss: 0.6405 - val_acc: 0.6321
Epoch 6/20
 - 5s - loss: 0.5376 - acc: 0.7265 - val_loss: 0.6543 - val_acc: 0.6213
Epoch 7/20
 - 4s - loss: 0.4978 - acc: 0.7502 - val_loss: 0.6647 - val_acc: 0.6051
Epoch 8/20
 - 4s - loss: 0.4591 - acc: 0.7816 - val_loss: 0.6785 - val_acc: 0.6334
Epoch 9/20
 - 4s - loss: 0.4229 - acc: 0.8025 - val_loss: 0.6812 - val_acc: 0.6469
Epoch 10/20
 - 4s - loss: 0.3911 - acc: 0.8221 - val_loss: 0.7055 - val_acc: 0.6348


In [45]:
best_model = load_model('exp_3_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5115


In [46]:
y_pred_rnn = best_model.predict(x_test)

In [47]:
y_t = y_test.reshape(len(y_test),1)

In [48]:
exp_3_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_3_predictions_df = pd.DataFrame(exp_3_predictions)
exp_3_predictions_df.to_csv('exp_3_predictions.csv')

# Experiments for different models: 4 - Training on IBC and testing on Convote

In [49]:
x_train = ibc_sequences
y_train = ibc_labels

x_test = convote_sequences
y_test = convote_labels

## LSTM

In [50]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(5, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_4_lstm.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/10
 - 9s - loss: 0.6972 - acc: 0.4996 - val_loss: 0.6825 - val_acc: 0.5764
Epoch 2/10
 - 5s - loss: 0.6687 - acc: 0.6036 - val_loss: 0.6576 - val_acc: 0.6461
Epoch 3/10
 - 5s - loss: 0.5843 - acc: 0.7853 - val_loss: 0.6413 - val_acc: 0.6488
Epoch 4/10
 - 5s - loss: 0.4727 - acc: 0.8503 - val_loss: 0.6941 - val_acc: 0.6113
Epoch 5/10
 - 5s - loss: 0.3678 - acc: 0.8983 - val_loss: 0.7211 - val_acc: 0.6113
Epoch 6/10
 - 5s - loss: 0.2968 - acc: 0.9231 - val_loss: 0.7675 - val_acc: 0.6032


In [52]:
best_model = load_model('exp_4_lstm.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5073


In [53]:
y_pred_lstm = best_model.predict(x_test)

## GRU

In [54]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(GRU(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='exp_4_gru.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=10, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/10
 - 9s - loss: 0.7016 - acc: 0.5004 - val_loss: 0.6829 - val_acc: 0.5764
Epoch 2/10
 - 4s - loss: 0.6564 - acc: 0.6281 - val_loss: 0.6460 - val_acc: 0.6300
Epoch 3/10
 - 4s - loss: 0.5128 - acc: 0.7859 - val_loss: 0.6702 - val_acc: 0.6032
Epoch 4/10
 - 4s - loss: 0.3759 - acc: 0.8697 - val_loss: 0.7206 - val_acc: 0.6166
Epoch 5/10
 - 4s - loss: 0.2976 - acc: 0.9058 - val_loss: 0.8123 - val_acc: 0.6032


In [56]:
best_model = load_model('exp_4_gru.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5166


In [57]:
y_pred_gru = best_model.predict(x_test)

## Simple RNN

In [58]:
model = Sequential()
model.add(Embedding(total_word_count, seq_length, input_length=seq_length))
model.add(SimpleRNN(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [59]:
## Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='exp_4_rnn.h5', monitor='val_loss', save_best_only=True)]

history = model.fit(x_train, y_train, validation_split=0.1, epochs=20, callbacks = callbacks, verbose = 2)

Train on 3353 samples, validate on 373 samples
Epoch 1/20
 - 6s - loss: 0.7371 - acc: 0.4605 - val_loss: 0.6883 - val_acc: 0.5764
Epoch 2/20
 - 2s - loss: 0.6930 - acc: 0.5291 - val_loss: 0.6814 - val_acc: 0.5764
Epoch 3/20
 - 2s - loss: 0.6903 - acc: 0.5347 - val_loss: 0.6829 - val_acc: 0.5764
Epoch 4/20
 - 2s - loss: 0.6837 - acc: 0.5517 - val_loss: 0.6802 - val_acc: 0.5764
Epoch 5/20
 - 2s - loss: 0.6683 - acc: 0.6129 - val_loss: 0.6796 - val_acc: 0.5764
Epoch 6/20
 - 2s - loss: 0.6451 - acc: 0.6618 - val_loss: 0.6805 - val_acc: 0.5657
Epoch 7/20
 - 2s - loss: 0.5913 - acc: 0.7408 - val_loss: 0.6839 - val_acc: 0.5657
Epoch 8/20
 - 2s - loss: 0.5073 - acc: 0.7966 - val_loss: 0.7059 - val_acc: 0.5603
Epoch 9/20
 - 2s - loss: 0.4112 - acc: 0.8384 - val_loss: 0.7491 - val_acc: 0.5416
Epoch 10/20
 - 2s - loss: 0.3360 - acc: 0.8729 - val_loss: 0.8206 - val_acc: 0.5710


In [60]:
best_model = load_model('exp_4_rnn.h5')
test_loss, test_accuracy = best_model.evaluate(x_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(test_accuracy))

Testing Accuracy:  0.5098


In [61]:
y_pred_rnn = best_model.predict(x_test)

In [62]:
y_t = y_test.reshape(len(y_test),1)

In [63]:
exp_4_predictions = np.concatenate((y_pred_lstm, y_pred_gru, y_pred_rnn, y_t), axis=1)
exp_4_predictions_df = pd.DataFrame(exp_4_predictions)
exp_4_predictions_df.to_csv('exp_4_predictions.csv')