In [1]:
import codecs
from os import path
import pandas as pd
import pdb
import logging
import re
from gensim import parsing
import gensim
from gensim.parsing.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import classification_report

In [2]:
training_file = path.join('semeval2016-task6-trainingdata.txt')
training_data = pd.read_csv(training_file, header=0, delimiter='\t')

In [3]:
testing_file = path.join('SemEval2016-Task6-subtaskA-testdata-gold.txt')
testing_data = pd.read_csv(testing_file,header=0, delimiter='\t')

In [4]:
def preprocess(text):
    #convert text to lower case
    text = text.lower()
   
    #removing whitespace
    text.strip()
   
    #removing digits
    text = gensim.parsing.preprocessing.strip_numeric(text)
    #text = ' '.join(s for s in text.split() if not any(c.isdigit() for c in s))
    
    #remove stopwords
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    
    #strip punctutation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
    
    #strip multiple whitepsace that might occur after we remove stopwords
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)

    p = PorterStemmer()
    
    text = ' '.join(p.stem(word) for word in text.split())    

    #print(text)
    
    return text

In [5]:
training_data['Tweet'] = training_data['Tweet'].map(preprocess)

In [6]:
testing_data['Tweet'] = testing_data['Tweet'].map(preprocess)

In [7]:
testing_data

Unnamed: 0,ID,Target,Tweet,Stance
0,10001,Atheism,exalt shall humbl humbl shall exalt matt semst,AGAINST
1,10002,Atheism,rt prayerbullet remov nehushtan previou move g...,AGAINST
2,10003,Atheism,brainman heidtjj benjaminl sought truth soul s...,AGAINST
3,10004,Atheism,god utterli powerless human intervent semst,AGAINST
4,10005,Atheism,david cameron miracl multicultur miracl shadi ...,AGAINST
...,...,...,...,...
1244,11245,Legalization of Abortion,metalheadmonti tom six follow watch human cent...,NONE
1245,11246,Legalization of Abortion,aveng blood rememb ignor afflict ps comequickl...,AGAINST
1246,11247,Legalization of Abortion,life sacr level abort comput philosophi kate m...,AGAINST
1247,11248,Legalization of Abortion,ravensymon u refer we you minor idiot support ...,AGAINST


In [8]:
training_data

Unnamed: 0,ID,Target,Tweet,Stance
0,101,Atheism,dear lord thank u ur bless forgiv sin lord str...,AGAINST
1,102,Atheism,bless peacemak shall call children god matthew...,AGAINST
2,103,Atheism,conform world transform renew mind ispeaklif g...,AGAINST
3,104,Atheism,salah prai focu understand allah warn lazi pra...,AGAINST
4,105,Atheism,stai hous displai like time ignor quran islam ...,AGAINST
...,...,...,...,...
2809,2910,Legalization of Abortion,there s law protect unborn eagl human uh idk y...,AGAINST
2810,2911,Legalization of Abortion,abort abortionondemand menstruationmatt semst,AGAINST
2811,2912,Legalization of Abortion,dare sexual prefer choic dare dismemb preborn ...,AGAINST
2812,2913,Legalization of Abortion,equal right born wai right born liberallog lib...,AGAINST


# Internal decomposition of training data

In [9]:
df=training_data

In [10]:
df['Tweet'] = df.Tweet + ' ' + df.Target

In [11]:
df['Tweet'][0]

'dear lord thank u ur bless forgiv sin lord strength energi busi dai ahead bless hope semst Atheism'

In [12]:
train, test = train_test_split(df, test_size=0.3)
train0, val = train_test_split(train, test_size=0.3)

In [13]:
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 100
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, 
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 
                      split=" ",
                      lower=True)
tokenizer.fit_on_texts(df['Tweet'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7184 unique tokens.


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X = tokenizer.texts_to_sequences(train0['Tweet'].values)
X_train = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
max_len = 0
for t in X:
    if max_len < len(t):
        max_len = len(t)
X1 = tokenizer.texts_to_sequences(test['Tweet'].values)
for t in X1:
    if max_len < len(t):
        max_len = len(t)
X_test = pad_sequences(X1, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X_test.shape)

Shape of data tensor: (845, 100)


In [15]:
X2 = tokenizer.texts_to_sequences(val['Tweet'].values)
X_val = pad_sequences(X2, maxlen=max_len)

In [16]:
Y_train = pd.get_dummies(train['Stance']).values
Y_test = pd.get_dummies(test['Stance']).values
Y_val = pd.get_dummies(val['Stance']).values
print('Shape of label tensor:', Y_train.shape)

Shape of label tensor: (1969, 3)


In [17]:
Y_train = train0['Stance']
Y_test = test['Stance']
Y_val = val['Stance']

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping
import tensorflow as tf

In [19]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [20]:
epochs = 5
batch_size = 64

print(model.summary())
tf.config.experimental_run_functions_eagerly(True)
history = model.fit(X_train, Y_train, 
                    epochs=epochs, 
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 5,080,703
Trainable params: 5,080,703
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.




Epoch 1/5


UnimplementedError: Cast string to float is not supported [Op:Cast]

In [21]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

UnimplementedError: Cast string to float is not supported [Op:Cast]

# Test data using gold tags

In [22]:
training_data['Tweet'] = training_data.Tweet + ' ' + training_data.Target
training_data['Tweet'] = training_data['Tweet'].str.replace('\d+', '')
testing_data['Tweet'] = testing_data.Tweet + ' ' + testing_data.Target
testing_data['Tweet'] = testing_data['Tweet'].str.replace('\d+', '')

  training_data['Tweet'] = training_data['Tweet'].str.replace('\d+', '')
  testing_data['Tweet'] = testing_data['Tweet'].str.replace('\d+', '')


In [23]:
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 100
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(training_data['Tweet'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7184 unique tokens.


In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
train = tokenizer.texts_to_sequences(training_data['Tweet'].values)
X_train= pad_sequences(train, maxlen=MAX_SEQUENCE_LENGTH)
test = tokenizer.texts_to_sequences(testing_data['Tweet'].values)
X_test = pad_sequences(test, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X_test.shape)

Shape of data tensor: (1249, 100)


In [25]:
Y_train = pd.get_dummies(training_data['Stance']).values
Y_test = pd.get_dummies(testing_data['Stance']).values
print('Shape of label tensor:', Y_test.shape)

Shape of label tensor: (1249, 3)


In [26]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

print(model.summary())

model.fit(X_train, Y_train, 
                    epochs=epochs, 
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          5000000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 100, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 5,117,635
Trainable params: 5,117,635
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16d5386cd90>

In [None]:
def predict_proba(X, model, num_samples):
    preds = [model(X, training=True) for _ in range(num_samples)]
    return np.stack(preds).mean(axis=0)
     
def predict_class(X, model, num_samples):
    proba_preds = predict_proba(X, model, num_samples)
    return np.argmax(proba_preds, axis=1)

In [None]:
y_pred = predict_class(X_test, model, 100)


In [None]:
y_pred

In [None]:
Y_test

In [None]:
acc = np.mean(y_pred == Y_test)

In [None]:
acc

In [None]:
X_test.shape

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
history.history.keys()

In [None]:
import matplotlib.pyplot as plt

plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();

# LSTM+CNN with gold tags

In [None]:
from keras.layers import Embedding
from keras.models import Sequential, Model
from keras.layers import Dense, Activation
from keras.layers import Flatten, Conv1D, SpatialDropout1D, MaxPooling1D,AveragePooling1D, Bidirectional, concatenate, Input, Dropout, LSTM
# from keras.layers import merge


y_dim=3
num_filters=200
filter_sizes=[3,4,5] 
pool_padding='valid' 
dropout=0.5

embed_input = Input(shape=(X_train.shape[1],))
x = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1])(embed_input)
pooled_outputs = []
for i in range(len(filter_sizes)):
    conv = Conv1D(num_filters, kernel_size=filter_sizes[i], padding='valid', activation='relu')(x)
    conv = MaxPooling1D(pool_size=EMBEDDING_DIM-filter_sizes[i]+1)(conv)           
    pooled_outputs.append(conv)
merge = concatenate(pooled_outputs)
    


In [None]:
from keras.callbacks import EarlyStopping
x = Dense(30, activation='relu')(merge)
x = Dropout(dropout)(x)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.1))(x)
x = Dense(30, activation='relu')(x)
x = Dropout(dropout)(x)
x = Flatten()(x)
x = Dense(y_dim, activation='sigmoid')(x)

model = Model(inputs=embed_input,outputs=x)
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

print(model.summary())

epochs = 3
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
import matplotlib.pyplot as plt

plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();