In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
import nltk                            # Cleaning the data
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import os
import pandas as pd
import numpy as np


from sklearn import preprocessing
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,SpatialDropout1D,Bidirectional
from keras.utils import to_categorical
from tensorflow.python.client import device_lib 
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [15]:
print(device_lib.list_local_devices()) 

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6895065321436028120
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 686792385667242858
physical_device_desc: "device: XLA_CPU device"
]


In [70]:
# nltk.download('punkt')

### Useful Functions

In [24]:
# get train_test split of each target df

def get_traintestsplit(df, test_size=0.2, random_state=42):
    """Split one target's tweets and stance labels into train and validation parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Tweet' column (model input) and a 'Stance' column (label).
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that
        used to be hard-coded here.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Defaults to 42, the value that
        used to be hard-coded here, so existing calls give the same split.

    Returns
    -------
    list
        ``[X_train, X_val, Y_train, Y_val]`` exactly as produced by
        ``train_test_split``.
    """
    return train_test_split(
        df['Tweet'],
        df['Stance'],
        test_size=test_size,
        random_state=random_state,
    )

In [32]:
# max length of the tweet to fix our padding

def get_maxlength_tweet(df):
    """Return the token count of the longest tweet in ``df['Tweet']``.

    Every tweet is tokenized with ``word_tokenize`` and the largest token
    count is returned as a plain ``int``.

    An empty frame yields 0; the previous implementation called ``np.max``
    on an empty list, which raises ``ValueError``.
    """
    return max((len(word_tokenize(text)) for text in df['Tweet']), default=0)

### Data Handling

In [17]:
# Load the pickled training data from a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
trainData = pd.read_pickle('Pickle_files/train_data.pkl')

In [18]:
# Load the pickled test data from a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
testData = pd.read_pickle('Pickle_files/test_data.pkl')

In [19]:
trainData.shape

(2914, 3)

In [22]:
trainData.Target.value_counts()

Hillary Clinton                     689
Feminist Movement                   664
Legalization of Abortion            653
Atheism                             513
Climate Change is a Real Concern    395
Name: Target, dtype: int64

In [28]:
# One sub-frame per stance target, each selected with a boolean mask on 'Target'.

hc_df = trainData.loc[trainData['Target'].eq('Hillary Clinton')]
fm_df = trainData.loc[trainData['Target'].eq('Feminist Movement')]
la_df = trainData.loc[trainData['Target'].eq('Legalization of Abortion')]
at_df = trainData.loc[trainData['Target'].eq('Atheism')]
cc_df = trainData.loc[trainData['Target'].eq('Climate Change is a Real Concern')]

In [131]:
# at_test =tdf[tdf['Target'] == 'Atheism']

In [132]:
# at_test

Unnamed: 0,Tweet,Target,Stance
0,exalts shall humbled humbles shall exaltedmatt,Atheism,AGAINST
1,prayerbullets remove nehushtan previous move g...,Atheism,AGAINST
2,brainman heidtjj benjaminlives sought truth so...,Atheism,AGAINST
3,god utterly powerless human intervention,Atheism,AGAINST
4,davidcameron miracle multiculturalism miracle ...,Atheism,AGAINST
...,...,...,...
215,afraid apologise using word god analogy explai...,Atheism,AGAINST
216,soon think wwiii wwwiv begin utedwestand endra...,Atheism,FAVOR
217,humble trust god lean trust confident lord hap...,Atheism,AGAINST
218,newhorizons fly pluto newhorizons plutoflyby p...,Atheism,FAVOR


In [29]:
# Train/validation split for each per-target frame. The numeric suffix follows
# the frame order: 1 = hc_df, 2 = fm_df, 3 = la_df, 4 = at_df, 5 = cc_df.

(
    (X_train1, X_val1, Y_train1, Y_val1),
    (X_train2, X_val2, Y_train2, Y_val2),
    (X_train3, X_val3, Y_train3, Y_val3),
    (X_train4, X_val4, Y_train4, Y_val4),
    (X_train5, X_val5, Y_train5, Y_val5),
) = map(get_traintestsplit, (hc_df, fm_df, la_df, at_df, cc_df))

In [33]:
# Longest tweet (in tokens) for each per-target frame, in the order
# hc_df, fm_df, la_df, at_df, cc_df.

(
    max_tweet_len1,
    max_tweet_len2,
    max_tweet_len3,
    max_tweet_len4,
    max_tweet_len5,
) = [get_maxlength_tweet(frame) for frame in (hc_df, fm_df, la_df, at_df, cc_df)]

In [38]:
max_tweet_len5

16

In [41]:
# Hyper-parameters shared by the cells below.
max_feature = 1839  # vocabulary cap: passed to Tokenizer(num_words=...) and to get_emb_matrix
max_word = 350      # sequence length used as maxlen when padding
# NOTE(review): the model.fit call further down passes literal epochs=15 and
# batch_size=32 instead of the two values defined here — confirm which is intended.
batch_size = 128
epochs = 6
num_class = 3       # presumably the number of stance classes — TODO confirm

# Fit the vocabulary on the training tweets of the first split only, then
# replace both splits with their integer index sequences.
# NOTE(review): X_train1 / X_val1 are overwritten in place, so re-running this
# cell would feed the already-encoded sequences back into the tokenizer.
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(list(X_train1))
X_train1 = tokenizer.texts_to_sequences(X_train1)
X_val1 = tokenizer.texts_to_sequences(X_val1)

In [45]:
len(X_val1)

138

In [133]:
# X_test = tokenizer.texts_to_sequences(at_test['Tweet'])

In [46]:
# Bring both encoded splits to a fixed width of max_word columns.
X_train1, X_val1 = (
    sequence.pad_sequences(encoded, maxlen=max_word)
    for encoded in (X_train1, X_val1)
)

In [134]:
# X_test = sequence.pad_sequences(X_test, maxlen=max_word)

In [47]:
print(X_train1.shape,X_val1.shape)

(551, 350) (138, 350)


In [135]:
# NOTE(review): in the visible notebook X_test is only assigned in commented-out
# cells (texts_to_sequences / pad_sequences on at_test['Tweet']); on a fresh
# kernel this line raises NameError unless those cells are restored.
print (X_test.shape)

(220, 350)


In [48]:
def get_coef(word, *arr):
    """Turn the fields of one embedding-file line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')


def get_emb_matrix(EMB_FILE, max_feature, emb_dimension, word_index=None):
    """Build the initial weight matrix for an embedding layer from a text vector file.

    Parameters
    ----------
    EMB_FILE : str
        Path to a UTF-8 text file with one ``word v1 v2 ... vN`` entry per
        line, fields separated by single spaces.
    max_feature : int
        Upper bound on the number of rows (word indices) in the matrix.
    emb_dimension : int
        Expected vector length. Lines whose vector has a different length
        (blank lines, header lines) are skipped.
    word_index : dict, optional
        Mapping ``word -> integer index``. Defaults to the notebook-level
        ``tokenizer.word_index`` so existing three-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_feature, len(word_index) + 1), emb_dimension)``.
        Rows of words found in the file hold the file's vector; all other rows
        keep a random draw from a normal distribution with the mean and
        standard deviation of all loaded vectors.

    Raises
    ------
    ValueError
        If the file contains no vector of length ``emb_dimension``.
    """
    # word vectors -- the context manager closes the file handle, which the
    # previous generator-over-open() form never did
    emb_index = {}
    with open(EMB_FILE, encoding='utf8') as emb_file:
        for line in emb_file:
            word, vector = get_coef(*line.rstrip().rsplit(' '))
            if vector.shape[0] != emb_dimension:
                continue
            emb_index[word] = vector
    print('Found %s word vectors.' % len(emb_index))
    if not emb_index:
        raise ValueError(
            'No %d-dimensional vectors found in %s' % (emb_dimension, EMB_FILE)
        )

    # embedding matrix
    if word_index is None:
        word_index = tokenizer.word_index
    num_words = min(max_feature, len(word_index) + 1)
    # np.stack needs a real sequence; a dict view is rejected by current NumPy
    all_emb = np.stack(list(emb_index.values()))  # for random init
    emb_matrix = np.random.normal(all_emb.mean(), all_emb.std(), (num_words, emb_dimension))

    for word, i in word_index.items():
        if i >= num_words:
            continue
        emb_vector = emb_index.get(word)
        if emb_vector is not None:
            emb_matrix[i] = emb_vector
    return emb_matrix
    

In [49]:
# Embedding file location (relative to the working directory) and vector width.
emb_dimension = 300  # word vector dim
EMB_FILE = '../Embeddings/glove.6B.300d.txt'

emb_matrix = get_emb_matrix(
    EMB_FILE,
    max_feature=max_feature,
    emb_dimension=emb_dimension,
)
print(emb_matrix.shape)

FileNotFoundError: [Errno 2] No such file or directory: '../Embeddings/glove.6B.300d.txt'

In [124]:
# Integer-encode the stance labels, then one-hot encode them for the
# categorical cross-entropy loss used by the model below.
# NOTE(review): Y_train / Y_val are not assigned in any visible cell (the splits
# above produce Y_train1..Y_train5 and Y_val1..Y_val5) — confirm which split is meant.
# NOTE(review): both names are overwritten in place, so this cell is not safe to re-run.
le = preprocessing.LabelEncoder()
le.fit(Y_train)
Y_train = le.transform(Y_train)
Y_val = le.transform(Y_val)

Y_train = to_categorical(Y_train, dtype ="uint8") 
Y_val = to_categorical(Y_val, dtype ="uint8") 

In [151]:
# Encode at_test's stance labels with the already-fitted LabelEncoder.
# NOTE(review): at_test is only created in a commented-out cell above — TODO confirm where it comes from.
Y_test = le.transform(at_test['Stance'])
label_encode_y_test = Y_test  # integer labels, reused later for the confusion matrix
Y_test = to_categorical(Y_test, dtype ="uint8") 

In [125]:
print (max_feature, emb_dimension, X_train.shape[1])

1839 300 350


In [159]:
# Bidirectional-LSTM stance classifier initialised with the pre-trained embedding matrix.
model = Sequential()
# Size the embedding layer from emb_matrix itself: get_emb_matrix returns
# min(max_feature, vocabulary size + 1) rows, so passing max_feature as the
# input size would not match the weights when the vocabulary is smaller.
model.add(Embedding(emb_matrix.shape[0], emb_matrix.shape[1], input_length=X_train.shape[1], weights=[emb_matrix], trainable=True))
model.add(SpatialDropout1D(0.25))
# model.add(Bidirectional(LSTM(128,return_sequences=True)))
model.add(Bidirectional(LSTM(64,return_sequences=False)))
model.add(Dropout(0.5))
# One softmax unit per class; num_class is the constant defined with the other hyper-parameters.
model.add(Dense(num_class, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 350, 300)          551700    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 350, 300)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 350, 256)          439296    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               164352    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total params: 1,155,735
Trainable params: 1,155,735
Non-trainable params: 0
____________________________________________

In [160]:
# Train for 15 epochs in mini-batches of 32, evaluating on the validation split after each epoch.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=15,
    batch_size=32,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [161]:
y_pred = model.predict(X_test)

In [162]:
# Predicted class id = column of the highest score in each row of y_pred.
# A single vectorised argmax replaces the per-row Python loop; .tolist()
# keeps the result a plain list of ints, as the following cells expect.
y_test_predicted = np.argmax(y_pred, axis=1).tolist()

In [163]:
y_test_predicted

[0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 1,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0]

In [164]:
from sklearn.metrics import confusion_matrix

# Compare the integer-encoded true labels with the argmax predictions over class ids 0, 1 and 2.
confusion_matrix(label_encode_y_test.tolist(), y_test_predicted, labels =[0,1,2])

array([[132,  10,  18],
       [ 17,   6,   9],
       [  6,   0,  22]], dtype=int64)

In [None]:
#jkbfdbdjbfd