In [1]:
import pandas as pd
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('C:\\Users\\Moaz\\Desktop\\moaz\\WORK\\text and sentiment\\PEC3\\OPINIONS-TAGGED-FAKE.csv', sep='\t')
df.head()

Unnamed: 0,OPINION,TAG
0,I recently stayed at the Affina Chicago hotel ...,FAKENEG
1,I stayed at the Affina Chicago for my annivers...,FAKENEG
2,If you are looking for a high end hotel on the...,FAKENEG
3,I just returned from a long weekend in Chicago...,FAKENEG
4,My wife and I stayed at the Affinia Chicago la...,FAKENEG


There are only 4 data labels

In [3]:
df['TAG'].unique()

array(['FAKENEG', 'FAKEPOS', 'TRUENEG', 'TRUEPOS'], dtype=object)

I have extracted data from dataframe to list.

In [4]:
data = []
data_labels = []

opinions = df['OPINION'].tolist()
tags = df['TAG'].tolist()

for i in range(len(opinions)): 
    data.append(opinions[i]) 
    data_labels.append(tags[i])

I have encoded data label into label encoded values and one hot encoded values. I will be using integer values.

In [5]:
values = array(data_labels)
print(values)
print('------------------------------------')

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
print('------------------------------------')

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

['FAKENEG' 'FAKENEG' 'FAKENEG' ... 'TRUEPOS' 'TRUEPOS' 'TRUEPOS']
------------------------------------
[0 0 0 ... 3 3 3]
------------------------------------
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


To create embedding I have used glove.42B.300d, from Glove. I have also padded the data to make them of equal size.

In [19]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

word_index = tokenizer.word_index
vocab_size=len(word_index)
print('Vocab size:',vocab_size)



embeddings_index = {};
with open('C:\\Users\\Moaz\\Desktop\\GLove\\glove.42B.300d.txt', 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;
print(len(coefs))

embeddings_matrix = np.zeros((vocab_size+1, 300));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;


        
# Padding data
sequences = tokenizer.texts_to_sequences(data)
padded = pad_sequences(sequences, maxlen=500, padding='post', truncating='post')
print('Padded length:',len(padded))
split = 0.2
split_n = int(round(len(padded)*(1-split),0))
print('Split:',split_n)

train_data = padded[:split_n]
train_labels = integer_encoded[:split_n]
test_data = padded[split_n:]
test_labels = integer_encoded[split_n:]

Vocab size: 9838
300
Padded length: 1600
Split: 1280


In [34]:
print('Data: ',len(data))
print('Padded length:',padded.shape)
print('Train Data: ',train_data.shape)
print('Train label: ',train_labels.shape)
print('Test Data: ',test_data.shape)
print('Test label: ',test_labels.shape)
print('Embedding matrix: ',len(embeddings_matrix))

Data:  1600
Padded length: (1600, 500)
Train Data:  (1280, 500)
Train label:  (1280, 1)
Test Data:  (320, 500)
Test label:  (320, 1)
Embedding matrix:  9839


In [35]:
#############################################
# SOLUCIÓN                                  #
#############################################
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import optimizers
from keras.layers import Flatten

model = Sequential()
model.add(Embedding(vocab_size+1, 300, weights = [embeddings_matrix], input_length=500, trainable=False))
model.add(LSTM(300))
model.add(Dense(4, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

filename = 'model.h1.14_jun_20'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
print(model.summary())

history = model.fit(train_data, train_labels, epochs=10, batch_size=100,
                    validation_split = 0.2, verbose=1, callbacks=[checkpoint])

model = load_model('model.h1.14_jun_20')
preds = model.predict_classes(test_data)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 300)          2951700   
_________________________________________________________________
lstm_13 (LSTM)               (None, 300)               721200    
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 1204      
Total params: 3,674,104
Trainable params: 722,404
Non-trainable params: 2,951,700
_________________________________________________________________
None
Train on 1024 samples, validate on 256 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 3.05133, saving model to model.h1.14_jun_20
Epoch 2/10

Epoch 00002: val_loss did not improve from 3.05133
Epoch 3/10

Epoch 00003: val_loss did not improve from 3.05133
Epoch 4/10

Epoch 00004: val_loss did not improve from 3.05133
Epoch 5/10

Epoch 00005: va

In [36]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.head()

Unnamed: 0,val_loss,loss,epoch
0,3.051326,1.279903,0
1,3.38199,1.099803,1
2,3.206869,1.09142,2
3,3.241063,1.081965,3
4,3.166579,1.078018,4
