In [87]:
import nltk
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
from keras import backend as K

In [18]:
# Fetch every NLTK resource this notebook reads: the WordNet data used by the
# lemmatizer, and the stop-word lists read by stopwords.words('english').
# Without the 'stopwords' download a fresh environment raises LookupError.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Nisha
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Nisha
[nltk_data]     Yadav\AppData\Roaming\nltk_data...


True

In [79]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, MaxPooling1D,Conv1D, GlobalMaxPooling1D,Flatten, Dropout
from keras.optimizers import Adam

In [33]:
# Read the LIAR statements from the CSV file into a DataFrame.
liar_data = pd.read_csv(filepath_or_buffer='data.csv')

In [34]:
# Preview the first five rows of the raw frame.
liar_data.head(n=5)

Unnamed: 0,text,speaker,label
0,Says the Annies List political group supports ...,dwayne-bohac,false
1,When did the decline of coal start? It started...,scott-surovell,half-true
2,The economic turnaround started at the end of ...,charlie-crist,half-true
3,The Chicago Bears have had more starting quart...,robin-vos,true
4,Jim Dunnam has not lived in the district he re...,republican-party-texas,barely-true


In [35]:
# Collect NLTK's English stop words into a set for membership tests.
stop_words = {*stopwords.words('english')}


In [36]:
# Instantiate the Porter stemmer and the WordNet lemmatizer used by the text cleaner.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [37]:
# function to preprocess text data
def preprocess_text(text):
    """Normalise one statement into a space-separated string of cleaned tokens.

    Steps, in order:
      1. delete every character that is not a-z, A-Z, 0-9 or whitespace;
      2. lowercase the result;
      3. drop tokens that appear in ``stop_words``;
      4. stem each remaining token with ``stemmer``;
      5. lemmatize each stem with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw statement text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # convert to lowercase
    text = text.lower()

    # Work on a token list so the text is split once instead of being
    # re-joined and re-split between every stage.
    tokens = [word for word in text.split() if word not in stop_words]

    # perform stemming
    tokens = [stemmer.stem(word) for word in tokens]

    # perform lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

In [38]:
# Run preprocess_text over every statement and store the result back in the 'text' column.
liar_data['text'] = liar_data['text'].map(preprocess_text)

In [39]:
# Preview the first five cleaned statements.
liar_data['text'].head(n=5)

0    say anni list polit group support thirdtrimest...
1    declin coal start start natur ga took start be...
2                     econom turnaround start end term
3    chicago bear start quarterback last 10 year to...
4                 jim dunnam live district repres year
Name: text, dtype: object

In [40]:
# Vocabulary cap: upper bound on the number of words considered.
MAX_NB_WORDS = 50_000

# Fixed length, in tokens, of every input sequence.
MAX_SEQUENCE_LENGTH = 250

In [41]:
# Path to the GloVe word-embeddings text file, relative to the working directory.
# NOTE(review): the file name suggests 100-dimensional vectors — this has to agree
# with the embedding width of 100 used when the embedding matrix is built.
GLOVE_PATH = 'glove.6B.100d.txt'

In [42]:
# Fit a Keras tokenizer on the cleaned statements; it maps words to integer
# indices (not embeddings). Keep its word -> index dictionary for later use.
tokenizer = Tokenizer(lower=True, num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts=liar_data['text'].values)
word_index = tokenizer.word_index

In [43]:
# Turn each cleaned statement into its list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts=liar_data['text'].values)

In [44]:
# Bring every sequence to MAX_SEQUENCE_LENGTH. The Keras defaults are spelled
# out: zeros are added at the front and over-long sequences lose their front.
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)

In [47]:
# load the GloVe word embeddings
# embeddings_index maps each word (first field of a line) to a float32 array
# built from the remaining fields of that line.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [48]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. Words with no GloVe entry keep an all-zero row.
# The matrix has one more row than word_index has entries.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [57]:
# Replace the 'label' column with integer class ids produced by a LabelEncoder;
# the fitted encoder is kept so ids can be mapped back to label names.
encoder = LabelEncoder()
liar_data['label'] = encoder.fit_transform(liar_data['label'])

In [63]:
# One-hot encode the label ids into 6 columns, then hold out 20% of the rows
# as the test split (fixed random_state so the split is repeatable).
labels = to_categorical(liar_data['label'], num_classes=6)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)



In [64]:
# Report the shape of each array produced by the train-test split.
for caption, split in (
    ('Shape of X_train:', X_train),
    ('Shape of X_test:', X_test),
    ('Shape of y_train:', y_train),
    ('Shape of y_test:', y_test),
):
    print(caption, split.shape)

Shape of X_train: (14131, 250)
Shape of X_test: (3533, 250)
Shape of y_train: (14131, 6)
Shape of y_test: (3533, 6)


In [80]:
# CNN sub-model: frozen embedding layer initialised from embedding_matrix,
# followed by Conv1D -> MaxPooling1D -> Flatten -> Dense(128).
cnn_model = Sequential([
    Embedding(
        len(word_index) + 1,
        100,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    ),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
])




In [81]:
# Feature-extraction sub-model: a single 128 -> 128 dense layer with ReLU.
fc_model = Sequential([
    Dense(128, input_dim=128, activation='relu'),
])

In [82]:
# Classification sub-model: a single 128 -> 6 dense layer with softmax output.
classify_model = Sequential([
    Dense(6, input_dim=128, activation='softmax'),
])

In [107]:
# define the margin loss function
def margin_loss(y_true, y_pred):
    """Margin loss computed from per-class targets and predicted scores.

    Where ``y_true`` is 1, the squared shortfall of ``y_pred`` below
    ``m`` (0.4) is penalised. Where ``y_true`` is 0, the squared excess of
    ``y_pred`` above ``1 - m`` (0.6) is penalised with weight 0.5. The
    per-class terms are summed over axis 1 and the mean of those sums is
    returned.

    NOTE(review): the usual capsule-style margin loss sets the true-class
    threshold above the other-class threshold (e.g. 0.9 vs 0.1). Here it is
    the other way round (0.4 vs 0.6) — confirm this is intended.
    """
    m = 0.4
    true_class_term = y_true * K.square(K.maximum(0., m - y_pred))
    other_class_term = 0.5 * (1 - y_true) * K.square(K.maximum(0., y_pred - (1 - m)))
    per_class = true_class_term + other_class_term
    return K.mean(K.sum(per_class, 1))

In [108]:
# Chain the three sub-models into one network, then compile it with the
# margin loss, an Adam optimizer (learning rate 0.001) and accuracy tracking.
model = Sequential()
for stage in (cnn_model, fc_model, classify_model):
    model.add(stage)
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss=margin_loss, metrics=['accuracy'])

In [109]:
# Train for 10 epochs in batches of 64; the held-out split is passed as
# validation data.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2db2cf8a100>

In [110]:
# Score the trained model on the held-out split and report both metrics.
loss, accuracy = model.evaluate(X_test, y_test)
for caption, value in (('Test Loss:', loss), ('Test Accuracy:', accuracy)):
    print(caption, value)

Test Loss: 0.045831963419914246
Test Accuracy: 0.4851401150226593
