In [1]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer 

from sklearn.model_selection import train_test_split

In [2]:
fake_raw_data = pd.read_csv('Fake_raw_data.csv')
real_raw_data = pd.read_csv('Real_raw_data.csv')

In [3]:
fake_raw_data

Unnamed: 0,data
0,Spraying chlorine or alcohol on the skin kills...
1,Only older adults and young people are at risk
2,Children cannot get COVID-19
3,COVID-19 is just like the flu
4,Everyone with COVID-19 dies
...,...
861,Trey Gowdy said coronavirus shutdowns were sus...
862,All elective or non-emergency surgeries are ba...
863,U.S. House Speaker Nancy Pelosi was in Wuhan C...
864,U.S. House Speaker Nancy Pelosi was in Wuhan C...


In [4]:
real_raw_data.loc[0:10, :]

Unnamed: 0,data
0,Should children wear a mask?
1,Are there situations where children aged 5 yea...
2,Should children with developmental disabilitie...
3,Should children who have health issues or a me...
4,What type of mask should children wear?
5,How should children wear a mask?
6,Should a child wear a mask at home?
7,Should teachers or other adults working with c...
8,Should children wear a mask when playing sport...
9,Are there alternatives to fabric masks such as...


In [5]:
print(fake_raw_data.isnull().any())
print()
print(real_raw_data.isnull().any())

data    False
dtype: bool

data    False
dtype: bool


In [6]:
from nltk.corpus import stopwords

# Cleaning comment
def clean_comment(comment):
    p = """'!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'"""
    stop_words = stopwords.words('english')
    stop_words.remove('not')
    comment = [word.lower() for word in comment]
    comment = [''.join(w for w in word if w not in p) for word in comment]
    comment = [word for word in comment if word not in stop_words]
    return comment

# Stemming and Lemmantization
def stemming(comment):
    return [SnowballStemmer(language="english").stem(word) for word in comment]

def lemmatization(comment):
    return [WordNetLemmatizer.lemmatize(word) for word in comment]

def prep_comment(comment):
    regexp = "([a-zA-Z]+(?:’[a-z]+)?)"
    regex_tokenizer = RegexpTokenizer(regexp)
    comment = regex_tokenizer.tokenize(comment)
    comment = clean_comment(comment)
    # comment = stemming(comment) 
    return ' '.join(comment)

In [7]:
X_fake = [prep_comment(comment) for comment in fake_raw_data['data'].values]

In [8]:
test = [x for x in X_fake if len(x.split(' ')) >= 35]
test

['claim sarah huckabee sanders tweeted it’s odd antifa insurgency happened covid loses steam it’s odd covid happened moment impeachment failed it’s odd impeachment happened moment russian hoax failed russian hoax happened hillary lost notice pattern']

In [9]:
y_fake = [0 for _ in range(len(X_fake))]
len(y_fake)

866

In [10]:
X_real = [prep_comment(comment) for comment in real_raw_data['data'].values]

In [11]:
X_real

['children wear mask',
 'situations children aged years may wear required wear mask',
 'children developmental disabilities wear masks',
 'children health issues medical condition compromises immune system wear mask',
 'type mask children wear',
 'children wear mask',
 'child wear mask home',
 'teachers adults working children wear mask',
 'children wear mask playing sports physical activities',
 'alternatives fabric masks face shields',
 'contact tracing',
 'contact tracing work',
 'defined contact',
 'contact tracing help controlling spread virus',
 'contact tracing implemented',
 'happens contact',
 'needed successful contact tracing',
 'tools used contact tracing',
 'considered data protection',
 'get covid eating fresh foods like fruits vegetables',
 'wash fruits vegetables time covid',
 'virus causes covid live surface food packaging',
 'virus causes covid transmitted consumption cooked foods including animal products',
 'safe go grocery stores food markets',
 'safe groceries del

In [12]:
y_real = [1 for _ in range(len(X_real))]
len(y_real)

4064

In [13]:
X_all = X_real + X_fake
y_all = y_real + y_fake
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, random_state=0, test_size=0.2)

In [14]:
max_seq_len_real = max([len(x.split(' ')) for x in X_real])
max_seq_len_fake = max([len(x.split(' ')) for x in X_fake])
max_seq_len = max(max_seq_len_real, max_seq_len_fake)
MAX_NUM_WORDS = 10000
max_seq_len

35

In [15]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(X_train)
word_seq_train = tokenizer.texts_to_sequences(X_train)
word_seq_test = tokenizer.texts_to_sequences(X_test)

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len, padding="post")
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len, padding="post")

# word_seq_train = np.reshape(word_seq_train,(word_seq_train.shape[0],1,word_seq_train.shape[1]))
# word_seq_test = np.reshape(word_seq_test,(word_seq_test.shape[0],1,word_seq_test.shape[1]))

In [16]:
print(word_seq_train)

[[   4  436  176 ...    0    0    0]
 [1904 2838 2839 ...    0    0    0]
 [ 692 2841  531 ...    0    0    0]
 ...
 [  19   26    5 ...    0    0    0]
 [  39  151  578 ...    0    0    0]
 [  24  329  694 ...    0    0    0]]


In [17]:
print(word_seq_test)

[[1773  103 2757 ...    0    0    0]
 [ 148 2247 3473 ...    0    0    0]
 [ 296  326 5237 ...    0    0    0]
 ...
 [   1 2268   60 ...    0    0    0]
 [ 207 2787 3690 ...    0    0    0]
 [   1   47   28 ...    0    0    0]]


In [18]:
# train_set = pd.DataFrame(pd.concat([pd.DataFrame(word_seq_train), pd.DataFrame(y_train, columns=['label'])], axis=1))

In [19]:
# train_set

In [20]:
# train_set.to_csv('train.csv', index=False)

In [21]:
# test_set = pd.DataFrame(pd.concat([pd.DataFrame(word_seq_test), pd.DataFrame(y_test, columns=['label'])], axis=1))

In [22]:
# test_set

In [23]:
# test_set.to_csv('test.csv', index=False)

In [24]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import codecs

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Flatten

In [25]:
embeddings_index = {}
f = codecs.open('wiki.en.vec', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()


2519428it [11:04, 3794.03it/s]


In [26]:
len(embeddings_index['hundred'])

300

In [27]:
words_not_found = []
word_index = tokenizer.word_index
vocab_size = min(MAX_NUM_WORDS, len(word_index) + 1)
embed_dim = 300 # fasttext vector size 300

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print(f'words not found: {words_not_found}')

words not found: ['here’s', 'remdesivir', 'it’s', 'don’t', 'who’s', 'isn’t', 'ilinet', 'you’re', 'won’t', 'we’re', 'trump’s', 'there’s', 'aren’t', 'can’t', 'that’s', 'telebriefing', 'we’ve', 'infodemic', 'country’s', 'didn’t', 'georgia’s', 'cdc’s', 'plandemic', 'biden’s', 'parkinson’s', 'children’s', 'amazon’s', 'pelosi’s', 'they’re', 'shouldn’t', 'carolina’s', 'covidview', 'doesn’t', 'bronchia', 'kid’s', 'ozanimod', 'zeposia', 'winsconsin', 'superspreaders', 'public’s', 'sheriff’s', 'state’s', 'educaci', 'patient’s', 'alzheimer’s', 'covid’s', 'god’s', 'ventilaors', 'factchecks', 'microchippe', 'germany’s', 'they’d', 'floyd’s', 'america’s', 'prevenci', 'infecciones', 'atenci', 'serosurveys', 'baloxavir', 'doomscrolling', 'what’s', 'quaranteaming', 'notmypresident', 'chloroquine’s', 'she’s', 'whitmer’s', 'father’s', 'sweden’s', 'russia’s', 'covidtracer', 'world’s', 'we’d', 'veteran’s', 'records', 'amazedmonday', 'undreds', 'congrasswoman', 'pshuttle', 'cleveland’s', 'general’s', 'seroep

In [28]:
type(word_seq_train[0][0])

numpy.int32

In [29]:
y_train

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,


In [30]:
# define model
model = Sequential()
e = Embedding(vocab_size, embed_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=False)

model.add(e)
model.add(LSTM(embed_dim, return_sequences=True)) 
model.add(Dense(1, activation='sigmoid'))

# compile model
model.compile(optimizer='SGD', loss='binary_crossentropy', metrics='accuracy')

print(model.summary())

# fit model
model.fit(word_seq_train, np.array(y_train), epochs=50)

# evaluate model
loss, accuracy = model.evaluate(word_seq_test, np.array(y_test))
print(f'Accuracy: {accuracy}')
print(f'Loss: {loss}')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 35, 300)           1636800   
_________________________________________________________________
lstm (LSTM)                  (None, 35, 300)           721200    
_________________________________________________________________
dense (Dense)                (None, 35, 1)             301       
Total params: 2,358,301
Trainable params: 721,501
Non-trainable params: 1,636,800
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Ep

In [31]:
model.save('model.h5')

In [32]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)