In [6]:
#This class allows to vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) 
#or into a vector where the coefficient for each token 
from tensorflow.keras.preprocessing.text import Tokenizer

#Converts a text to a sequence of indexes in a fixed-size hashing space.
from tensorflow.keras.preprocessing.text import hashing_trick

#Converts a text to a sequence of words (or tokens).
from tensorflow.keras.preprocessing.text import text_to_word_sequence


from tensorflow.keras.preprocessing import sequence

from tensorflow.keras.preprocessing.sequence import pad_sequences


#Accuracy metrics for model
from sklearn.metrics import accuracy_score


from tensorflow.keras.models import Sequential, load_model

#lets us create embedding of words that represent the meaning of the words in relation to other words.
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout



from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.keras.layers import Dense

import pandas as pd

In [7]:
import numpy as np

In [3]:
!pip install sklearn



In [8]:
data = pd.read_csv('./News-DataSet/combined_news_data_processed.csv')

In [9]:
data.label.value_counts()

REAL    39171
FAKE    38918
Name: label, dtype: int64

In [10]:
len(data)

78089

# Adding Neural Fakes to Data Set

In [11]:
neural_fakes = pd.read_csv('grover_fakes.csv')

In [12]:
neural_fakes = neural_fakes.drop('Unnamed: 0', axis = 1)

In [13]:
neural_fakes['title'] = ''

In [14]:
#turning label into binary encoding
dicc_y = {0:'REAl', 1:'FAKE'}
neural_fakes['label'] = neural_fakes['label'].map(dicc_y)
neural_fakes = neural_fakes[['title', 'text', 'label']]

In [15]:
neural_fakes['label'] = 'FAKE'

# Lematization of words

In [17]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 335 kB/s  eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:
import spacy

In [17]:
import regex as re
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

def text_preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    
    doc = nlp(text)
    #Lemmatization, which is the process of reducing a word to its lemma or dictionary form. 
    #For example, the word run is the lemma for the words runs, ran, and running.
    text = ' '.join([token.lemma_ for token in doc if token.text not in STOP_WORDS])
    return text



In [18]:
neural_fakes['text'] = neural_fakes.text.apply(text_preprocessor)

In [19]:
neural_fakes

Unnamed: 0,title,text,label
0,,vladimir putin s regime create complex sophist...,FAKE
1,,vladimir putin spend friday listen u s militar...,FAKE
2,,military strategist peter w singer doubt russi...,FAKE
3,,russian look forward continue friendly relatio...,FAKE
4,,face mount pressure u s administration shutter...,FAKE
5,,vladimir putin rule russia 2000 frequently pho...,FAKE
6,,russia s vladimir putin accuse conspire kgb st...,FAKE
7,,white house actively search replacement chief ...,FAKE
8,,1920 mr trump teenager great grandfather frede...,FAKE
9,,president donald trump upset medium angry vacc...,FAKE


# We shuffle our data again just to make sure it is randomly organized

In [20]:
data = pd.concat(objs = [data, neural_fakes], axis = 0)
data = data.sample(frac=1).reset_index(drop=True)
data

Unnamed: 0,title,text,label
0,Documentary Film-Makers Face Decades In Prison...,documentary face decade prison record oil pipe...,REAL
1,Stop The Olympics,read column believe olympic ideal ve get sta...,REAL
2,GOP has a day of reckoning coming,license dmca face republican party light end t...,FAKE
3,What François De La Rochefoucauld Has To Teach...,merit take cynical view life observation cynic...,FAKE
4,Duterte Plays A Winning Hand With Foreign Poli...,manila rodrigo duterte need money electoral ...,REAL
...,...,...,...
78121,Trump Says Senate Republicans Likely To Pass H...,u s president donald trump wednesday express...,REAL
78122,9 Questions About The Zika Virus You Were Too ...,zika virus discover 1940 people hear year s ...,REAL
78123,Jeb Bush loses TV ad edge to Marco Rubio,kill obama administration rule dismantle obama...,REAL
78124,Trump Administration Limits Government Use Of ...,trump administration tuesday remove moscow b...,REAL


In [21]:
#turning label into binary encoding
dicc_y = {'REAL' : 0, 'FAKE':1}
data['label'] = data['label'].map(dicc_y)

In [22]:
data.label.value_counts()

0    39171
1    38955
Name: label, dtype: int64

# Bag of Words


In [23]:
from src.config import n_tokens, keep_n, embedding_dim

In [24]:
# gather max n_tokens words from ever article and turn them into digits
tokenizer = Tokenizer(num_words=n_tokens)
tokenizer.fit_on_texts(data.text.values)

In [25]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x129a7f790>

In [26]:
#get the index for each word
word_index = tokenizer.word_index

In [27]:
word_index

{'s': 1,
 'say': 2,
 'trump': 3,
 't': 4,
 'state': 5,
 'people': 6,
 'year': 7,
 'president': 8,
 'clinton': 9,
 'new': 10,
 'time': 11,
 'u': 12,
 'like': 13,
 'republican': 14,
 'go': 15,
 'american': 16,
 'know': 17,
 'tell': 18,
 'come': 19,
 'government': 20,
 'obama': 21,
 'country': 22,
 'work': 23,
 'think': 24,
 'right': 25,
 'campaign': 26,
 'report': 27,
 'election': 28,
 'day': 29,
 'want': 30,
 'house': 31,
 'way': 32,
 'donald': 33,
 'white': 34,
 'take': 35,
 'hillary': 36,
 'party': 37,
 'include': 38,
 'vote': 39,
 'percent': 40,
 'world': 41,
 'call': 42,
 'woman': 43,
 'don': 44,
 'news': 45,
 'need': 46,
 'group': 47,
 'good': 48,
 'win': 49,
 'united': 50,
 'law': 51,
 'week': 52,
 'support': 53,
 'official': 54,
 'find': 55,
 'thing': 56,
 'look': 57,
 'political': 58,
 'man': 59,
 'accord': 60,
 'change': 61,
 'million': 62,
 'national': 63,
 'help': 64,
 'attack': 65,
 'police': 66,
 'long': 67,
 'get': 68,
 'medium': 69,
 'try': 70,
 'america': 71,
 'policy': 

In [30]:
text_sequences = tokenizer.texts_to_sequences(data.text)

In [31]:
# cuantas de las 600 aparecen en el texto 0
len(text_sequences[10])

156

In [32]:
index2_word = {v: k for k, v in word_index.items()}

In [33]:
index2_word[483]

'senior'

In [34]:
# cuales de las 600 aparecen
text_sequences[0][-20:]

[2,
 135,
 272,
 1,
 272,
 163,
 142,
 546,
 471,
 272,
 24,
 130,
 412,
 18,
 147,
 449,
 140,
 86,
 531,
 179]

In [35]:
#sequences of numbers representing content of each sentence
text_sequences

[[157,
  391,
  270,
  471,
  401,
  272,
  333,
  153,
  65,
  612,
  244,
  419,
  383,
  163,
  272,
  621,
  536,
  505,
  401,
  413,
  163,
  84,
  5,
  135,
  372,
  251,
  305,
  322,
  68,
  304,
  79,
  53,
  63,
  10,
  157,
  272,
  434,
  395,
  626,
  548,
  163,
  270,
  495,
  42,
  221,
  41,
  349,
  56,
  568,
  4,
  61,
  157,
  7,
  512,
  380,
  53,
  99,
  354,
  449,
  163,
  471,
  401,
  157,
  418,
  7,
  517,
  434,
  395,
  401,
  548,
  84,
  522,
  29,
  376,
  157,
  418,
  7,
  272,
  495,
  272,
  401,
  434,
  395,
  46,
  25,
  18,
  283,
  495,
  79,
  1,
  25,
  17,
  15,
  22,
  2,
  86,
  88,
  165,
  86,
  1,
  157,
  517,
  272,
  27,
  401,
  111,
  69,
  1,
  1,
  2,
  405,
  1,
  412,
  1,
  304,
  568,
  148,
  25,
  405,
  14,
  102,
  101,
  33,
  3,
  65,
  69,
  26,
  207,
  3,
  186,
  28,
  568,
  637,
  505,
  548,
  1,
  27,
  279,
  481,
  35,
  162,
  27,
  2,
  217,
  305,
  322,
  244,
  271,
  288,
  84,
  51,
  341,
  108,
  3

In [37]:
# info
pd.Series(text_sequences).apply(len).quantile(0.9)

332.0

In [38]:
# keep first n words of every text
padded_sequences = pad_sequences(text_sequences, maxlen = keep_n, padding = 'post')

In [39]:
padded_sequences[1]

array([226, 113, 136,  68, 357, 134,  13, 524,  67, 324, 118, 474,   1,
       509,   1, 259,   1,  10, 445, 432, 536, 149,  91, 139, 293, 256,
       306, 105, 124, 474,   1,  27, 544,  22, 581, 474,   1, 233, 474,
       474,   1, 328,  39, 283, 544,   8, 379, 472,  22,  84,  85,  83,
       178, 230,  66,  55, 339, 384,  20, 492,  98, 474, 435,  54,  76,
       370,  13,  62, 425, 618, 213, 214, 636, 323, 157, 396,  58,  70,
       398,  20, 343, 536, 149, 336,  13, 332, 277,  44,   4, 127, 383,
         1, 474, 357, 297,  56, 474,  49,  56,  87, 261, 270,  38, 536,
       276, 124, 198, 261, 322, 254, 139, 238, 293, 445,  50,   5, 467,
        77, 438,  65, 474, 370,  98, 474, 536, 149,  66,  74, 293, 536,
        20, 553, 282, 474,  22, 308, 437, 466,   6,  22,   1, 173, 474,
       511, 331, 160,   5,  24, 480, 293, 306, 349,   1, 254,  41, 474,
       220,   7,  11, 226,   1, 155, 150, 394, 226,  85,   1, 261,  91,
         1,  27, 268, 632, 536, 124,  11,  85,   1, 261,  32,   

In [40]:
print(len(data.text[0]))
print(padded_sequences[0])

2843
[157 391 270 471 401 272 333 153  65 612 244 419 383 163 272 621 536 505
 401 413 163  84   5 135 372 251 305 322  68 304  79  53  63  10 157 272
 434 395 626 548 163 270 495  42 221  41 349  56 568   4  61 157   7 512
 380  53  99 354 449 163 471 401 157 418   7 517 434 395 401 548  84 522
  29 376 157 418   7 272 495 272 401 434 395  46  25  18 283 495  79   1
  25  17  15  22   2  86  88 165  86   1 157 517 272  27 401 111  69   1
   1   2 405   1 412   1 304 568 148  25 405  14 102 101  33   3  65  69
  26 207   3 186  28 568 637 505 548   1  27 279 481  35 162  27   2 217
 305 322 244 271 288  84  51 341 108 372  25 405 143   1   5  84 114   1
  86 383  74 587 272 302   1 401   1  54   2 135 272   1 272 163 142 546
 471 272  24 130 412  18 147 449 140  86 531 179   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0  

In [41]:
padded_sequences.shape

(78126, 320)

In [42]:
X = padded_sequences

In [43]:
X.shape

(78126, 320)

In [44]:
y = data.label

In [45]:
y.shape

(78126,)

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.12, random_state = 42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42)

In [48]:
X_train.shape

(61875, 320)

In [49]:
X_valid.shape

(6875, 320)

In [50]:
y

0        0
1        0
2        1
3        1
4        0
        ..
78121    0
78122    0
78123    0
78124    0
78125    1
Name: label, Length: 78126, dtype: int64

In [51]:
print(n_tokens)
print(embedding_dim)
print(keep_n)

640
16
320


# Creating the Model 

In [52]:
model = Sequential([
    Embedding(
        input_dim=n_tokens,
        output_dim=embedding_dim, 
        input_length=keep_n
    ),
    
    # This layer creates a convolution kernel that is convolved with the layer input over a single spatial (or temporal) dimension to produce a tensor of outputs.
    Conv1D(
        filters=32,
        kernel_size=5,
        padding='same', 
        activation='relu'
    ),
    
    # Downsamples the input representation by taking the maximum value over a spatial window of size pool_size. 
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=2 * 32,
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=4 * 32, 
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=6 * 32, 
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    LSTM(
       units=64, 
      return_sequences=True),
    
    Dropout(0.1),
    
    LSTM(
        units=32, 
       ),
    
    Dropout(0.1),
    
    
    # to quit
    # to quit till here
    
    Dense(1, activation='sigmoid')
    
    
    
])

In [90]:
model2 = Sequential([
    Embedding(
        input_dim=n_tokens,
        output_dim=32, 
        input_length=keep_n
    ),
    
    # This layer creates a convolution kernel that is convolved with the layer input over a single spatial (or temporal) dimension to produce a tensor of outputs.
    Conv1D(
        filters=32,
        kernel_size=5,
        padding='same', 
        activation='relu'
    ),
    
    # Downsamples the input representation by taking the maximum value over a spatial window of size pool_size. 
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=2 * 32,
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=4 * 32, 
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    Conv1D(
        filters=6 * 32, 
        kernel_size=5, 
        padding='same', 
        activation='relu'
    ),
    
    MaxPooling1D(pool_size=2),
    
    LSTM(
       units=64, 
      return_sequences=True),
    
    Dropout(0.1),
    
    LSTM(
        units=32, 
       ),
    
    Dropout(0.1),
    
    
    # to quit
    # to quit till here
    
    Dense(1, activation='sigmoid')
    
    
    
])

In [53]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [92]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [54]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 320, 16)           10240     
_________________________________________________________________
conv1d (Conv1D)              (None, 320, 32)           2592      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 160, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 160, 64)           10304     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 80, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80, 128)           41088     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 40, 128)           0

In [96]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 320, 32)           19200     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 320, 32)           5152      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 160, 32)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 160, 64)           10304     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 80, 64)            0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 80, 128)           41088     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 40, 128)          

In [55]:
from tensorflow.keras.utils import plot_model


In [95]:
plot_model(model, to_file='model.jpg')

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


# Fitting and Training the Model

In [56]:
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), verbose = 1, epochs = 4, batch_size = 230,)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x153664340>

In [135]:
model2.fit(X_train, y_train, validation_data=(X_valid, y_valid), verbose = 1, epochs = 3, batch_size = 250,)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x166d06460>

# Reviewing Metrics

In [59]:
from sklearn.metrics import confusion_matrix, classification_report

In [101]:
y_pred_test = model.predict_classes(x_test)



In [57]:
y_pred_test3 = model.predict_classes(x_test)



In [139]:
y_pred2_test = model2.predict_classes(x_test)

In [60]:
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

NameError: name 'y_pred_test' is not defined

In [61]:
print(confusion_matrix(y_test, y_pred_test3))
print(classification_report(y_test, y_pred_test3))

[[4372  313]
 [ 381 4310]]
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      4685
           1       0.93      0.92      0.93      4691

    accuracy                           0.93      9376
   macro avg       0.93      0.93      0.93      9376
weighted avg       0.93      0.93      0.93      9376



In [141]:
print(confusion_matrix(y_test, y_pred2_test))
print(classification_report(y_test, y_pred2_test))

[[8460 1234]
 [ 467 9362]]
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      9694
           1       0.88      0.95      0.92      9829

    accuracy                           0.91     19523
   macro avg       0.92      0.91      0.91     19523
weighted avg       0.92      0.91      0.91     19523



# Saving Models and Text Tokenizer

In [69]:
! ls
#we check what folder we are in 

Cleaning and Preparing News Data.ipynb
Exploring Keras.ipynb
[1m[36mGPT2-Model-Fakes[m[m
Generating Fake News with Grover.ipynb
[1m[36mGrover-Fakes[m[m
[1m[36mKeras_Model[m[m
NEURAL FAKES TEST.ipynb
[1m[36mNews-DataSet[m[m
[1m[36mNotebooks[m[m
README.md
Screen Shot 2021-05-11 at 9.26.19 PM.png
Topic Modeling.ipynb
[1m[36mTotally-Real-News[m[m
[1m[36mVisualization Dashboards[m[m
[1m[36mbest_model[m[m
best_model2.h5
fake_text_news_lda.html
[1m[36mgrover[m[m
[1m[36mmodels[m[m
[1m[36msrc[m[m
tokenizer.json


In [103]:
#We save model 1
model.save('./Keras_Model/LSTM_93%ACC.h5')

In [62]:
model.save('./Keras_Model/LSTM2_93%ACC.h5')

In [142]:
#We save model 2
model2.save('./Keras_Model/LSTM_95%recall.h5')

In [104]:
#We save 
from keras_preprocessing.text import  tokenizer_from_json
import json


In [87]:
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))


In [105]:
import io

In [106]:
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer2.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [108]:
with open('./tokenizer2.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

In [109]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x15b7431f0>

In [None]:
# 1. save tokenizer
# 2. save network .h5

Otro nb:  
1. load tokenizer
2. load network
3. input text and all the predicting logic