## USING RNNs and LSTM to detect fake News

In [100]:
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.stem.snowball import SnowballStemmer
import regex as re
from nltk.tokenize import sent_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords


In [101]:
#gets filler words like "the", "is", "in". Meant to clean up text data
nltk.download('stopwords')

#gets pretrained model that splits up sentences into words or paragraphs into sentences
nltk.download('punkt')

#can group words into meaning synonyms, antonyms, Hypernyms, hyponyms, etc.
nltk.download('wordnet')

#putting stop words into a variable for later use
stop_words = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


In [102]:
df_fake = pd.read_csv('Fake.csv')
df_true = pd.read_csv('True.csv')

In [103]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip their fiscal script","WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they r...",politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits on Monday: Pentagon,"WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting...",politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Mueller do his job',"WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trump’s 2016 election campaign should continue without interference in 2018, despite calls from some Trump administration allies and Republican lawmakers to shut it down, a prominent Republican senator said on Sunday. Lindsey Graham, who serves on the Senate armed forces and judiciary committees, said Department of Justice Special Counsel Robert Mueller needs to carry on with his Russia investigati...",politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat tip-off: NYT,"WASHINGTON (Reuters) - Trump campaign adviser George Papadopoulos told an Australian diplomat in May 2016 that Russia had political dirt on Democratic presidential candidate Hillary Clinton, the New York Times reported on Saturday. The conversation between Papadopoulos and the diplomat, Alexander Downer, in London was a driving factor behind the FBI’s decision to open a counter-intelligence investigation of Moscow’s contacts with the Trump campaign, the Times reported. Two months after the m...",politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much more' for Amazon shipments,"SEATTLE/WASHINGTON (Reuters) - President Donald Trump called on the U.S. Postal Service on Friday to charge “much more” to ship packages for Amazon (AMZN.O), picking another fight with an online retail giant he has criticized in the past. “Why is the United States Post Office, which is losing many billions of dollars a year, while charging Amazon and others so little to deliver their packages, making Amazon richer and the Post Office dumber and poorer? Should be charging MUCH MORE!” Trum...",politicsNews,"December 29, 2017"


In [104]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing,"Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year, President Angry Pants tweeted. 2018 will be ...",News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian Collusion Investigation,"House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar whe...",News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People ‘In The Eye’,"On Friday, it was revealed that former Milwaukee Sheriff David Clarke, who was being considered for Homeland Security Secretary in Donald Trump s administration, has an email scandal of his own.In January, there was a brief run-in on a plane between Clarke and fellow passenger Dan Black, who he later had detained by the police for no reason whatsoever, except that maybe his feelings were hurt. Clarke messaged the police to stop Black after he deplaned, and now, a search warrant has been exec...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name Coded Into His Website (IMAGES),"On Christmas day, Donald Trump announced that he would be back to work the following day, but he is golfing for the fourth day in a row. The former reality show star blasted former President Barack Obama for playing golf and now Trump is on track to outpace the number of golf games his predecessor played.Updated my tracker of Trump s appearances at Trump properties.71 rounds of golf including today s. At this pace, he ll pass Obama s first-term total by July 24 next year. https://t.co/Fg7V...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump During His Christmas Speech,Pope Francis used his annual Christmas Day message to rebuke Donald Trump without even mentioning his name. The Pope delivered his message just days after members of the United Nations condemned Trump s move to recognize Jerusalem as the capital of Israel. The Pontiff prayed on Monday for the peaceful coexistence of two states within mutually agreed and internationally recognized borders. We see Jesus in the children of the Middle East who continue to suffer because of growing tensions betw...,News,"December 25, 2017"


In [105]:
#categorize them
df_true['status'] = 1
df_fake['status'] = 0

In [106]:
#comibine both data frames
df = pd.concat([df_true, df_fake])

#we are only using the title to predict if news is fake or true thus drop the others
df.drop(['text', 'subject', 'date'], axis=1, inplace=True)

In [107]:
#mix up the data frame so that fake and true news are not in order
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [119]:
pd.set_option('display.max_colwidth', 500)
random = np.random.randint(0,len(df),20)
df.iloc[random]

Unnamed: 0,title,status,maximum_length
21206,"VANISHED: FBI FILES Related To Mysterious “Suicide” Death Of Hillary’s Trusted WH Counsel, Vince Foster DISAPPEAR From National Archives",0,19
5235,Philippine leader changes his tune with praise for U.S. 'allies',1,10
13267,‘Bernie Sanders is on a Mission’,0,6
10473,Republican hold on U.S. state legislatures could slip in election,1,10
27048,"27 People Murdered In Texas Mass Shooting, And Trump’s Response Is Classic Racist",0,13
30252,All the president's men: China's politburo line-up a measure of Xi's power,1,12
39931,Factbox: Contenders for key jobs in Trump's administration,1,8
42430,"After Trump disclosures, UK's May says will continue to share intel with U.S.",1,13
6683,Stephen Colbert Just Got Himself Thrown Off The RNC Stage Before It Even Began (VIDEO),0,15
42183,Another Win For The NRA: Court Rules That Cities Can’t Protect Themselves From Gun Violence,0,15


In [120]:
#check for null values
df.isnull().sum()

title             0
status            0
maximum_length    0
dtype: int64

In [121]:
#gets the length of a inputted sentence
def longest_sentence_length(text):
	return len(text.split())

#Neural networks expect inputs of the same length
#So i have to make it a range of length from 0-max_length in my case 42
df['maximum_length'] = df['title'].apply(lambda x : longest_sentence_length(x))
max_length = max(df['maximum_length'])
print(f'The longest length of a sentence is {max_length} words.')

The longest length of a sentence is 42 words.


In [122]:
#exaclty how to clean up text: removes things that start with 0 or non alphanumeric characters
text_cleaning = "\b0\S*|\b[^A-Za-z0-9]+"

#stemming which is reducing works to their root like running -> run is disabled 
def preprocessing_filter(text, stem=False):
	
	#removes the unwanted characters using 'text_cleaning' 
	text = re.sub(text_cleaning, ' ', text)

	#creates an empty list of the words we are about to put inside
	tokens = []

	
	for token in text.split():

		#if it's not one of those filler words
		if token not in stop_words:

			#if the user decides to stem the words
			if stem:

				#creates a stemmer object
				stemmer = snowballStemmer('english')

				#stems the token
				token = stemmer.stem(token)

			#put the token into the list
			tokens.append(token)

	#joins the tokens back into a sentence
	return " ".join(tokens)

In [123]:
# Word embedding with pre padding 
def one_hot_encoded(text,vocab_size=5000, max_length = 40):
    hot_encoded = one_hot(text,vocab_size)
    return hot_encoded

In [124]:
#Now im going to convert the words into vectors using one hot encoding
def word_embedding(text):

	#from the previous function, get the preprocessed text
	preprocessed_text = preprocessing_filter(text)

	#hot encode the remade sentence into vectors
	return one_hot_encoded(preprocessed_text)

In [125]:
#each word is a vector pointing in a 40 dimensional space
embedded_features = 40

#define the model
model = Sequential()

#5000 size vocabulary, vector dimension of each word, input length max of 42 words
model.add(Embedding(5000, embedded_features, input_length=max_length))

#LSTM layer with 100 neurons, returns sequence to next layer
model.add(LSTM(100, return_sequences=True))

#zeroes out 30% of the neurons to prevent overfitting(knowing the dataset too well)
model.add(Dropout(0.3))

#second LSTM layer and don't have it return sequences because it's the last LSTM layer
model.add(LSTM(100))

#last layer with sigmoid activation function for binary classification 0-1
model.add(Dense(1, activation='sigmoid'))

#compile the model and use binary crossentropy loss function since it's binary classification
#how it will measure weighs and biases is through adam optimizer
#and track accuracy as a metric. EX: True value[0,0,0,0] Predicted value[0,0,0,1] accuracy is 75%
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


print(model.summary())


None


In [126]:
#Hot encode the titles however not all of them will be the same list size
one_hot_encoded_title = df['title'].apply(lambda x : word_embedding(x)).values

In [129]:
#now in order  of have the same length, we will pad the sequences with zeros at the beginning
#previously we hot encoded text with 40 dimensional vectors
#now we are hot encoding the titles with 42 words for max lengnth
#pre padding meaning putting the 0s at the beginning
padded_encoded_title = pad_sequences(one_hot_encoded_title, maxlen=max_length,padding='pre')

In [None]:
#features
X = padded_encoded_title

#labels
y = df['status'].values

print(f'X shape: {X.shape}')
print(f'Y shape: {y.shape}')

X shape: (44898, 42)
Y shape: (44898,)


In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('X train shape {}'.format(X_train.shape))
print('X test shape {}'.format(X_test.shape))
print('y train shape {}'.format(y_train.shape))
print('y test shape {}'.format(y_test.shape))

X train shape (33673, 42)
X test shape (11225, 42)
y train shape (33673,)
y test shape (11225,)


In [None]:
#monitor: monitors how bad the model is doing on validation loss
#patience: waits 3 epochs before stopping if no improvement
#verbose: prints out messages
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

#saves the best model only according to validation accuracy
#even if you get ovefitting you can still keep the best model
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)

#so it will train each row and at the end of every epoch it will check using all
#the test data to see how well it did. 

#once it runs thourgh all of them it will chose the model that did the best on validation data
# and save it as best_model.h5
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=64, callbacks=[early_stopping, model_checkpoint])


Epoch 1/15
[1m526/527[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - accuracy: 0.9203 - loss: 0.1906
Epoch 1: val_accuracy improved from None to 0.98192, saving model to best_model.h5




[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.9619 - loss: 0.1028 - val_accuracy: 0.9819 - val_loss: 0.0567
Epoch 2/15
[1m526/527[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.9901 - loss: 0.0333
Epoch 2: val_accuracy did not improve from 0.98192
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 28ms/step - accuracy: 0.9894 - loss: 0.0349 - val_accuracy: 0.9816 - val_loss: 0.0569
Epoch 3/15
[1m526/527[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.9957 - loss: 0.0164
Epoch 3: val_accuracy improved from 0.98192 to 0.98263, saving model to best_model.h5




[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 26ms/step - accuracy: 0.9947 - loss: 0.0181 - val_accuracy: 0.9826 - val_loss: 0.0650
Epoch 4/15
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9976 - loss: 0.0087
Epoch 4: val_accuracy did not improve from 0.98263
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 27ms/step - accuracy: 0.9970 - loss: 0.0107 - val_accuracy: 0.9811 - val_loss: 0.0780
Epoch 4: early stopping


<keras.src.callbacks.history.History at 0x3487eaa50>

In [None]:
def best_threshold(thresholds:list, X_test):
	accuracies = []
	for thresh in thresholds:
		ypred = model.predict(X_test)
		