In [91]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder

In [73]:
df_true = pd.read_csv('data/True.csv')
df_fake = pd.read_csv('data/Fake.csv')

In [74]:
df_true['True/Fake'] = 1
df_fake['True/Fake'] = 0

df = pd.concat([df_true, df_fake], axis=0)

In [75]:
df.head()

Unnamed: 0,title,text,subject,date,True/Fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [55]:
df.subject.unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [80]:
print(df.isna().sum())

True/Fake        0
combined_text    0
dtype: int64


In [62]:
df[['subject', 'True/Fake']].value_counts()

subject          True/Fake
politicsNews     1            11272
worldnews        1            10145
News             0             9050
politics         0             6841
left-news        0             4459
Government News  0             1570
US_News          0              783
Middle-east      0              778
Name: count, dtype: int64

In [64]:
print(df_true['subject'].unique())
print(df_fake['subject'].unique())

['politicsNews' 'worldnews']
['News' 'politics' 'Government News' 'left-news' 'US_News' 'Middle-east']


In [76]:
df['combined_text'] = df['title'] + ' ' + df['text']

In [77]:
del df['subject']
del df['date']
del df['title']
del df['text']

In [78]:
df.head()

Unnamed: 0,True/Fake,combined_text
0,1,"As U.S. budget fight looms, Republicans flip t..."
1,1,U.S. military to accept transgender recruits o...
2,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,1,FBI Russia probe helped by Australian diplomat...
4,1,Trump wants Postal Service to charge 'much mor...


## Data preprocessing and modeling process

In [88]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in text.split() if word.isalpha()]
    text = " ".join(words)
    text = " ".join([word for word in text.split() if word not in stopwords.words("english")])
    return text

df['combined_text'] = df['combined_text'].apply(preprocess_text)

In [94]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['combined_text'])
text_sequences = tokenizer.texts_to_sequences(df['combined_text'])
padded_sequences = pad_sequences(text_sequences, maxlen=200)

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['True/Fake'])

model = Sequential()
model.add(Embedding(5000, 128, input_length=200))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, encoded_labels, epochs=10, validation_split=0.2)

loss, accuracy = model.evaluate(padded_sequences, encoded_labels)
print('Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.9825604557991028


In [103]:
new_text = 'People landed on Moon'
new_sequence = tokenizer.texts_to_sequences([preprocess_text(new_text)])
new_padded_sequence = pad_sequences(new_sequence, maxlen=200)
prediction = model.predict(new_padded_sequence)
if prediction > 0.5:
  print('Predicted: Fake News')
else:
  print('Predicted: Real News')

Predicted: Real News
