# Importing Dataset

In [None]:
# Kaggle API credentials.
# Values already present in the environment are kept; anything missing is prompted
# for with getpass so that no username/key is ever stored in the notebook itself.
import os
from getpass import getpass

for _var in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    if not os.environ.get(_var):
        os.environ[_var] = getpass(f'{_var}: ')


In [None]:
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
!unzip fake-and-real-news-dataset.zip

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import tensorflow as tf
# from tensorflow.keras import layers
import gensim
pd.options.mode.chained_assignment = None

# 1. Loading the dataset

In [None]:
fake_news = pd.read_csv("Fake.csv")
real_news = pd.read_csv("True.csv")

In [None]:
fake_news.head()

In [None]:
real_news.head()

<li>Creating the target variable 'fake', with a value of 1 if an article is fake and 0 otherwise.</li>
<li>Joining both dataframes to form the complete dataframe.</li>

In [None]:
# Label each source frame (1 = fake, 0 = real), then stack them into one frame
# that gets a fresh 0..n-1 index.
fake_news['fake'] = 1
real_news['fake'] = 0
news = pd.concat([fake_news, real_news], ignore_index=True)

In [None]:
news.head()

In [None]:
news.info()

In [None]:
counts = news['fake'].value_counts()
fake = counts[1]
real = counts[0]
print('The dataset consists of %d fake news articles and %d real news articles' % (fake,real))

# 2. Data Cleaning

### 2.1 Missing data

In [None]:
#Check for missing data
news.isna().value_counts()

### 2.2 Duplicate articles

In [None]:
# Check for duplicates: two articles count as duplicates when title + body match exactly
news['all_text'] = news['title'] + ' ' + news['text']
# every row that repeats an earlier 'all_text' value is one surplus copy
num_dup = int(news['all_text'].duplicated().sum())
print('There are a total of %d duplicates in the dataset' % num_dup)

In [None]:
news.drop_duplicates(subset=['all_text'], inplace = True)
news.shape

In [None]:
updated_counts = news['fake'].value_counts()
updated_fake = updated_counts[1]
updated_real = updated_counts[0]
print('After cleaning, the dataset consists of %d fake news articles and %d real news articles' % (updated_fake,updated_real))

In [None]:
ax = sns.countplot(x='fake', data=news, palette=['g','r'])
ax.set_title('Fake Breakdown')
for i in ax.patches:
    ax.annotate(f'\n{i.get_height()}', (i.get_x() + 0.3, i.get_height()), ha='center', va='top', color='white', size=15)

After removing duplicates, we now have 21197 real news and 17908 fake news articles to work with.

### 2.3 Removing publisher information for real news
<li>With a quick scan of the real news articles in our dataset, we realise that most texts start off with publisher information, specifically Reuters <b>(eg. 'WASHINGTON (Reuters) -')</b>.</li>
<li>We want to be able to generalise our predictions models to articles from any publisher, hence we drop publisher information from our dataset.</li>

In [None]:
# Count how many *real* articles open with a Reuters dateline such as
# 'WASHINGTON (Reuters) - '. Only real news is inspected, so the count matches the
# denominator reported in the message. The pattern is a raw string so that '\(' is
# a regex escape and not an invalid Python string escape.
real_text = news.loc[news['fake'] == 0, 'text']
with_publisher_info = real_text.str.contains(r'^.+? \(Reuters\) - ', regex=True).sum()
print('A total of %d out of %d real news articles contain publisher information' % (with_publisher_info, len(real_text)))

In [None]:
# Strip the leading dateline (e.g. 'WASHINGTON (Reuters) - ') from each article body.
# The prefix is matched lazily: a greedy '.+' runs to the LAST ' (Reuters) - ' on the
# first line and would delete genuine article text that precedes a later mention.
publisher_prefix = re.compile(r'^.+? \(Reuters\) - ')
news['text'] = news['text'].map(lambda x: publisher_prefix.sub('', x, count=1))
# show the real-news bodies to check the result
news.loc[news['fake'] == 0, 'text']

In [None]:
# update 'all_text' column
news['all_text'] = news['title'] + ' ' + news['text'] 

### 2.4 Articles with no text body

In [None]:
news['length'] = news['text'].map(lambda x: len(x.split()))
news[news['length']==0]

In [None]:
# 446 fake articles with no text body, 1 real article with no text body
news[news['length']==0]['fake'].value_counts()

# 3. Exploratory Data Analysis

In [None]:
# Subject distribution
plt.figure(figsize=(10,5))
ax = sns.countplot(x='subject', data=news, hue='fake', palette=['g','r'])
ax.set_title('Subject Distribution')
ax.set(ylim=(0, 14000))
for p in ax.patches:
    height = p.get_height()
    # Subject/label combinations without any article can produce bars whose height is
    # NaN or 0 (and possibly a plain Python float, which has no .astype); such bars
    # are skipped instead of being labelled with a meaningless number.
    if not np.isfinite(height) or height == 0:
        continue
    ax.annotate(f'\n{int(height)}', (p.get_x()+0.2, height + 1000), ha='center', va='top', color='black', size=9)


In [None]:
# Segmenting subject distribution by target.
# catplot is a figure-level function that creates its own figure, so no plt.figure()
# call is made beforehand -- it would only leave an extra, empty figure in the output.
g = sns.catplot(x="subject", col="fake", data=news, kind="count")

#Rotating the xlabels
g.set_xticklabels(rotation=45)


It appears that all real news articles fall under one of two subjects, 'politicsNews' or 'worldnews', while a significant proportion of fake news articles are categorised as 'News' or 'politics'.

In [None]:
# Overall word length distribution
news['length'] = news['text'].map(lambda x: len(x.split()))
sns.histplot(x='length', data=news, bins = 50, hue='fake')

In [None]:
# Fake data distribution of word length
sns.histplot(x='length', data=news.loc[news['fake'] == 1], bins = 50)

In [None]:
sns.histplot(x='length', data=news.loc[news['fake'] == 0], bins = 50)

In [None]:
# Combining the 2 plots above 

g = sns.FacetGrid(news, col="fake")
g.map(sns.histplot, "length", binwidth=250)

In [None]:
from wordcloud import WordCloud, STOPWORDS
#Word Cloud
stopwords = set(STOPWORDS)
def give_wordcloud(data, title = None, text_col = 'all_text'):
    """Draw a word cloud of the text contained in `data`.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series, str or other object
        Text source. For a DataFrame the `text_col` column is used, for a Series
        all of its non-null values are used; anything else is converted with str().
    title : str, optional
        Figure title; no title is drawn when it is falsy.
    text_col : str, default 'all_text'
        Column holding the article text when `data` is a DataFrame.

    Raises
    ------
    KeyError
        If `data` is a DataFrame without a `text_col` column.
    """
    # str(DataFrame) is only the truncated console repr (column names, index numbers,
    # '...'), so the cloud has to be built from the actual text values instead.
    if isinstance(data, pd.DataFrame):
        data = data[text_col]
    if isinstance(data, pd.Series):
        text = ' '.join(data.dropna().astype(str))
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        # taken from the wordcloud package directly: the module-level name
        # `stopwords` is re-bound further down by the NLTK import
        stopwords=set(STOPWORDS),
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=0
    ).generate(text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
give_wordcloud(news,'All News')
give_wordcloud(news[news['fake'] == 1], 'Fake News')
give_wordcloud(news[news['fake'] == 0], 'Real News')

Fake news shows emotive and loaded language such as 'Drunk' and 'Obsessed', and biased persons such as Donald Trump are often mentioned.
Real news shows distinguished establishments such as 'NATO' and 'Reuters', and does not contain any emotive language. Instead, the words lean more towards the factual.

In [None]:
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # Getting sentiment values from all_text

# analyzer = SentimentIntensityAnalyzer()
# news['sentiment_score'] = [analyzer.polarity_scores(x)['compound'] for x in news['all_text']]
# news.head()

# 4. Testing our hypotheses:
<li></li>
<li></li>
<li></li>

# 5. Neural Network models

In [None]:
# Work on an independent copy: these columns are overwritten with cleaned text in a
# later cell, and assigning into a slice of `news` is ambiguous (SettingWithCopyWarning).
text_df = news[['title','text','all_text','fake']].copy()
text_df.head()

### Text cleaning for neural network

In [None]:
def cleanText(data):
    """Normalise a piece of text for the neural-network models.

    Lower-cases the input, removes URLs, replaces every run of non-word characters
    (punctuation, whitespace, newlines) with a single space and trims both ends.

    Parameters
    ----------
    data : str
        Raw text.

    Returns
    -------
    str
        The cleaned text: word characters separated by single spaces.
    """
    text = data.lower()
    # get rid of urls (raw string: '\S' and '\.' are regex escapes, not string escapes)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # get rid of non words and extra spaces: newlines and repeated spaces are
    # non-word characters too, so a single pass covers them all
    text = re.sub(r'\W+', ' ', text)
    return text.strip()

In [None]:
# Run the same normalisation over each of the three text columns.
for column in ('title', 'text', 'all_text'):
    text_df[column] = text_df[column].map(cleanText)

## 5.1 Training a simple RNN model on titles only
https://www.kaggle.com/code/therealcyberlord/fake-news-detection-using-rnn

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): the features are taken from `news`, not from the cleaned `text_df`
# prepared in the previous section, so the cleanText() output is not used by the
# models below -- confirm that this is intended.
features = news[['title','text','all_text']]
target = news['fake']

# Hold out 15% of the articles as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1,test_size=0.15)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize(train_data,test_data):
    """Convert texts to integer sequences with a vocabulary fitted on the training data only.

    Returns the encoded training sequences, the encoded test sequences and the
    tokenizer's word -> index mapping, in that order.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_data)

    # the same fitted tokenizer encodes both splits
    encoded_train, encoded_test = [fitted.texts_to_sequences(split) for split in (train_data, test_data)]
    return encoded_train, encoded_test, fitted.word_index

def pad(train_data,test_data,max_len):
    """Bring every sequence of both splits to length `max_len`.

    Padding is added after the sequence (padding='post'); all other
    pad_sequences options are left at their defaults.
    Returns the padded training and test data, in that order.
    """
    padded_train, padded_test = [
        pad_sequences(split, padding='post', maxlen=max_len)
        for split in (train_data, test_data)
    ]
    return padded_train, padded_test

In [None]:
X_train_title,X_test_title,vocab_title = tokenize(X_train['title'],X_test['title'])
max_vocab_title = len(vocab_title) + 1 # Adding 1 because of reserved 0 index
# NOTE(review): index 0 is presumably never assigned to a word by the Keras Tokenizer and is
# the value used for padding, so the Embedding layer needs len(vocab) + 1 rows -- confirm
# against the Keras docs. No out-of-vocabulary token is configured in tokenize().

# get length of longest title and pad all shorter titles to match length
# (the maximum is taken over the training AND the test titles)
max_len_title = max([len(x) for x in X_train_title] + [len(x) for x in X_test_title])
X_train_title,X_test_title = pad(X_train_title,X_test_title,max_len_title)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Dropout

model = Sequential([
    Embedding(max_vocab_title, 100, input_length=max_len_title),
    Bidirectional(tf.keras.layers.SimpleRNN(64,  return_sequences=True)),
    Bidirectional(tf.keras.layers.SimpleRNN(16)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1)
])

model.summary()

In [None]:
# Stop once the validation loss has not improved for 2 epochs and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)


def accuracy_from_logits(y_true, y_pred):
    """Binary accuracy for a model that outputs logits (decision boundary at logit 0)."""
    return tf.keras.metrics.binary_accuracy(y_true, tf.sigmoid(y_pred))


# The final Dense(1) layer has no activation, so the model outputs logits. The loss is
# told so via from_logits=True; the metric converts the logits to probabilities first,
# because the plain 'accuracy' string would threshold the raw logits at 0.5.
# The metric keeps the name 'accuracy' so the history keys stay 'accuracy'/'val_accuracy'.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.MeanMetricWrapper(accuracy_from_logits, name='accuracy')])

In [None]:
history = model.fit(X_train_title, y_train, epochs=10, 
                    validation_split=0.15, batch_size=30, 
                    shuffle=True, callbacks=[early_stop])

In [None]:
# Learning curves of the title model: one figure for the loss, one for the accuracy.
history_dict = history.history
epochs = history.epoch

loss, val_loss = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']

# (training curve, validation curve, training colour, legend labels, title, y label, y limits)
curve_specs = [
    (loss, val_loss, 'r', ('Training loss', 'Validation loss'),
     'Training and validation loss', 'Loss', None),
    (acc, val_acc, 'g', ('Training acc', 'Validation acc'),
     'Training and validation accuracy', 'Accuracy', (0.5,1)),
]

for train_curve, val_curve, train_colour, legend_labels, heading, y_label, y_limits in curve_specs:
    plt.figure(figsize=(12,9))
    plt.plot(epochs, train_curve, train_colour, label=legend_labels[0])
    plt.plot(epochs, val_curve, 'b', label=legend_labels[1])
    plt.title(heading, size=20)
    plt.xlabel('Epochs', size=20)
    plt.ylabel(y_label, size=20)
    plt.legend(prop={'size': 20})
    if y_limits is not None:
        # only the accuracy figure is clipped to a fixed range
        plt.ylim(y_limits)
    plt.show()

In [None]:
model.evaluate(X_test_title, y_test)

## 5.2 Training an LSTM model on both title and text

### 5.2.1 Training the word2vec model on the dataset

<li>There are certain characteristics distinct to fake news that would be helpful for prediction (e.g. the number of capitalised words, punctuation, etc.). However, since word2vec trains better on words in their raw form, we preprocess the text for this purpose here, and extract the distinctive characteristics separately to train them in another classification model.</li>
<li>Since the use of language and choice of vocabulary differs between real and fake news, we will not lemmatise/stem the words in our corpus</li>


In [None]:
# # only words (can have - or ') will be generated as tokens
# create a list of elements each containing a list of words from each article in the corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize

def corpus_to_vect(df_column):
    """Tokenise every document of a text column into a list of lower-case word tokens.

    A token is a run of letters, hyphens and apostrophes. English stop words,
    single-character tokens and tokens without any letter (e.g. '--') are dropped.

    Parameters
    ----------
    df_column : pandas.Series
        One string document per entry.

    Returns
    -------
    list of list of str
        One token list per document, in the order of `df_column`.
    """
    stop_words = set(stopwords.words("english"))
    # matches runs of letters, hyphens, and apostrophes
    rtokenizer = RegexpTokenizer(r'[a-zA-Z\'\-]+')
    corpus_tokens = []
    for doc in df_column.values:
        # Tokens never contain whitespace, so tokenising the whole lower-cased document
        # yields the same tokens as going sentence by sentence, without the cost of
        # sentence splitting.
        words = rtokenizer.tokenize(doc.lower())
        corpus_tokens.append([
            word for word in words
            if len(word) > 1
            and word not in stop_words              # filter out stopwords
            and any(ch.isalpha() for ch in word)    # filter out punctuation-only tokens
        ])
    return corpus_tokens

In [None]:
# tokenizing the 'all_text' column (title + ' ' + text) of the training and test splits
X_train_alltext = corpus_to_vect(X_train['all_text'])
X_test_alltext = corpus_to_vect(X_test['all_text'])

In [None]:
 # all tokens generated in the first article
print(X_train_alltext[0])

In [None]:
# training word2vec on the tokenised 'all_text' training corpus; its vectors are used
# later on to initialise the embedding layer
from gensim.models import Word2Vec

# dimensionality of the word vectors (also the width of the embedding weight matrix)
EMBEDDING_DIM = 100
w2v = Word2Vec(
    sentences = X_train_alltext,   # one token list per training article
    vector_size = EMBEDDING_DIM,
    window = 5,
    # presumably keeps even words that occur once, so that every training token
    # gets a vector -- confirm against the gensim docs
    min_count = 1
)

In [None]:
num = len(w2v.wv)
print('There are a total of %d words in the vocabulary of our trained word2vec model.' % num)

In [None]:
w2v.wv["donald"]

In [None]:
w2v.wv.most_similar("trump")

### 5.2.2 Preparing the neural network model inputs

In [None]:
X_train_alltext,X_test_alltext, vocab_alltext = tokenize(X_train_alltext,X_test_alltext)
max_vocab_alltext = len(vocab_alltext) + 1

In [None]:
# all token indexes generated from the first article
print(X_train_alltext[0])

In [None]:
# Distribution of article lengths (number of token indices) over the training split.
# The index sequences live in X_train_alltext; `token_indices` was never defined.
article_lens = np.array([len(seq) for seq in X_train_alltext])
plt.hist(article_lens, bins=500)
plt.show()

In [None]:
len_1000 = article_lens[article_lens < 1000]
print('%d out of %d articles have less than 1000 words' % (len(len_1000),len(article_lens)))

<li>Since the inputs to the neural network have to be of the same size, we have to pad each article in the dataset.</li>
<li>Since the majority of articles have less than 1000 words, we pad shorter news articles and truncate longer articles.</li>

In [None]:
max_len_alltext = 1000
X_train_alltext,X_test_alltext = pad(X_train_alltext,X_test_alltext,max_len_alltext)

In [None]:
# creating a weight matrix to retain weights learned by word2vec in the embedding layer later on
def get_weight_matrix(model, vocab):
    """Build the embedding weight matrix from a trained word2vec model.

    Parameters
    ----------
    model : object with a `wv` attribute
        `model.wv` must provide `vector_size`, support `word in model.wv` and
        return the word's vector for `model.wv[word]`.
    vocab : dict
        Mapping word -> row index, with indices starting at 1.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(vocab) + 1, vector size). Row 0 and the rows of words
        the model has no vector for are left as zeros.
    """
    # total vocabulary size plus one row for the reserved index 0
    vocab_size = len(vocab) + 1
    # the width comes from the model itself, so it cannot disagree with the vectors
    weight_matrix = np.zeros((vocab_size, model.wv.vector_size))
    # for each word in the vocab, store its vector created by the word2vec model
    for word, row_index in vocab.items():
        if word in model.wv:
            weight_matrix[row_index] = model.wv[word]
    return weight_matrix

In [None]:
embedding_vectors = get_weight_matrix(w2v, vocab_alltext)

# Training the neural network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout

lstm_model = Sequential([
    Embedding(max_vocab_alltext, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=max_len_alltext, trainable=False),
    Bidirectional(LSTM(units=128)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [None]:
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
lstm_model.summary()

In [None]:
history = lstm_model.fit(X_train_alltext, y_train, validation_split=0.2, epochs=4, callbacks=[early_stop])

In [None]:
prediction_probs = lstm_model.predict(X_test_alltext)
y_pred = (prediction_probs >= 0.5).astype("int")

In [None]:
from sklearn.metrics import classification_report, accuracy_score

accuracy_score(y_test, y_pred)

# Feature Extraction

In [None]:
news.head()

In [None]:
news.drop(labels=['title','text'],axis=1,inplace=True)
news.head()

In [None]:
news.shape

In [None]:
from sklearn.model_selection import train_test_split
features = news.loc[:,['subject','all_text', 'length']]
target = news.loc[:,'fake']

In [None]:
fake_news['title']

In [None]:
real_news['title']