<a href="https://colab.research.google.com/github/Nakdimon/fakeNewsClassifier/blob/main/fakeNewsLSTM/fakeNewsLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from wordcloud import WordCloud

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Download the two public fake/real news corpora.
sourceFake, sourceReal = (
  pd.read_csv(csv_url)
  for csv_url in (
    'https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/main/data/Fake.csv',
    'https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/main/data/True.csv',
  )
)
# Download this project's own train/test CSV files.
sourceTrain, sourceTest = (
  pd.read_csv(csv_url)
  for csv_url in (
    'https://raw.githubusercontent.com/Nakdimon/fakeNewsClassifier/main/data/train.csv',
    'https://raw.githubusercontent.com/Nakdimon/fakeNewsClassifier/main/data/test.csv',
  )
)

In [None]:
# Work on independent copies of the downloaded frames. `fake` is modified in
# place further down (its 'text' column is rewritten and a 'class' column is
# added); without a copy those edits would also change `sourceFake`, and
# re-running this cell would not restore the raw data.
fake = sourceFake.copy()
real = sourceReal.copy()

In [None]:
# Working names for the project's train and test frames.
train, test = sourceTrain, sourceTest

In [None]:
# Normalise the column names, then move `label` so it becomes the last column.
train = train.rename(columns={'Labels': 'label', 'Text': 'text'})
labelCol = train['label']
train = train.drop(columns='label').join(labelCol)

In [None]:
# Collapse the six cluster ids into a binary label: 0, 1, 2 -> 0 and 3, 4, 5 -> 1.
binary_label_map = {cluster: int(cluster >= 3) for cluster in range(6)}
train['label'] = train['label'].replace(binary_label_map)

In [None]:
# Split the training frame by binary label: 1 -> train_real, 0 -> train_fake.
# .copy() makes each subset an independent frame, so the column assignments
# performed on them later in the notebook do not raise pandas'
# SettingWithCopyWarning.
train_real = train[train['label'] == 1].copy()
train_fake = train[train['label'] == 0].copy()

In [None]:
# Use the same lowercase `text` column name as the other dataframes.
test = test.rename({'Text': 'text'}, axis='columns')

In [None]:
# Number of fake-news rows for each distinct `subject` value.
fake['subject'].value_counts()

In [None]:
# Bar chart of how many fake-news rows fall under each subject.
plt.figure(figsize=(10, 6))
sns.countplot(data=fake, x='subject')

In [None]:
# Concatenate every article of each dataset into one long string; these
# strings feed the word clouds below.
text = ' '.join(fake['text'])         # public fake-news corpus
text2 = ' '.join(train_real['text'])  # training rows with label 1
text5 = ' '.join(train_fake['text'])  # training rows with label 0
text3 = ' '.join(test['text'])        # project test set
text4 = ' '.join(real['text'])        # public real-news corpus

In [None]:
# Word cloud of the public fake-news corpus (`text`).
fig = plt.figure(figsize=(10, 20))
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud)

In [None]:
# Word cloud of the public real-news corpus (`text4` is built from `real`).
fig = plt.figure(figsize=(10, 20))
wordcloud = WordCloud().generate(text4)
plt.imshow(wordcloud)

In [None]:
# Word cloud of the label-1 training rows (`text2` is built from `train_real`).
fig = plt.figure(figsize=(10, 20))
wordcloud = WordCloud().generate(text2)
plt.imshow(wordcloud)

In [None]:
# Word cloud of the label-0 training rows (`text5` is built from `train_fake`).
fig = plt.figure(figsize=(10, 20))
wordcloud = WordCloud().generate(text5)
plt.imshow(wordcloud)

In [None]:
# Word cloud of the project test set (`text3`).
fig = plt.figure(figsize=(10, 20))
wordcloud = WordCloud().generate(text3)
plt.imshow(wordcloud)

In [None]:
# Collect the positions of real-news rows whose text has no usable
# "<publisher> - <body>" prefix, so they can be tagged 'Unknown' later.
# A row is flagged when it is not a string, contains no '-', or the part
# before the first '-' is 120 characters or longer (presumably too long to be
# a publisher tag).
# Explicit checks replace the earlier assert / bare-except control flow: the
# assert is stripped under `python -O`, and the bare except hid any error.
unknown_publishers = []
for index, row in enumerate(real.text.values):
  if not isinstance(row, str):
    unknown_publishers.append(index)
    continue
  record = row.split('-')
  if len(record) < 2 or len(record[0]) >= 120:
    unknown_publishers.append(index)

In [None]:
# Collect the positions of label-1 training rows (`train_real`) whose text has
# no usable "<publisher> - <body>" prefix, so they can be tagged 'Unknown'.
# A row is flagged when it is not a string, contains no '-', or the part
# before the first '-' is 120 characters or longer (presumably too long to be
# a publisher tag).
# Explicit checks replace the earlier assert / bare-except control flow: the
# assert is stripped under `python -O`, and the bare except hid any error.
unknown_publishers_dataSet2 = []
for index, row in enumerate(train_real.text.values):
  if not isinstance(row, str):
    unknown_publishers_dataSet2.append(index)
    continue
  record = row.split('-')
  if len(record) < 2 or len(record[0]) >= 120:
    unknown_publishers_dataSet2.append(index)

In [None]:
# How many real-news rows were flagged as lacking a publisher prefix.
len(unknown_publishers)

In [None]:
# Same count for the label-1 training rows.
len(unknown_publishers_dataSet2)

In [None]:
# Text of the flagged real-news rows (positional lookup).
real.iloc[unknown_publishers].text

In [None]:
# Flagged label-1 training rows (positional lookup).
train_real.iloc[unknown_publishers_dataSet2]

In [None]:
# Inspect the row at position 8970, which the next cell drops.
real.iloc[8970]

In [None]:
# Remove the row whose index label is 8970 from the real-news frame.
real = real.drop(index=8970)

In [None]:
# Split every real-news text into publisher and body at the first '-'.
# Rows listed in `unknown_publishers` keep their full text and get 'Unknown'.
#
# `unknown_publishers` stores row positions in the frame as it was loaded.
# After a row is dropped, positions shift but index labels do not. `real`
# comes from pd.read_csv without an index column, so its labels equal the
# original positions; matching on `real.index` keeps each flag on its row.
# A set is used so each membership test is O(1).
unknown_publisher_rows = set(unknown_publishers)
publisher = []
tmp_text = []

for index, row in zip(real.index, real.text.values):
  if index in unknown_publisher_rows:
    tmp_text.append(row)
    publisher.append('Unknown')
  else:
    record = row.split('-', maxsplit=1)
    publisher.append(record[0].strip())
    tmp_text.append(record[1].strip())

In [None]:
# Add the extracted publisher column and replace the text with the body only.
real = real.assign(publisher=publisher, text=tmp_text)

In [None]:
# Split every label-1 training text into publisher and body at the first '-'.
# Rows listed in `unknown_publishers_dataSet2` (positions within `train_real`)
# keep their full text and get the publisher 'Unknown'.
# The flags go into a set once, so each lookup in the loop is O(1) instead of
# scanning the list for every row.
unknown_positions_ds2 = set(unknown_publishers_dataSet2)
publisher_ds2 = []
tmp_text_ds2 = []

for index, row in enumerate(train_real.text.values):
  if index in unknown_positions_ds2:
    tmp_text_ds2.append(row)
    publisher_ds2.append('Unknown')
  else:
    record = row.split('-', maxsplit=1)
    publisher_ds2.append(record[0].strip())
    tmp_text_ds2.append(record[1].strip())

In [None]:
# Store the extracted publisher and the body-only text on the training subset.
train_real['publisher'], train_real['text'] = publisher_ds2, tmp_text_ds2

In [None]:
# Positions of fake-news rows whose text is blank once whitespace is stripped.
emtpy_fake_index = [
  pos
  for pos, body in enumerate(fake.text.tolist())
  if not str(body).strip()
]

In [None]:
# Positions of label-0 training rows whose text is blank after stripping.
empty_train_fake_index = [
  pos
  for pos, body in enumerate(train_fake.text.tolist())
  if not str(body).strip()
]

In [None]:
# Show the fake-news rows whose text was found to be blank.
fake.iloc[emtpy_fake_index]

In [None]:
# Prefix each article's text with its title, separated by a single space.
for frame in (real, fake):
  frame['text'] = frame['title'] + " " + frame['text']

In [None]:
# Lower-case the text of both public corpora; str() also turns any
# non-string cell into text first.
for frame in (real, fake):
  frame['text'] = frame['text'].map(lambda value: str(value).lower())

In [None]:
# Preview the first rows of the processed real-news frame.
real.head()

In [None]:
# Lower-case the text of both training subsets; str() also turns any
# non-string cell into text first.
for subset in (train_real, train_fake):
  subset['text'] = subset['text'].map(lambda value: str(value).lower())

In [None]:
# Preprocessing text

In [None]:
# Target column for the public corpora: 1 for real news, 0 for fake news.
real['class'], fake['class'] = 1, 0

In [None]:
# Keep only the model input (`text`) and the target (`class`).
real = real.loc[:, ['text', 'class']]

In [None]:
# Keep text + label and rename the label to `class` so the columns line up
# with the public corpora.
train_real = (
  train_real[['text', 'label']]
  .rename(columns={'label': 'class', 'Text': 'text'})
)

In [None]:
# Keep only the model input (`text`) and the target (`class`).
fake = fake.loc[:, ['text', 'class']]

In [None]:
# Keep text + label and rename the label to `class` so the columns line up
# with the public corpora.
train_fake = (
  train_fake[['text', 'label']]
  .rename(columns={'label': 'class', 'Text': 'text'})
)

In [None]:
# Stack all four frames into one dataset with a fresh 0..n-1 index.
frames_to_stack = [real, fake, train_real, train_fake]
data = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# (rows, columns) of the combined dataset.
data.shape

In [None]:
# Five randomly chosen rows as a sanity check.
data.sample(5)

In [None]:
# Notebook shell commands: install preprocess_kgptalkie from GitHub together
# with pinned versions of spacy, beautifulsoup4 and textblob, and download
# the spaCy English model.
!pip install spacy==2.2.3
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4==4.9.1
!pip install textblob==0.15.3
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

In [None]:
import preprocess_kgptalkie as ps

In [None]:
# Run every text through preprocess_kgptalkie's special-character remover.
data['text'] = data['text'].apply(ps.remove_special_chars)

In [None]:
import gensim

In [None]:
# Target vector as a NumPy array.
y = data['class'].to_numpy()

In [None]:
# Whitespace-tokenise every document: one list of words per article.
X = [document.split() for document in data['text']]

In [None]:
# Number of tokenised documents.
len(X)

In [None]:
# Dimensionality of the word vectors (also used for the embedding matrix).
DIM = 100
# Train Word2Vec on the tokenised corpus. gensim 4.0 renamed the `size`
# keyword to `vector_size`, so choose the keyword from the installed major
# version; the cell then runs on both gensim 3.x and 4.x.
_w2v_size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
w2v_model = gensim.models.Word2Vec(
  sentences=X,
  window=10,
  min_count=1,
  **{_w2v_size_kwarg: DIM},
)

In [None]:
# Fit a Keras Tokenizer on the tokenised documents to build the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [None]:
# Replace every word by its integer id. The word -> id mapping is available
# as tokenizer.word_index.
X = tokenizer.texts_to_sequences(X)

In [None]:
# Histogram of document lengths (in tokens), used to pick a padding length.
sequence_lengths = [len(sequence) for sequence in X]
plt.hist(sequence_lengths, bins=700)
plt.show()

In [None]:
# How many documents are longer than 1000 tokens.
nos = np.array([len(sequence) for sequence in X])
int(np.count_nonzero(nos > 1000))

In [None]:
# Bring every sequence to exactly `maxlen` ids (pad_sequences pads short
# sequences and truncates long ones).
maxlen = 1000
X = pad_sequences(X, maxlen = maxlen)

In [None]:
# Spot check: the length of one padded sequence.
len(X[101])

In [None]:
# word -> integer id mapping learned by the tokenizer.
vocab = tokenizer.word_index
# One more row than there are words (presumably so that id 0, unused by
# word_index, stays available for padding).
vocab_size = len(vocab) + 1

In [None]:
def get_weight_matrix(model, word_index=None, num_rows=None, dim=None):
  """Build the initial weight matrix for the Embedding layer.

  Row ``i`` of the result holds the Word2Vec vector of the word whose
  tokenizer id is ``i``; rows with no word (such as row 0) stay all-zero.

  Args:
    model: object exposing ``model.wv[word]`` -> vector (a trained Word2Vec
      model in this notebook).
    word_index: mapping word -> integer id. Defaults to the notebook-level
      ``vocab``.
    num_rows: number of rows of the matrix. Defaults to the notebook-level
      ``vocab_size`` when ``word_index`` is not given, otherwise to the
      largest id in ``word_index`` plus one.
    dim: vector length. Defaults to the notebook-level ``DIM``.

  Returns:
    A ``(num_rows, dim)`` NumPy array of floats.
  """
  if num_rows is None:
    if word_index is None:
      num_rows = vocab_size
    else:
      num_rows = max(word_index.values(), default=0) + 1
  if word_index is None:
    word_index = vocab
  if dim is None:
    dim = DIM

  weight_matrix = np.zeros((num_rows, dim))

  for word, i in word_index.items():
    weight_matrix[i] = model.wv[word]

  # Return only after every word has been copied. The earlier version returned
  # from inside the loop, so just one row was ever filled.
  return weight_matrix

In [None]:
# Matrix of Word2Vec vectors indexed by tokenizer id; it is passed as the
# initial weights of the Embedding layers below.
embedding_vectors = get_weight_matrix(w2v_model)

In [None]:
# Shape of the embedding matrix.
embedding_vectors.shape

In [None]:
# LSTM classifier whose embedding starts from the Word2Vec vectors and is
# fine-tuned during training; dropout is applied before and after the LSTM.
trainableModel = Sequential([
  Embedding(vocab_size, output_dim=DIM, weights=[embedding_vectors], input_length=maxlen, trainable=True),
  Dropout(0.7),
  LSTM(units=128),
  Dropout(0.7),
  Dense(1, activation='sigmoid'),
])
trainableModel.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['acc'],
)

In [None]:
# LSTM classifier with the Word2Vec embedding frozen (trainable=False) and
# no dropout layers.
nonTrainableModel = Sequential([
  Embedding(vocab_size, output_dim=DIM, weights=[embedding_vectors], input_length=maxlen, trainable=False),
  LSTM(units=128),
  Dense(1, activation='sigmoid'),
])
nonTrainableModel.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['acc'],
)

In [None]:
# Print the layer-by-layer summary of the trainable-embedding model.
trainableModel.summary()

In [None]:
# Hold out a test set. No random_state is passed, so the split differs
# between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
# Train the trainable-embedding model for 6 epochs, validating on 30% of the
# training data.
trainableModel.fit(X_train, y_train, validation_split = 0.3, epochs = 6)

In [None]:
# Train the frozen-embedding model with the same settings.
nonTrainableModel.fit(X_train, y_train, validation_split=0.3, epochs = 6)

In [None]:
# Turn the sigmoid outputs into 0/1 predictions with a 0.5 threshold.
trainable_scores = trainableModel.predict(X_test)
trainable_y_pred = (trainable_scores >= 0.5).astype(int)
nonTrainable_scores = nonTrainableModel.predict(X_test)
nonTrainable_y_pred = (nonTrainable_scores >= 0.5).astype(int)

In [None]:
# Test accuracy of the trainable-embedding model. Arguments follow sklearn's
# (y_true, y_pred) order, matching the classification_report calls.
accuracy_score(y_test, trainable_y_pred)

In [None]:
# Test accuracy of the frozen-embedding model. Arguments follow sklearn's
# (y_true, y_pred) order, matching the classification_report calls.
accuracy_score(y_test, nonTrainable_y_pred)

In [None]:
# Per-class precision / recall / F1 for the trainable-embedding model.
print(classification_report(y_test, trainable_y_pred))

In [None]:
# Same report for the frozen-embedding model.
print(classification_report(y_test, nonTrainable_y_pred))


In [None]:
# Persist both trained models as HDF5 files under /content.
for network, target_path in (
  (trainableModel, '/content/trainableModel.h5'),
  (nonTrainableModel, '/content/nonTrainableModel.h5'),
):
  network.save(target_path)