In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from wordcloud import WordCloud 

In [95]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Conv1D,MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score

# **Exploring Fake News**


In [96]:
# Download the fake-news articles straight from the dataset's GitHub mirror.
FAKE_CSV_URL = 'https://raw.githubusercontent.com/ML-Deep-Learning/Fake_Real_news_dataset/main/Fake.csv'
fake = pd.read_csv(FAKE_CSV_URL)

In [None]:
# Preview the first rows of the fake-news frame.
fake.head()

In [None]:
# List the available columns (later cells use 'title', 'text' and 'subject').
fake.columns

In [None]:
# Number of fake articles per subject category.
fake['subject'].value_counts()

In [None]:
# Bar chart of how many fake articles fall under each subject.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='subject', data=fake, ax=ax)

# **WordCloud**

In [16]:
# Concatenate every fake article body into one space-separated string for the word cloud.
text = ' '.join(fake['text'])

In [None]:
# Build the word cloud from `text` and draw it edge-to-edge with the axes hidden.
wordcloud = WordCloud(width=2000, height=1100, margin=10).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

# **Exploring Real News**

In [87]:
# Download the real-news articles from the same GitHub mirror.
REAL_CSV_URL = 'https://raw.githubusercontent.com/ML-Deep-Learning/Fake_Real_news_dataset/main/True.csv'
real = pd.read_csv(REAL_CSV_URL)

In [19]:
# Rebind `text` to the concatenation of every real article body (overwrites the fake-news string).
text = ' '.join(real['text'])

In [None]:
# Same rendering as the fake-news cloud, now for the real-news `text`.
wordcloud = WordCloud(width=2000, height=1100, margin=10).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

# Cleaning data

In [21]:
# Collect the positions of real-news rows that do not start with a short
# "<publisher> - <body>" prefix. A row is flagged when it contains no '-' at
# all, or when the part before the first '-' is 120+ characters (too long to
# be a publisher prefix, so the '-' presumably belongs to the body).
#
# An explicit check replaces the earlier bare `except:` + `assert`, which hid
# unrelated errors and stops working when Python runs with -O.
MAX_PUBLISHER_LEN = 120
unknown_publishers = []
for index, row in enumerate(real.text.values):
    # str() keeps non-string cells (e.g. NaN) on the "unknown" path, as before.
    prefix, dash, _ = str(row).partition('-')
    if not dash or len(prefix) >= MAX_PUBLISHER_LEN:
        unknown_publishers.append(index)

In [None]:
# How many real articles have no recognisable publisher prefix.
len(unknown_publishers)

In [None]:
# Inspect the text of the rows flagged above (positional lookup, hence .iloc).
real.iloc[unknown_publishers].text

In [24]:
# Split each real article into (publisher, body) at the first '-'.
# Rows flagged in `unknown_publishers` keep their full text and get the
# placeholder publisher 'Unknown'.
unknown_positions = set(unknown_publishers)  # O(1) membership instead of a list scan per row
publisher = []
tmp_text = []

for index, row in enumerate(real.text.values):
    if index in unknown_positions:
        tmp_text.append(row)
        publisher.append('Unknown')
    else:
        # Every unflagged row is known to contain a '-' (see the cell above).
        prefix, _, body = row.partition('-')
        publisher.append(prefix.strip())
        tmp_text.append(body.strip())


In [26]:
# Attach the extracted publisher and replace the text with its publisher-free body.
real = real.assign(publisher=publisher, text=tmp_text)

In [None]:
# Check the new 'publisher' column and the trimmed 'text'.
real.head()

In [None]:
# (rows, columns) of the real-news frame after adding 'publisher'.
real.shape

In [28]:
# Positions of fake articles whose body is empty or whitespace-only.
empty_fake_index = [
    position
    for position, body in enumerate(fake.text.tolist())
    if not str(body).strip()
]

In [None]:
# Show the fake rows with an empty body; their titles are merged into 'text' in the next cell.
fake.iloc[empty_fake_index]

In [30]:
# Prepend each title to its body so the model sees both.
# Not idempotent: running this cell twice prepends the title twice.
for frame in (real, fake):
    frame['text'] = frame['title'] + " " + frame['text']

In [31]:
# Normalise case; str() first so any non-string cell (e.g. NaN) becomes text instead of raising.
real['text'] = real['text'].map(lambda value: str(value).lower())
fake['text'] = fake['text'].map(lambda value: str(value).lower())

# Preprocessing Text

In [32]:
# Target labels for the binary classifier: 1 = real news, 0 = fake news.
REAL_LABEL, FAKE_LABEL = 1, 0
real['class'] = REAL_LABEL
fake['class'] = FAKE_LABEL

In [33]:
# Keep only the model input and the label.
real = real.loc[:, ['text', 'class']]

In [34]:
# Keep only the model input and the label.
fake = fake.loc[:, ['text', 'class']]

In [35]:
# Stack the real (class 1) and fake (class 0) rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# gives the same result on every pandas version.
data = pd.concat([real, fake], ignore_index=True)

In [37]:
# Text-cleaning helpers used below come from preprocess_kgptalkie:
# https://github.com/laxmimerit/preprocess_kgptalkie

In [None]:
!pip install spacy==2.2.3
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4==4.9.1
!pip install textblob==0.15.3
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

In [39]:
import preprocess_kgptalkie as ps

In [40]:
# Strip special characters from every article with the preprocess_kgptalkie helper.
data['text'] = data['text'].apply(ps.remove_special_chars)

In [None]:
# Spot-check the cleaned text and labels.
data.head()

# Vectorization - Word2Vec

In [42]:
import gensim

In [43]:
# Label vector (1 = real, 0 = fake) as a NumPy array.
y = data['class'].to_numpy()

In [44]:
# Whitespace-tokenise every article: X becomes a list of token lists.
X = [document.split() for document in data['text']]

In [None]:
# Each element of X should be a list of tokens.
type(X[0])

In [None]:
# Tokens of the first article.
print(X[0])

In [47]:
DIM = 100  # word-vector dimensionality; reused as the Embedding output_dim below

# Train Word2Vec on the tokenised corpus. min_count=1 keeps every token, so
# each word the tokenizer later indexes has a vector.
# gensim 4.0 renamed the `size` argument to `vector_size`; try the new name
# first and fall back to the old one so the cell runs on either version.
try:
    w2v_model = gensim.models.Word2Vec(sentences=X, vector_size=DIM, window=10, min_count=1)
except TypeError:
    w2v_model = gensim.models.Word2Vec(sentences=X, size=DIM, window=10, min_count=1)

In [None]:
# Number of words with a learned vector (one row per word).
# `wv.vocab` was removed in gensim 4; `wv.vectors` exists in 3.x and 4.x.
len(w2v_model.wv.vectors)

In [None]:
# Peek at the first 20 vocabulary words instead of dumping the whole vocabulary.
# gensim >= 4 exposes the word list as `index_to_key`; 3.x calls it `index2word`.
vocab_words = getattr(w2v_model.wv, 'index_to_key', None) or w2v_model.wv.index2word
vocab_words[:20]

In [None]:
# The learned DIM-dimensional vector for one word (raises KeyError if 'usa' is not in the vocabulary).
w2v_model.wv['usa']

In [None]:
# Sanity-check the embeddings: nearest neighbours of 'india' in vector space.
w2v_model.wv.most_similar('india')

In [52]:
# Build the word -> integer index from the corpus.
# X is a list of token lists at this point (see the split() cell above).
tokeniser = Tokenizer()
tokeniser.fit_on_texts(X)

In [53]:
# Replace every token with its integer id.
# NOTE: this rebinds X from token lists to id lists, so the cell is not safe to re-run on its own.
X= tokeniser.texts_to_sequences(X)

In [None]:
# Full word -> id mapping. NOTE(review): prints one entry per vocabulary word, so the output can be very large.
tokeniser.word_index

In [None]:
# Distribution of article lengths (in tokens), used to choose the padding length.
seq_lengths = list(map(len, X))
plt.hist(seq_lengths, bins=700)
plt.show()

In [None]:
# How many articles are longer than 1000 tokens (these get truncated by the padding step).
nos = np.array(list(map(len, X)))
int(np.count_nonzero(nos > 1000))

In [57]:
# Force every sequence to exactly `maxlen` ids so they form one rectangular array.
maxlen = 1000
# NOTE(review): padding/truncation side is left at the pad_sequences defaults — confirm against the Keras docs.
X = pad_sequences(X, maxlen=maxlen)

In [None]:
# Spot-check one row: its length should now equal maxlen.
len(X[101])

In [59]:
# word -> id mapping, and the number of embedding rows needed:
# one per word plus one extra row for id 0, which no word is assigned.
vocab = tokeniser.word_index
vocab_size = 1 + len(vocab)

In [60]:
def get_weight_matrix(model, word_index=None, num_words=None, dim=None):
  """Build the initial Embedding weight matrix from a trained Word2Vec model.

  Row ``i`` of the result holds the vector of the word whose tokenizer id is
  ``i``. Rows that never receive a vector stay all-zero: that is row 0 (no
  word in the index maps to it) and the row of any word that ``model.wv``
  does not contain.

  Args:
    model: object exposing ``model.wv``, supporting ``word in model.wv`` and
      ``model.wv[word]`` (e.g. a gensim Word2Vec model).
    word_index: mapping of word -> integer row. Defaults to the notebook-level
      ``vocab``.
    num_words: number of rows in the matrix. Defaults to the notebook-level
      ``vocab_size``.
    dim: vector length. Defaults to the notebook-level ``DIM``.

  Returns:
    ``np.ndarray`` of shape ``(num_words, dim)``.
  """
  # Fall back to the notebook globals only when the caller passes nothing,
  # so the original one-argument call keeps working unchanged.
  word_index = vocab if word_index is None else word_index
  num_words = vocab_size if num_words is None else num_words
  dim = DIM if dim is None else dim

  weight_matrix = np.zeros((num_words, dim))

  for word, i in word_index.items():
    # A word missing from Word2Vec keeps its zero row instead of raising KeyError.
    if word in model.wv:
      weight_matrix[i] = model.wv[word]

  return weight_matrix

In [61]:
# Word2Vec vectors arranged by tokenizer id, used to initialise the Embedding layer.
embedding_vectors = get_weight_matrix(w2v_model)

In [None]:
# Expected to be (vocab_size, DIM).
embedding_vectors.shape

In [63]:
# Binary classifier: Word2Vec-initialised (and still trainable) embeddings
# -> single LSTM -> sigmoid output, read as P(class == 1), i.e. "real".
model = Sequential()
model.add(Embedding(vocab_size, output_dim=DIM, weights=[embedding_vectors], input_length=maxlen, trainable=True))
model.add(LSTM(units=128))
model.add(Dense(1, activation='sigmoid'))
# `metrics` is documented as a list; a bare string is not accepted by every Keras release.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Layer-by-layer overview with parameter counts.
model.summary()

In [65]:
# Hold out a test set (scikit-learn's default split size).
# random_state makes the split, and therefore the reported metrics, reproducible
# across kernel restarts; stratify=y keeps the real/fake ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Train for 6 epochs, holding back 30% of the training split for validation.
model.fit(X_train,y_train,validation_split=0.3, epochs=6)

In [67]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
predicted_proba = model.predict(X_test)
y_pred = (predicted_proba >= 0.5).astype(int)

In [None]:
# Fraction of test articles classified correctly.
accuracy_score(y_test , y_pred)

In [None]:
# Per-class precision / recall / F1 (class 0 = fake, class 1 = real).
print(classification_report(y_test,y_pred))

In [None]:
# Padded id sequences of the held-out articles.
X_test

# Example: Classifying a Single Headline

In [101]:
# Classify one unseen headline (output 1 = real, 0 = fake).
# The headline must go through the same steps as the training text: lowercase,
# remove_special_chars, whitespace split. Otherwise tokens such as "ex-top"
# are split differently from training and may not map to the ids the model learned.
x = ['House panel asks Trump ex-top aide Bannon to testify: Bloomberg']
x = [ps.remove_special_chars(headline.lower()).split() for headline in x]
x = tokeniser.texts_to_sequences(x)
x = pad_sequences(x, maxlen=maxlen)
(model.predict(x) >= 0.5).astype(int)

array([[0]])