<a href="https://colab.research.google.com/github/Jasmeet100/Fake-news-detector/blob/main/fake_n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from wordcloud import WordCloud

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Fake news

In [None]:
# Source of the fake-news articles (CSV hosted on GitHub).
FAKE_CSV_URL = "https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/refs/heads/main/data/Fake.csv"
fake = pd.read_csv(FAKE_CSV_URL)

In [None]:
# Concatenate every fake article body into one space-separated string for the word cloud.
text = " ".join(fake["text"])

In [None]:
# Build the word cloud from the joined text and draw it on a square, axis-free figure.
wordcloud = WordCloud(width=1920, height=1080).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)

# Real news

In [None]:
# Source of the real-news articles (CSV hosted on GitHub).
REAL_CSV_URL = 'https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/refs/heads/main/data/True.csv'
real = pd.read_csv(REAL_CSV_URL)

In [None]:
# Concatenate every real article body into one space-separated string for the word cloud.
text = " ".join(real["text"])

In [None]:
# Build the word cloud from the joined text and draw it on a square, axis-free figure.
wordcloud = WordCloud(width=1920, height=1080).generate(text)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud)
ax.axis('off')
fig.tight_layout(pad=0)

# Cleaning Data

In [None]:
#real.sample(4)

In [None]:
# Collect the positional indices of real articles whose text does not start
# with a "<publisher> - <body>" prefix. A row is flagged when:
#   * it is not a string (it cannot be split),
#   * it contains no '-' at all, or
#   * the part before the first '-' is 120 characters or longer, i.e. too
#     long to plausibly be a publisher tag.
# Explicit conditions replace the previous bare `except:` + `assert`, which
# swallowed every error and stopped working when asserts are disabled (-O).
unknown_publisher = []
for index, row in enumerate(real.text.values):
  if not isinstance(row, str):
    unknown_publisher.append(index)
    continue

  record = row.split('-', maxsplit=1)
  if len(record) < 2 or len(record[0]) >= 120:
    unknown_publisher.append(index)


In [None]:
# Number of row positions collected in `unknown_publisher`.
len(unknown_publisher)

In [None]:
# Show the text of the rows flagged in `unknown_publisher` (selected by position).
real.iloc[unknown_publisher].text

In [None]:
# NOTE(review): `DataFrame.drop` returns a new frame and the result is not
# assigned here, so `real` itself is left unchanged; this cell only displays
# the frame without the row labelled 8970. Assigning the result would shift the
# positional indices stored in `unknown_publisher`, so recompute them if the
# row really has to be removed.
real.drop(8970, axis=0)

In [None]:
# Split every real article into (publisher, body).
# Rows flagged in `unknown_publisher` keep their full text and get the
# placeholder publisher 'Unknown'; all other rows are split at the first '-'.
publisher = []
tmp_txt = []

# Set lookup is O(1) per row; scanning the list for every article is quadratic.
unknown_positions = set(unknown_publisher)

for index, row in enumerate(real.text.values):
  if index in unknown_positions:
    tmp_txt.append(row)
    publisher.append('Unknown')
  else:
    record = row.split('-', maxsplit=1)
    publisher.append(record[0].strip())
    tmp_txt.append(record[1].strip())

In [None]:
# Attach the extracted publisher and replace the text with the publisher-free body.
real = real.assign(publisher=publisher, text=tmp_txt)

In [None]:
# Preview the first rows of the real-news frame.
real.head()

In [None]:
# Positions of fake articles whose text is empty once surrounding whitespace is removed.
empty_fake_index = [
    position
    for position, body in enumerate(fake.text.tolist())
    if not str(body).strip()
]

In [None]:
# Show the fake rows whose text is blank (selected by position).
fake.iloc[empty_fake_index]

In [None]:
# Append the title to the article body so both contribute to the features.
# A single space separates them; joining with "" would fuse the last word of
# the body with the first word of the title into one bogus token.
real['text'] = real['text'] + " " + real['title']
fake['text'] = fake['text'] + " " + fake['title']

In [None]:
def _to_lower(value):
  """Return ``value`` coerced to ``str`` and lower-cased."""
  return str(value).lower()

# Normalise case in both corpora.
real["text"] = real["text"].map(_to_lower)
fake["text"] = fake["text"].map(_to_lower)

# Preprocessing

In [None]:
# Label convention: 1 = real article, 0 = fake article.
real['class'], fake['class'] = 1, 0

In [None]:
# Keep only the model input (text) and the target (class).
kept_columns = ['text', 'class']
real = real[kept_columns]
fake = fake[kept_columns]

In [None]:
# Stack real and fake articles into one frame with a fresh 0..n-1 index.
data = pd.concat((real, fake), axis=0, ignore_index=True)

# Vectorisation

In [None]:
!pip install numpy==1.26.4 # Install compatible numpy version
!pip install gensim
!pip install spacy==3.7.6
!python -m spacy download en_core_web_sm

In [None]:
import gensim #word to vector conversion

In [None]:
# Target vector: class labels as a NumPy array.
y = data['class'].to_numpy()

In [None]:
# Whitespace-tokenise every document: X becomes a list of token lists.
X = [document.split() for document in data['text']]

In [None]:
# Dimensionality of the word vectors; also used for the embedding matrix below.
DIM = 100
# Train Word2Vec on the tokenised corpus.
#   vector_size=DIM : length of each word vector
#   window=10       : context window size passed to gensim
#   min_count=1     : no frequency cut-off, so every token seen in X gets a vector
# NOTE(review): no seed is passed, so the learned vectors may differ between runs.
w2v_model = gensim.models.Word2Vec(sentences=X, vector_size=DIM, window=10, min_count=1)

In [None]:
#w2v_model.wv.most_similar('trump')

In [None]:
# Fit a Keras Tokenizer on the token lists to build the word -> integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [None]:
# Replace every token with its integer index from the fitted tokenizer.
# NOTE: this rebinds X, so re-running this cell without first re-running the
# cell that builds the token lists would operate on already-converted data.
X = tokenizer.texts_to_sequences(X)

In [None]:
# Fixed sequence length fed to the network.
maxlen = 1000
# Pad or truncate every sequence to exactly `maxlen` integers
# (Keras defaults apply for the padding/truncation side and the pad value).
X = pad_sequences(X, maxlen = maxlen)

In [None]:
#len(X[100])

In [None]:
# +1 because the Keras Tokenizer assigns word indices starting at 1; index 0 is
# reserved (it is the value pad_sequences uses for padding), so the embedding
# table needs one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1
# Mapping word -> integer index learned by the tokenizer.
vocab = tokenizer.word_index

In [None]:
def get_weight_matrix(model, word_index=None, dim=None):
  """Build the embedding weight matrix from a trained word-vector model.

  Row ``i`` of the result holds the vector of the word whose tokenizer index
  is ``i``. Row 0 stays all zeros (index 0 is not assigned to any word), and
  so does the row of any word for which ``model.wv`` has no vector.

  Args:
    model: object exposing ``model.wv[word]`` (e.g. a gensim Word2Vec model).
    word_index: mapping word -> integer index (indices 1..len). Defaults to
      the notebook-level ``vocab``.
    dim: length of each word vector. Defaults to the notebook-level ``DIM``.

  Returns:
    ``numpy.ndarray`` of shape ``(len(word_index) + 1, dim)``.
  """
  # Fall back to the notebook globals so existing one-argument calls keep working.
  if word_index is None:
    word_index = vocab
  if dim is None:
    dim = DIM

  weight_matrix = np.zeros((len(word_index) + 1, dim))

  for word, i in word_index.items():
    try:
      weight_matrix[i] = model.wv[word]
    except KeyError:
      # Word unknown to the vector model: leave its row as zeros instead of
      # aborting the whole matrix construction.
      continue

  return weight_matrix

In [None]:
# Matrix whose row i is the Word2Vec vector of the word with tokenizer index i;
# used below as the initial weights of the Embedding layer.
embedding_vectors = get_weight_matrix(w2v_model)

In [None]:
# Binary classifier: frozen Word2Vec embeddings -> LSTM -> sigmoid output.
model = Sequential()
# Embedding layer initialised with the Word2Vec matrix and frozen
# (trainable=False), so only the LSTM and Dense weights are learned.
# NOTE(review): `input_length` is reported as deprecated in newer Keras
# releases — confirm against the installed version.
model.add(Embedding(vocab_size, output_dim=DIM, weights = [embedding_vectors], input_length = maxlen, trainable=False))
model.add(LSTM(units=128))
# Single sigmoid unit: a value in (0, 1) for class 1 (real) vs class 0 (fake).
model.add(Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the single sigmoid output; 'acc' reports accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()


In [None]:
# Hold out a test set (scikit-learn's default split size).
#   random_state : fixed seed so the split, and therefore the reported metrics,
#                  are reproducible across Restart & Run All.
#   stratify=y   : keep the real/fake ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Train for 6 epochs with mini-batches of 64 samples.
# NOTE(review): the test split doubles as validation data here, so the val_*
# numbers printed during training are computed on the same rows that are used
# for the final evaluation below.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=6, batch_size=64)

Epoch 1/6
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m963s[0m 2s/step - acc: 0.9313 - loss: 0.1621 - val_acc: 0.9850 - val_loss: 0.0437
Epoch 2/6
[1m 34/527[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m13:13[0m 2s/step - acc: 0.9829 - loss: 0.0421

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 threshold.
probabilities = model.predict(X_test)
y_pred = (probabilities >= 0.5).astype(int)

In [None]:
# Fraction of test articles whose thresholded prediction (y_pred) matches the true label.
accuracy_score(y_test,y_pred)

In [None]:
# Per-class precision, recall and F1 on the test set (class 0 = fake, 1 = real).
print(classification_report(y_test, y_pred))

In [None]:
# Prepare one sample headline exactly like the training data:
# tokenizer indices, then padding to `maxlen`.
sample_texts = ["this is a news"]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
n = pad_sequences(sample_sequences, maxlen=maxlen)

In [None]:
# Classify the sample: threshold the sigmoid output at 0.5 (1 = real, 0 = fake).
(model.predict(n)>=0.5).astype(int)