In [31]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [32]:
# Step 2: Load dataset
df = pd.read_csv("/content/fake_and_real_news.csv")

# Quick sanity check: first rows, then the number of articles per class
for summary in (df.head(), df['label'].value_counts()):
    print(summary)

                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real
label
Fake    5000
Real    4900
Name: count, dtype: int64


In [33]:
# Step 3: Clean text
def clean_text(text):
    """Normalize one raw news text before tokenization.

    Lower-cases the input, strips URLs, replaces every non-letter character
    with a space, and collapses runs of whitespace into single spaces.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            a missing value (``None`` or a float NaN) is treated as empty.

    Returns:
        The cleaned text (ASCII letters separated by single spaces), or
        ``""`` when the input is missing.
    """
    # A missing cell would otherwise be stringified into the literal word
    # "none" / "nan" and be learned as if it were a real token.
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove links
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
    return text

# Run the cleaner over every article, element by element
df['Text'] = df['Text'].map(clean_text)


In [34]:
# Step 4: Encode labels
# Resulting integer codes: Fake -> 0, Real -> 1
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])

In [35]:
# Step 5: Train-test split
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [36]:
# Step 6: Tokenize
# The vocabulary is learned from the training texts only; num_words caps the
# vocabulary at 5000 and other words are encoded as the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=5000,
)
tokenizer.fit_on_texts(X_train)

In [37]:
# Turn each split's texts into lists of integer word ids (train first, then test)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
)

In [38]:
# Fixed length, in tokens, of every model input: the padding step below pads
# or truncates each sequence to this many tokens.
max_len = 200

In [39]:
# Pad (with trailing zeros) or truncate (dropping the tail) every sequence to
# exactly max_len tokens, for the train split and then the test split.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)


In [40]:
# Step 7: Model
# Word embeddings feeding two stacked bidirectional LSTMs, followed by a small
# dense head whose single sigmoid unit outputs the probability of label 1.
model = Sequential([
    # Declare the (max_len,) integer input explicitly instead of passing the
    # deprecated `input_length` argument to Embedding; the model is still
    # built with a known input shape.
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    # Take the vocabulary size from the tokenizer so the embedding table
    # cannot drift out of sync with the tokenizer's `num_words` cap.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),  # per-token outputs feed the next LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),  # returns only the last output of each direction
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # binary classification
])



In [41]:
# Binary cross-entropy pairs with the single sigmoid output unit
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Step 8: Train
# NOTE: the held-out test split is also passed as validation data, so the
# per-epoch val_* numbers are computed on the same rows used for evaluation.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
    verbose=1,
)

Epoch 1/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 538ms/step - accuracy: 0.8986 - loss: 0.2238 - val_accuracy: 0.9995 - val_loss: 0.0032
Epoch 2/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 486ms/step - accuracy: 0.9991 - loss: 0.0027 - val_accuracy: 0.9990 - val_loss: 0.0048
Epoch 3/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 502ms/step - accuracy: 1.0000 - loss: 4.1209e-04 - val_accuracy: 0.9995 - val_loss: 0.0028
Epoch 4/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 511ms/step - accuracy: 1.0000 - loss: 8.9901e-05 - val_accuracy: 0.9995 - val_loss: 0.0030
Epoch 5/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 505ms/step - accuracy: 1.0000 - loss: 4.6068e-05 - val_accuracy: 0.9995 - val_loss: 0.0033


In [43]:
# Step 9: Evaluate
# evaluate() yields the loss followed by the compiled metric (accuracy)
test_metrics = model.evaluate(X_test_pad, y_test, verbose=0)
loss, acc = test_metrics
print(f"✅ Test Accuracy: {acc*100:.2f}%")


✅ Test Accuracy: 99.95%


In [44]:
# Step 10: Save model
# Persist the trained network to disk under a ".h5" file name
model.save(filepath="fake_news_model.h5")



In [46]:
import pickle
# Persist the fitted tokenizer so the same word index can be reused later
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# NOTE(review): this cell rebinds df, X_train, X_test, y_train and y_test,
# which the LSTM cells above also use; after it runs those names refer to the
# raw (uncleaned) text of a freshly loaded frame.

# 1. Load dataset
df = pd.read_csv("/content/fake_and_real_news.csv")   # replace with your file name
df.columns = ["text", "label"]  # make sure columns are correct

# 2. Encode labels (Fake = 0, Real = 1)
# Series.map() turns any value missing from the dict into NaN without warning,
# so validate the result before it replaces the label column.
label_map = {'Fake': 0, 'Real': 1}
encoded_labels = df['label'].map(label_map)
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected 'Fake' or 'Real'): {bad_values}"
    )
df['label'] = encoded_labels

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# 4. TF-IDF Vectorization
# Fit on the training texts only, then reuse the learned vocabulary and IDF
# weights for the test texts.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 5. Save vectorizer as pickle
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("✅ vectorizer.pkl file created successfully")


✅ vectorizer.pkl file created successfully
