In [9]:
import pandas as pd

# Load both source files and tag every row with its class:
# label 0 = fake article, label 1 = true article.
fake_df = pd.read_csv('Fake.csv').assign(label=0)
true_df = pd.read_csv('True.csv').assign(label=1)

# Stack the two frames (true rows first), then shuffle all rows with a fixed
# seed and renumber the index from 0.
df = (
    pd.concat([true_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Total number of rows in the combined frame.
df.shape[0]

44898

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed below, without progress output.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name, quiet=True)

# Shared, module-level helpers used by clean_text: the English stop-word list
# (as a set for fast membership tests) and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lowercased, URLs and punctuation are removed, the result is
    split on whitespace, tokens found in the module-level ``stop_words`` set
    are dropped, and every remaining token is passed through the module-level
    ``lemmatizer``.

    Args:
        text: Raw text. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents a missing cell)
            are treated as empty input.

    Returns:
        The cleaned text, or ``''`` when the input is missing or no token
        survives the cleaning.
    """
    # Without this guard str() would turn a missing value into the literal
    # token "none" / "nan", which would then be fed to the model as a word.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Drop http(s):// links and bare www. links up to the next whitespace.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Drop everything that is neither a word character nor whitespace.
    # Note: \w includes the underscore, so underscores are kept.
    text = re.sub(r'[^\w\s]', '', text)

    tokens = text.split()
    # The stop-word check runs on the raw token, before lemmatization.
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(clean_tokens)

# Build the model input as "<title> <body>" and clean it.
# Missing values are replaced by '' first: NaN + " " + text evaluates to NaN,
# so a row with only one of the two fields present would otherwise lose the
# field that is present.
# NOTE: this overwrites df['text']; re-running the cell on the same frame
# prepends the title a second time, so reload the data before re-running.
df['text'] = df['title'].fillna('') + " " + df['text'].fillna('')
df['text'] = df['text'].apply(clean_text)


In [12]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encoding options: add the special tokens, truncate/pad every document to
# exactly 128 tokens, and return TensorFlow tensors including the attention
# mask.
encoding_options = dict(
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='tf',
)

# Encode the whole cleaned corpus in one call.
X = tokenizer(text=df['text'].tolist(), **encoding_options)

# Class labels as a NumPy array, in the same row order as X.
y = df['label'].to_numpy()


In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

# Convert the tokenizer's TensorFlow tensors to NumPy arrays for scikit-learn.
input_ids = X['input_ids'].numpy()
attention_mask = X['attention_mask'].numpy()

# Split ids, masks and labels in ONE call so all three are partitioned with
# the same shuffled indices. Two separate calls only stay aligned as long as
# their seed and test_size are kept identical by hand.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(input_ids, attention_mask, y, random_state=42, test_size=0.2)

# Sanity check: every partition must hold the same number of rows.
assert len(train_inputs) == len(train_masks) == len(train_labels)
assert len(test_inputs) == len(test_masks) == len(test_labels)

# Shape of the held-out split (rows, sequence length).
test_inputs.shape

(8980, 128)

In [14]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification

# BERT encoder with a 2-class classification head, loaded from the PyTorch
# checkpoint (from_pt=True).
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    from_pt=True,
)

# Training configuration. The loss is built with from_logits=True, and the
# accuracy metric is reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Model inputs are passed as dicts keyed by the tokenizer's output names.
train_features = dict(input_ids=train_inputs, attention_mask=train_masks)
eval_features = dict(input_ids=test_inputs, attention_mask=test_masks)

# One epoch of fine-tuning; the held-out split doubles as validation data.
history = model.fit(
    train_features,
    train_labels,
    validation_data=(eval_features, test_labels),
    batch_size=32,
    epochs=1,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




In [15]:
# Write the trained model and then its tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./saved_model')

print("saved_model")

saved_model
