In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load training dataset
df = pd.read_csv('train.csv', sep=';', on_bad_lines='skip', engine='python').head(10000)
df['combined_text'] = df['title'] + ' ' + df['text']  # Combine title and text for input

# Parameters
max_words = 5000  # Vocabulary size (top 5000 words)
max_len = 300     # Maximum sequence length
embedding_dim = 100  # Embedding vector size
batch_size = 32
epochs = 10

# Tokenize text (using Keras Tokenizer for consistency)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['combined_text'])
sequences = tokenizer.texts_to_sequences(df['combined_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Prepare data
X = padded_sequences
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom Dataset class
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoaders
train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define CNN model
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_len, num_filters=128, kernel_size=5):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=kernel_size)
        self.relu = nn.ReLU()
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)  # Global max pooling
        self.dense1 = nn.Linear(num_filters, 64)
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)  # [batch_size, max_len, embedding_dim]
        x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, max_len] for Conv1d
        x = self.conv1d(x)  # [batch_size, num_filters, max_len - kernel_size + 1]
        x = self.relu(x)
        x = self.global_max_pool(x).squeeze(-1)  # [batch_size, num_filters]
        x = self.dense1(x)  # [batch_size, 64]
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)  # [batch_size, 1]
        x = self.sigmoid(x)  # [batch_size, 1]
        return x

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextCNN(vocab_size=max_words, embedding_dim=embedding_dim, max_len=max_len).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

# Early stopping parameters
patience = 2
best_val_loss = float('inf')
patience_counter = 0
best_model_state = None

# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
        val_loss /= len(test_loader.dataset)

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state = model.state_dict()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# Evaluate model
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = (outputs > 0.5).float().cpu().numpy()
        y_pred.extend(preds.flatten())
        y_true.extend(labels.cpu().numpy())

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=['Fake', 'Real']))

# Save model (optional)
torch.save(model.state_dict(), 'cnn_fake_news_model.pth')

Epoch 1/10, Train Loss: 0.2559, Val Loss: 0.0695
Epoch 2/10, Train Loss: 0.0577, Val Loss: 0.0635
Epoch 3/10, Train Loss: 0.0278, Val Loss: 0.0672
Epoch 4/10, Train Loss: 0.0105, Val Loss: 0.0768
Early stopping triggered
Classification Report:
              precision    recall  f1-score   support

        Fake       0.96      0.99      0.98       878
        Real       0.99      0.97      0.98      1122

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000



In [5]:
from google.colab import files
files.download('cnn_fake_news_model.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>