In [None]:
import os
import time
import psutil
import torch
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
    matthews_corrcoef,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    auc,
)
from datasets import Dataset as HFDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import unicodedata
import gc

# Fetch the NLTK corpora that the text preprocessing step relies on
# (WordNet for lemmatization, the stop-word lists for filtering).
for nltk_resource in ("wordnet", "stopwords"):
    nltk.download(nltk_resource)

# Configuration Class
class Config:
    """Run settings for the fake-news classification experiment.

    All settings are plain instance attributes initialised to the defaults
    below. Any of them can be replaced at construction time by keyword, e.g.
    ``Config(num_train_epochs=2, num_fake_samples=-1)``; ``Config()`` with no
    arguments yields exactly the defaults.

    Raises:
        TypeError: if a keyword does not name an existing setting (guards
            against silently ignored typos).
    """

    def __init__(self, **overrides):
        # --- Dataset location and schema ---
        self.dataset_path = "/notebooks/FakeNewsNet/dataset/"
        self.fake_news_file = "politifact_fake.csv"
        self.real_news_file = "politifact_real.csv"
        self.text_column = "title"  # column holding the text to classify
        self.label_column = "label"  # column created on load: 0 = fake, 1 = real

        # --- Experiment tracking ---
        # Not referenced in the visible part of this file.
        self.hyperparameter_optimization_enabled = False
        # A falsy value disables Weights & Biases reporting in main().
        # NOTE(review): the project is named "GossipCop" while the CSV files
        # above are the PolitiFact ones -- confirm this is intentional.
        self.wandb_project_name = "GossipCop"

        # --- Model and optimisation ---
        self.model_name = "FacebookAI/roberta-base"
        self.num_trials = 10  # not referenced in the visible part of this file
        self.learning_rate = 2e-5
        self.num_train_epochs = 6
        self.per_device_train_batch_size = 8  # main() reuses it for evaluation
        self.tokenizer_max_length = 512  # truncation length given to the tokenizer

        # --- Data selection ---
        self.min_word_count = 6  # not referenced in the visible part of this file
        self.balance_ratio = 0.8  # not referenced in the visible part of this file
        self.diagnostics_enabled = True  # print sample rows while preparing data
        self.num_fake_samples = 10  # rows sampled from the fake file; -1 keeps all
        self.num_real_samples = 10  # rows sampled from the real file; -1 keeps all

        # Number of leading encoder layers to freeze; 0 trains every layer.
        self.freeze_bert_layers = 0

        # Apply caller overrides last, accepting only names defined above.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(
                f"Unknown configuration option(s): {', '.join(unknown)}"
            )
        for name, value in overrides.items():
            setattr(self, name, value)

# Single shared settings object read by the functions below.
config = Config()

# Pick the compute device: the GPU when CUDA reports one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Preprocessing functions
# Lazily-built NLTK helpers shared by every preprocess_text call, so the
# stop-word list is not re-read (and the stemmer/lemmatizer not re-created)
# for each row of the dataset.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, stemmer, lemmatizer)`` triple."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first so a failure never leaves a half-filled cache.
        resources = {
            "stop_words": frozenset(stopwords.words("english")),
            "stemmer": PorterStemmer(),
            "lemmatizer": WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(resources)
    return (
        _PREPROCESS_RESOURCES["stop_words"],
        _PREPROCESS_RESOURCES["stemmer"],
        _PREPROCESS_RESOURCES["lemmatizer"],
    )


def preprocess_text(text, config):
    """Normalise one piece of text for classification.

    The text is lower-cased, split on whitespace, stripped of English stop
    words, and each remaining word is Porter-stemmed and then
    WordNet-lemmatized. The words are re-joined with single spaces.

    Args:
        text: The raw text. Missing values (``None``/NaN) yield ``""``;
            other non-string scalars are converted with ``str()``.
        config: Unused; kept so existing callers keep working.

    Returns:
        The normalised text, possibly empty.
    """
    # Check for missing values before touching NLTK so they stay cheap.
    if pd.isna(text):
        return ""

    stop_words, stemmer, lemmatizer = _get_preprocess_resources()

    # Words are already lower-case after this line, so no further .lower()
    # calls are needed further down the pipeline.
    tokens = str(text).lower().split()

    return " ".join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    )

# Initialize GPT-2
def initialize_gpt2():
    """Load the pretrained "gpt2" tokenizer and LM-head model.

    The model is moved to the module-level `device` before being returned.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2")
    gpt2_lm = GPT2LMHeadModel.from_pretrained("gpt2")
    gpt2_lm = gpt2_lm.to(device)
    return gpt2_tok, gpt2_lm

# Load GPT-2 once when this cell/module runs; generate_synthetic_text reads
# these two module-level names directly.
gpt2_tokenizer, gpt2_model = initialize_gpt2()

def generate_synthetic_text(text, num_samples=1, max_length=50):
    """Sample GPT-2 continuations of `text`.

    Uses the module-level `gpt2_tokenizer` / `gpt2_model`. Each returned
    string is the decoded full sequence, i.e. the prompt followed by the
    sampled continuation.

    Args:
        text: Prompt to continue. Blank or non-string prompts produce no
            samples instead of failing inside `generate`.
        num_samples: Number of continuations to sample.
        max_length: Upper bound on prompt + continuation length, in tokens.

    Returns:
        A list of up to `num_samples` generated strings; empty when there is
        nothing to generate from.
    """
    if num_samples < 1 or not isinstance(text, str) or not text.strip():
        return []

    encoded = gpt2_tokenizer(text, return_tensors='pt')
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # `max_length` counts the prompt too. A prompt that already fills the
    # budget leaves no room for new tokens, so keep only its leading half.
    if input_ids.shape[-1] >= max_length:
        prompt_budget = max(1, max_length // 2)
        input_ids = input_ids[:, :prompt_budget]
        attention_mask = attention_mask[:, :prompt_budget]

    # One batched call samples all continuations at once rather than running
    # the model `num_samples` separate times.
    outputs = gpt2_model.generate(
        input_ids.to(device),
        attention_mask=attention_mask.to(device),
        max_length=max_length,
        num_return_sequences=num_samples,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        do_sample=True,
        # GPT-2 has no pad token; reuse EOS so shorter samples can be padded.
        pad_token_id=gpt2_tokenizer.eos_token_id
    )

    return [
        gpt2_tokenizer.decode(sequence, skip_special_tokens=True)
        for sequence in outputs
    ]

def add_synthetic_data(data, text_column, label_column, config):
    """Append GPT-2 generated variants of every row to `data`.

    For each row with non-blank string text, `generate_synthetic_text` is
    called and every non-blank result is added as a new row carrying the
    source row's label.

    NOTE(review): each synthetic text starts with its source text, and
    tokenize_data splits train/test only after this augmentation, so a test
    row's variant can land in the training split (and vice versa). Consider
    augmenting the training split only.

    Args:
        data: DataFrame containing `text_column` and `label_column`.
        text_column: Name of the text column.
        label_column: Name of the label column.
        config: Object whose `diagnostics_enabled` flag turns on printing of
            each original/synthetic pair.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the original rows followed
        by the synthetic rows.
    """
    synthetic_rows = []

    # Iterate only the two columns that are needed instead of building a
    # Series per row with iterrows().
    for text, label in zip(data[text_column], data[label_column]):
        # Missing (NaN/None) or blank text cannot be used as a prompt.
        if not isinstance(text, str) or not text.strip():
            continue

        for synthetic_text in generate_synthetic_text(text):
            if not synthetic_text or not synthetic_text.strip():
                continue
            synthetic_rows.append({text_column: synthetic_text, label_column: label})
            if config.diagnostics_enabled:
                print(f"Original Text: {text} | Original Label: {label}")
                print(f"Synthetic Text: {synthetic_text} | Label: {label}")

    if not synthetic_rows:
        # Nothing generated: skip concatenating an empty frame, which can
        # disturb column dtypes, but still renumber the index like concat does.
        return data.reset_index(drop=True)

    synthetic_df = pd.DataFrame(synthetic_rows)
    return pd.concat([data, synthetic_df], ignore_index=True)

def _read_labelled_news(csv_path, label, sample_size, label_column, seed):
    """Read one news CSV, optionally down-sample it, and attach its class label."""
    frame = pd.read_csv(csv_path)
    if sample_size != -1:
        # Cap at the number of available rows: DataFrame.sample raises when
        # asked for more rows than exist (without replacement).
        frame = frame.sample(n=min(sample_size, len(frame)), random_state=seed)
    # assign() returns a new frame, so no chained-assignment warning is raised.
    return frame.assign(**{label_column: label})


def load_and_preprocess_data(config):
    """Load the fake/real news CSVs and return a cleaned, augmented DataFrame.

    Steps: read both files from `config.dataset_path`, optionally sample
    `config.num_fake_samples` / `config.num_real_samples` rows (-1 keeps every
    row), label fake rows 0 and real rows 1, keep only the label and text
    columns, drop rows with missing text, run `preprocess_text` on the text,
    drop rows whose text became empty, and finally append synthetic rows via
    `add_synthetic_data`.

    Args:
        config: Settings object providing the paths, column names and sample
            sizes described above.

    Returns:
        DataFrame with `config.label_column` and `config.text_column`.
    """
    # Fixed seed so repeated runs draw the same subset; it matches the seed
    # tokenize_data uses for the train/test split.
    sampling_seed = 42

    print("Loading datasets...")
    fake_news = _read_labelled_news(
        os.path.join(config.dataset_path, config.fake_news_file),
        0,  # 0 for fake
        config.num_fake_samples,
        config.label_column,
        sampling_seed,
    )
    real_news = _read_labelled_news(
        os.path.join(config.dataset_path, config.real_news_file),
        1,  # 1 for real
        config.num_real_samples,
        config.label_column,
        sampling_seed,
    )
    data = pd.concat([fake_news, real_news], ignore_index=True)

    # Work on an explicit copy so the column assignment below never targets a
    # view of the concatenated frame.
    data = (
        data[[config.label_column, config.text_column]]
        .dropna(subset=[config.text_column])
        .copy()
    )

    print("Preprocessing text data...")
    data[config.text_column] = data[config.text_column].apply(
        lambda x: preprocess_text(x, config)
    )

    # Stop-word removal can leave nothing behind; such rows carry no signal.
    data = data[data[config.text_column].str.strip() != ""]

    # Generate and add synthetic data
    print("Adding synthetic data...")
    data = add_synthetic_data(data, config.text_column, config.label_column, config)

    print("Data loading and preprocessing complete.")
    return data

def _print_tokenized_samples(split_name, split, data, text_column, label_column, limit=10):
    """Print up to `limit` tokenized rows of `split` next to their source rows in `data`."""
    num_samples_to_check = min(limit, len(split))
    print(f"\n--- Sample Tokenized {split_name} Data ({num_samples_to_check} samples) ---")
    for i in range(num_samples_to_check):
        sample = split[i]
        # Sanity checks that tokenization produced the expected fields.
        assert 'input_ids' in sample, f"input_ids not found in sample {i}"
        assert 'attention_mask' in sample, f"attention_mask not found in sample {i}"
        assert label_column in sample, f"label not found in sample {i}"

        source_row = data.iloc[sample['original_index']]
        print(f"Sample {i+1}:")
        print("Title:", source_row[text_column])
        print("Tokenized Input IDs:", sample['input_ids'])
        print("Tokenized Attention Mask:", sample['attention_mask'])
        print("Original Label:", source_row[label_column])
        print("Label:", sample[label_column])
        print("--------------------")


def tokenize_data(tokenizer, data, text_column, label_column, max_length, diagnostics_enabled):
    """Tokenize `data` and split it 80/20 into train and test datasets.

    Rows are truncated to `max_length` tokens but not padded here: padding is
    left to the batch collator (main() uses `DataCollatorWithPadding`), so
    short texts are not inflated to `max_length` tokens each.

    Args:
        tokenizer: Hugging Face tokenizer, called on batches of text.
        data: DataFrame containing `text_column` and `label_column`.
        text_column: Name of the text column.
        label_column: Name of the label column.
        max_length: Maximum number of tokens kept per row.
        diagnostics_enabled: When true, print up to 10 rows of each split.

    Returns:
        ``(train_dataset, test_dataset)``; `test_dataset` is ``None`` when
        there are too few rows to split.
    """
    print("Tokenizing data...")

    # Record each row's *position* so the diagnostics can look the source row
    # up with .iloc. Storing the old index labels instead would address the
    # wrong rows whenever the incoming index is not already 0..n-1.
    data = data.reset_index(drop=True)
    data.insert(0, 'original_index', range(len(data)))

    def tokenize_function(examples):
        return tokenizer(
            examples[text_column],
            truncation=True,
            max_length=max_length
        )

    # Convert the DataFrame to a Hugging Face Dataset
    dataset = HFDataset.from_pandas(data)

    # Tokenize the dataset
    dataset = dataset.map(tokenize_function, batched=True)

    # Only split if we have enough samples
    if len(dataset) > 1:
        train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = train_test_split['train']
        test_dataset = train_test_split['test']
    else:
        train_dataset = dataset
        test_dataset = None

    if diagnostics_enabled:
        _print_tokenized_samples("Training", train_dataset, data, text_column, label_column)
        if test_dataset is not None:
            _print_tokenized_samples("Testing", test_dataset, data, text_column, label_column)

    print("Tokenization complete.")
    return train_dataset, test_dataset

def compute_metrics(eval_pred):
    """Compute binary-classification metrics from two-class logits.

    Args:
        eval_pred: Either a ``(logits, labels)`` tuple or an object exposing
            `predictions` and `label_ids`. `logits` must support
            ``argmax(axis=-1)`` and 2-D column slicing, with column 1 being
            the positive class.

    Returns:
        Dict with "accuracy", "f1", "precision", "recall", "auc" and "mcc".
        "auc" is NaN when it cannot be computed (e.g. only one class present
        in `labels`).
    """
    if isinstance(eval_pred, tuple):
        logits, labels = eval_pred
    else:
        logits = eval_pred.predictions
        labels = eval_pred.label_ids

    preds = logits.argmax(axis=-1)

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    mcc = matthews_corrcoef(labels, preds)

    # Rank by the logit margin, not by the class-1 logit alone: the softmax
    # probability of class 1 is a monotonic function of (logit1 - logit0), so
    # only the margin orders the samples the way the classifier does.
    positive_scores = logits[:, 1] - logits[:, 0]
    try:
        auc_score = roc_auc_score(labels, positive_scores)
    except ValueError:
        auc_score = float('nan')

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "auc": auc_score,
        "mcc": mcc,
    }

class FinalMetricsCallback(TrainerCallback):
    """Trainer callback that prints `compute_metrics` results after each evaluation.

    Args:
        trainer: The trainer used to run predictions.
        eval_dataset: Dataset to predict on; when ``None`` the callback does
            nothing.
    """

    def __init__(self, trainer, eval_dataset):
        self.trainer = trainer
        self.eval_dataset = eval_dataset

    def on_evaluate(self, args, state, control, **kwargs):
        """Predict on the stored dataset and print each metric to 4 decimals."""
        # tokenize_data returns None for the test split when the data is too
        # small to split; there is nothing to report in that case.
        if self.eval_dataset is None:
            return

        predictions = self.trainer.predict(self.eval_dataset)

        # Prefer the labels returned with the predictions so they are aligned
        # row-for-row; fall back to reading the dataset column directly.
        labels = predictions.label_ids
        if labels is None:
            labels = self.eval_dataset[config.label_column]

        metrics = compute_metrics((predictions.predictions, labels))
        print("\nFinal Metrics after evaluation:")
        for key, value in metrics.items():
            print(f"{key.capitalize()}: {value:.4f}")

def _plot_history_panel(ax, epochs, series, title, ylabel):
    """Draw the non-empty curves in `series` on `ax`, or a placeholder when none exist."""
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)

    available = [(values, fmt, label) for values, fmt, label in series if values]
    if not available:
        ax.text(0.5, 0.5, "No data logged", ha="center", va="center", transform=ax.transAxes)
        return

    for values, fmt, label in available:
        ax.plot(epochs, values, fmt, label=label)
    ax.set_xticks(epochs)
    ax.legend()

    plotted = [value for values, _, _ in available for value in values]
    ax.set_ylim(bottom=min(plotted) - 0.01, top=max(plotted) + 0.01)


def plot_metrics(trainer_state, predictions, labels):
    """Show a 2x2 figure: loss curves, accuracy curves, ROC curve, confusion matrix.

    The history panels draw whichever of 'loss', 'eval_loss', 'accuracy' and
    'eval_accuracy' appear in `trainer_state.log_history`. A missing series
    (nothing visible in this file logs an 'accuracy' key, for instance) no
    longer suppresses the whole figure; the ROC curve and confusion matrix
    are always drawn from `predictions` and `labels`.

    Args:
        trainer_state: Object with a `log_history` list of metric dicts.
        predictions: 2-D array of two-class logits, column 1 = positive class.
        labels: True labels (0 = fake, 1 = real), one per prediction row.
    """
    history = trainer_state.log_history
    train_loss = [x['loss'] for x in history if 'loss' in x]
    eval_loss = [x['eval_loss'] for x in history if 'eval_loss' in x]
    eval_accuracy = [x['eval_accuracy'] for x in history if 'eval_accuracy' in x]
    train_accuracy = [x['accuracy'] for x in history if 'accuracy' in x]

    # Print debug information
    print(f"Train Loss: {train_loss}")
    print(f"Eval Loss: {eval_loss}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Eval Accuracy: {eval_accuracy}")

    # Truncate every logged series to the shortest non-empty one so the curves
    # share an x-axis; empty series are skipped instead of forcing length 0.
    logged = [s for s in (train_loss, eval_loss, eval_accuracy, train_accuracy) if s]
    min_length = min(len(s) for s in logged) if logged else 0
    if min_length == 0:
        print("No sufficient data for plotting.")
    epochs = range(1, min_length + 1)

    fig, axs = plt.subplots(2, 2, figsize=(14, 10))

    _plot_history_panel(
        axs[0, 0],
        epochs,
        [
            (train_loss[:min_length], 'bo-', 'Training loss'),
            (eval_loss[:min_length], 'ro-', 'Validation loss'),
        ],
        'Training and Validation Loss',
        'Loss',
    )
    _plot_history_panel(
        axs[0, 1],
        epochs,
        [
            (train_accuracy[:min_length], 'bo-', 'Training accuracy'),
            (eval_accuracy[:min_length], 'ro-', 'Validation accuracy'),
        ],
        'Training and Validation Accuracy',
        'Accuracy',
    )

    axs[1, 0].set_xlim([0.0, 1.0])
    axs[1, 0].set_ylim([0.0, 1.05])
    axs[1, 0].set_xlabel('False Positive Rate')
    axs[1, 0].set_ylabel('True Positive Rate')
    axs[1, 0].set_title('Receiver Operating Characteristic (ROC)')
    if len(set(labels)) > 1:
        # The logit margin orders samples exactly like the softmax probability
        # of class 1; the class-1 logit alone does not.
        positive_scores = predictions[:, 1] - predictions[:, 0]
        fpr, tpr, _ = roc_curve(labels, positive_scores)
        roc_auc = auc(fpr, tpr)
        axs[1, 0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        axs[1, 0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axs[1, 0].legend(loc="lower right")
    else:
        # A ROC curve is undefined when only one class is present.
        axs[1, 0].text(0.5, 0.5, "ROC needs both classes", ha="center", va="center")

    # Fix the label set so the matrix stays 2x2 (matching the tick labels)
    # even when one class is absent from both labels and predictions.
    conf_matrix = confusion_matrix(labels, predictions.argmax(axis=-1), labels=[0, 1])
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Fake", "Real"],
        yticklabels=["Fake", "Real"],
        cbar=False,
        ax=axs[1, 1]
    )
    axs[1, 1].set_xlabel("Predicted")
    axs[1, 1].set_ylabel("True")
    axs[1, 1].set_title("Confusion Matrix - Final Model")

    plt.tight_layout()
    plt.show()


def freeze_layers(model, num_layers_to_freeze):
    """Freezes the first `num_layers_to_freeze` layers of the model.

    The encoder layer container is located through the first backbone
    attribute the model has, checked in this order: `bert`, `roberta`,
    `distilbert`, `albert`, `electra`. Frozen layers get
    ``requires_grad = False`` on all of their parameters.

    The request is validated before any parameter is touched, so a failed
    call leaves the model unchanged.

    Args:
        model: Model exposing one of the supported backbone attributes.
        num_layers_to_freeze: Number of leading layers to freeze; values
            <= 0 freeze nothing.

    Raises:
        ValueError: if the model has none of the supported backbones, or has
            fewer than `num_layers_to_freeze` layers.
    """
    # (backbone attribute, dotted path from the backbone to its layer list)
    layer_paths = (
        ("bert", "encoder.layer"),
        ("roberta", "encoder.layer"),
        ("distilbert", "transformer.layer"),
        ("albert", "encoder.albert_layer_groups"),
        ("electra", "encoder.layer"),
    )

    for backbone_name, path in layer_paths:
        if hasattr(model, backbone_name):
            encoder_layers = getattr(model, backbone_name)
            for attribute in path.split("."):
                encoder_layers = getattr(encoder_layers, attribute)
            break
    else:
        raise ValueError("Model type not supported for freezing layers.")

    if num_layers_to_freeze <= 0:
        return

    # Validate up front: checking inside the freezing loop would freeze every
    # available layer and only then raise, leaving the model half-modified.
    if not hasattr(encoder_layers, '__len__') or num_layers_to_freeze > len(encoder_layers):
        raise ValueError(f"Model does not have {num_layers_to_freeze} layers to freeze.")

    for i in range(num_layers_to_freeze):
        for param in encoder_layers[i].parameters():
            param.requires_grad = False

def main():
    """Run the whole experiment: load data, fine-tune, evaluate, report, plot.

    Reads every setting from the module-level `config`. When
    `config.wandb_project_name` is set, the run is reported to Weights &
    Biases and the W&B run is closed even if training fails.

    Raises:
        ValueError: if the data is too small to produce a test split.
    """
    print("Starting main execution...")
    data = load_and_preprocess_data(config)

    tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    # Some tokenizers ship without a pad token; reuse EOS so batches can be padded.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_dataset, test_dataset = tokenize_data(tokenizer, data, config.text_column, config.label_column, config.tokenizer_max_length, config.diagnostics_enabled)

    # tokenize_data returns None for the test split when it cannot split;
    # fail here with a clear message instead of deep inside the Trainer.
    if test_dataset is None:
        raise ValueError(
            "Not enough samples to create a train/test split; "
            "increase num_fake_samples / num_real_samples."
        )

    use_wandb = bool(config.wandb_project_name)
    if use_wandb:
        import wandb
        wandb.init(project=config.wandb_project_name, config={
            "learning_rate": config.learning_rate,
            "epochs": config.num_train_epochs,
            "batch_size": config.per_device_train_batch_size,
            "model_name": config.model_name
        })

    trainer = model = predictions = None
    try:
        # NOTE(review): some newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` -- confirm against the
        # installed version before upgrading.
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            per_device_eval_batch_size=config.per_device_train_batch_size,
            learning_rate=config.learning_rate,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            logging_dir="./logs",
            logging_steps=10,
            load_best_model_at_end=True,
            weight_decay=0.1,
            report_to="wandb" if use_wandb else []
        )

        print("Loading the final model for training...")
        model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2).to(device)

        if config.freeze_bert_layers > 0:
            print(f"Freezing the first {config.freeze_bert_layers} layers of the model.")
            freeze_layers(model, config.freeze_bert_layers)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
            data_collator=DataCollatorWithPadding(tokenizer)
        )

        trainer.add_callback(FinalMetricsCallback(trainer, test_dataset))

        start_time = time.time()
        print("Training the final model...")
        trainer.train()
        end_time = time.time()
        process = psutil.Process(os.getpid())
        memory_usage = process.memory_info().rss / (1024 ** 3)

        print(f"Training time: {(end_time - start_time) / 60:.2f} minutes")
        print(f"Memory usage: {memory_usage:.2f} GB")

        # Training has finished, so only the final model can be scored here.
        # Score it once: repeating this per "epoch" would print the same
        # numbers num_train_epochs times and add duplicate evaluation entries
        # to the trainer's log history.
        train_output = trainer.predict(train_dataset)
        train_metrics = compute_metrics((train_output.predictions, train_output.label_ids))
        eval_output = trainer.evaluate()
        print(f"Final model - Training Accuracy: {train_metrics['accuracy']:.4f}, Validation Accuracy: {eval_output['eval_accuracy']:.4f}")

        predictions = trainer.predict(test_dataset)
        preds = predictions.predictions.argmax(-1)
        labels = test_dataset[config.label_column]

        metrics = compute_metrics((predictions.predictions, labels))

        print("\nFinal Evaluation Metrics:")
        for key, value in metrics.items():
            print(f"{key.capitalize()}: {value:.4f}")

        report = classification_report(labels, preds, target_names=["Fake", "Real"])
        print("\nClassification Report:\n", report)

        plot_metrics(trainer.state, predictions.predictions, labels)
    finally:
        # Close the W&B run and release memory whether or not training succeeded.
        if use_wandb:
            wandb.finish()

        del trainer, model, predictions
        gc.collect()
        torch.cuda.empty_cache()

    print("Main execution complete.")

# Run the full pipeline only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fa925b74d10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fa921394cd0, raw_cell="import os
import time
import psutil
import torch
i.." store_history=True silent=False shell_futures=True cell_id=c7391487-7c70-42cb-8dbf-a8e11c991009>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Using device: cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting main execution...
Loading datasets...
Preprocessing text data...
Adding synthetic data...
Original Text: breaking: 3 liber celebr arrest conspiraci assassin presid trump | Original Label: 0
Synthetic Text: breaking: 3 liber celebr arrest conspiraci assassin presid trump

"It is a shame that a few of the men involved in this attack were not prosecuted for their role in the attack," he said.
 (Reporting by Robert Birnbaum | Label: 0
Original Text: michel obama receiv life-shatt news doctor | Original Label: 0
Synthetic Text: michel obama receiv life-shatt news doctor

New Delhi: A woman was killed after her body was found in a road in Jammu and Kashmir city on Saturday, police said.



The body of Atef | Label: 0
Original Text: breaking: barcelona terrorist cousin u name barack | Original Label: 0
Synthetic Text: breaking: barcelona terrorist cousin u name barack

U.S. senator and New Jersey Gov. Chris Christie, the Republican presidential nominee, speaks at a news conference in