In [2]:
from transformers import DataCollatorWithPadding, RobertaConfig,  DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, BertForSequenceClassification, BertTokenizer, AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

In [None]:
config = RobertaConfig.from_pretrained('roberta-base',
    num_labels=2,
    problem_type='single_label_classification',
    classifier_dropout=0.1,  # Add dropout to prevent overfitting
    hidden_dropout_prob=0.1,  # Add dropout to transformer layers
    attention_probs_dropout_prob=0.1  # Add dropout to attention
)

In [None]:
roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2, problem_type='single_label_classification')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', num_labels=2)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, problem_type='single_label_classification')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
albert = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2, problem_type='single_label_classification')
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', num_labels=2)

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [4]:
df = pd.read_csv('datasets/WELFake_Dataset_cleaned.csv', encoding='utf-8', on_bad_lines='skip', engine='python')

In [5]:
df = pd.DataFrame(df)

In [6]:
print(df.head())

                                                text label
0  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...     1
1  Bobby Jindal, raised Hindu, uses story of Chri...     0
2  SATAN 2: Russia unvelis an image of its terrif...     1
3  About Time! Christian Group Sues Amazon and SP...     1
4  DR BEN CARSON TARGETED BY THE IRS: “I never ha...     1


In [7]:
print(df['label'].dtype)

object


In [12]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 41794 entries, 0 to 41809
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    41794 non-null  object
 1   label   41794 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 979.5+ KB
None


In [9]:
print(df['label'].nunique())

11


In [None]:
data_collator = DataCollatorWithPadding(
    tokenizer=roberta_tokenizer,
    padding=True,
    max_length=512
)

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
train_df, test_df = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)

In [None]:
#roberta tokenize function
def roberta_tokenize_function(text):
    # Ensure text is a string
    if not isinstance(text, str):
        text = str(text)
    
    # Tokenize the text
    tokenized = roberta_tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors=None
    )
    
    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask']
    }

In [None]:
#bert tokenize function
def bert_tokenize_function(examples):
    return bert_tokenizer(examples, padding="max_length", truncation=True, max_length=128)

In [None]:
#albert tokenize function
def albert_tokenize_function(examples):
    return albert_tokenizer(examples, padding="max_length", truncation=True, max_length=128)

In [None]:
train_df[['input_ids', 'attention_mask']] = train_df['text'].apply(
    lambda x: pd.Series(roberta_tokenize_function(x))
)

test_df[['input_ids', 'attention_mask']] = test_df['text'].apply(
    lambda x: pd.Series(roberta_tokenize_function(x))
)

In [None]:
train_dataset_roberta = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'label']])
test_dataset_roberta = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'label']])

In [None]:
training_args = TrainingArguments(
    output_dir='./results-roberta',
    learning_rate=2e-5,  
    num_train_epochs=5,  
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=16,
    warmup_steps=500, 
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    fp16=True,
    metric_for_best_model="accuracy",
    report_to='none'
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=roberta,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start fine-tuning
trainer.train()

trainer.save_model('./fake-news-roberta')

Step,Training Loss
100,0.0092
200,0.0085
300,0.003
400,0.0028
500,0.0018
600,0.0007
700,0.001
800,0.0003


In [None]:
test_preds = trainer.predict(test_dataset_roberta)
test_labels = test_preds.predictions.argmax(-1)
true_labels = test_dataset_roberta['label']
accuracy = accuracy_score(true_labels, test_labels)

In [None]:
print(accuracy)

0.9989516354487


In [None]:
train_df, test_df = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)

In [None]:
train_df[['input_ids', 'attention_mask']] = train_df['text'].apply(
    lambda x: pd.Series(bert_tokenize_function(x), index=['input_ids', 'attention_mask'])
)

test_df[['input_ids', 'attention_mask']] = test_df['text'].apply(
    lambda x: pd.Series(bert_tokenize_function(x), index=['input_ids', 'attention_mask'])
)

In [None]:
train_dataset_bert = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'label']])
test_dataset_bert = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'label']])

In [None]:
training_args = TrainingArguments(
    output_dir='./results-bert',
    run_name='fndcu',
    learning_rate=5e-5,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=32,
    fp16=True,
    logging_steps=100,
    report_to='none',
    save_strategy='no'
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=bert,
    args=training_args,
    train_dataset=train_dataset_bert,
    eval_dataset=test_dataset_bert
)

# Start fine-tuning
trainer.train()

trainer.save_model('./models/fake-news-bert')

Step,Training Loss
100,0.0464
200,0.0282
300,0.0155
400,0.0091
500,0.005
600,0.003
700,0.0015
800,0.0004


In [None]:
test_preds = trainer.predict(test_dataset_bert)
test_labels = test_preds.predictions.argmax(-1)
true_labels = test_dataset_bert['label']
accuracy = accuracy_score(true_labels, test_labels)

TypeError: can only concatenate str (not "float") to str

In [None]:
print(accuracy)

0.9939893765725468


In [None]:
train_df, test_df = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)

In [None]:
train_df[['input_ids', 'attention_mask']] = train_df['text'].apply(
    lambda x: pd.Series(albert_tokenize_function(x), index=['input_ids', 'attention_mask'])
)

test_df[['input_ids', 'attention_mask']] = test_df['text'].apply(
    lambda x: pd.Series(albert_tokenize_function(x), index=['input_ids', 'attention_mask'])
)

In [None]:
train_dataset_albert = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'label']])
test_dataset_albert = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'label']])

In [None]:
training_args = TrainingArguments(
    output_dir='./results-albert',
    run_name='fndcu',
    learning_rate=5e-5,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=32,
    fp16=True,
    logging_steps=100,
    report_to='none',
    save_strategy='no'
)

In [None]:
trainer = Trainer(
    model=albert,
    args=training_args,
    train_dataset=train_dataset_albert,
    eval_dataset=test_dataset_albert
)

trainer.train()

trainer.save_model('./models/fake-news-albert')

Step,Training Loss
100,0.104
200,0.0335


Step,Training Loss
100,0.104
200,0.0335
300,0.0207
400,0.0164
500,0.0083
600,0.0057
700,0.0036
800,0.0014


In [None]:
test_preds = trainer.predict(test_dataset_albert)
test_labels = test_preds.predictions.argmax(-1)
true_labels = test_dataset_albert['label']
accuracy = accuracy_score(true_labels, test_labels)

In [None]:
print(accuracy)

0.9942689404528935
