# Imports

In [27]:
from datasets import DatasetDict, Dataset,load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments, Trainer,logging
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
import os
import torch
import torch.nn.functional as F
import pandas as pd
os.environ["WANDB_DISABLED"] = "true"

In [28]:
logging.enable_progress_bar()  
logging.set_verbosity_info()

# Preprocessing

In [29]:
df = pd.read_csv('../../data/new_data/processed/ready_data.csv')
df['target'] = df['target'].astype(int)
df['text']=df['statement']+" "+df['tweet']
df = df.drop(['statement','tweet'],axis=1)
df.head()

Unnamed: 0,target,text
0,1,End of eviction moratorium means millions of A...
1,1,End of eviction moratorium means millions of A...
2,1,End of eviction moratorium means millions of A...
3,1,End of eviction moratorium means millions of A...
4,1,End of eviction moratorium means millions of A...


In [30]:
train_df, tmp_df = train_test_split(
    df, test_size=0.20, stratify=df['target'], random_state=4)
val_df, test_df  = train_test_split(
    tmp_df, test_size=0.50, stratify=tmp_df['target'], random_state=42)
original_test_labels = test_df["target"]

In [31]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [32]:
model_path = "google-bert/bert-base-uncased"

tokenizer= AutoTokenizer.from_pretrained(model_path)

id2label = {0:"Fake", 1:"Real"}
label2id = {"Fake": 0, "Real":1}
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=2,
                                                          id2label = id2label,
                                                          label2id = label2id)

loading configuration file config.json from cache at C:\Users\ignat\.cache\huggingface\hub\models--google-bert--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\ignat\.cache\huggingface\hub\models--google-bert--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt
loading fil

In [33]:
def preprocess_function(data):
    return tokenizer(data["text"])
train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_tokenized = val_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/107354 [00:00<?, ? examples/s]

Map: 100%|██████████| 107354/107354 [00:14<00:00, 7341.29 examples/s]
Map: 100%|██████████| 13419/13419 [00:01<00:00, 8532.10 examples/s]
Map: 100%|██████████| 13420/13420 [00:02<00:00, 6696.74 examples/s]


In [34]:
train_tokenized = train_tokenized.rename_column("target","labels")
val_tokenized = val_tokenized.rename_column("target","labels")
test_tokenized = test_tokenized.rename_column("target","labels")

In [35]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

# BERT Upload

## Freezing layers

In [36]:
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [37]:
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    positive_class_probs = probabilities[:, 1]
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)
    
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

## Training params

In [38]:
# lr = 2e-5
# batch_size = 32
# num_epochs = 5

# training_args = TrainingArguments(
#     output_dir="/kaggle/working/real/bert-fakenews_classifier",
#     learning_rate=lr,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=num_epochs,
#     logging_strategy="steps",
#     eval_strategy="steps",
#     eval_steps=1000,
#     logging_steps=50,
#     save_strategy="steps",
#     save_steps=2000,
#     load_best_model_at_end=True,
# )

In [39]:
print(train_tokenized[0].keys())

dict_keys(['labels', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'])


In [40]:
print(test_tokenized[0].keys())

dict_keys(['labels', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'])


## Training pooling layers

In [41]:
##trainer = Trainer(
##    args=training_args,
#    train_dataset=train_tokenized,
#    eval_dataset=val_tokenized,
#    tokenizer=tokenizer,
#    data_collator=data_collator,
#    compute_metrics=compute_metrics,
#)

#trainer.train()

In [42]:
#import shutil

#shutil.make_archive(
  #  "/kaggle/working/bert_classifier_backup_8000",  
  # 'zip',
  #  "/kaggle/working/real/bert-fakenews_classifier/checkpoint-8000"  
#)

# Trained model upload

In [44]:
model_path = "Igoras6534/fine_tuned_BERT_fakenews_8000"  

model_ready = AutoModelForSequenceClassification.from_pretrained(model_path)


loading configuration file config.json from cache at C:\Users\ignat\.cache\huggingface\hub\models--Igoras6534--fine_tuned_BERT_fakenews_8000\snapshots\979892c3a826a4776928476fa4a23a6876cbf601\config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Fake",
    "1": "Real"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Fake": 0,
    "Real": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vo

In [45]:
trainer = Trainer(
    model=model_ready,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

  trainer = Trainer(
No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Evaluation on test set

In [47]:
predictions = trainer.predict(test_tokenized.remove_columns(["labels"]))
logits = predictions.predictions
metrics = compute_metrics((logits, original_test_labels.reset_index(drop=True)))
print(metrics)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 13420
  Batch size = 8


{'Accuracy': np.float64(0.873), 'AUC': np.float64(0.943)}


In [52]:
model.eval() 

sample_text="Scientists have discovered that the latest 5G networks are secretly altering human DNA, causing irreversible mutations that could affect generations to come. Despite official statements claiming safety, whistleblowers inside telecom companies reveal that the government is hiding the truth to push the rollout faster. Early symptoms include chronic fatigue, memory loss, and mysterious skin rashes appearing in cities with new 5G towers. Experts warn this could trigger a public health crisis worse than any pandemic we've seen before. Stay informed and protect yourself from this invisible threat."
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

probs = F.softmax(logits, dim=-1)

predicted_class_idx = torch.argmax(probs, dim=-1).item()

print(f"Tekst: {sample_text}")
print(f"Przewidywana klasa: {id2label.get(predicted_class_idx, str(predicted_class_idx))}")
print(f"Prawdopodobieństwa: {probs.squeeze().tolist()}")


Tekst: Scientists have discovered that the latest 5G networks are secretly altering human DNA, causing irreversible mutations that could affect generations to come. Despite official statements claiming safety, whistleblowers inside telecom companies reveal that the government is hiding the truth to push the rollout faster. Early symptoms include chronic fatigue, memory loss, and mysterious skin rashes appearing in cities with new 5G towers. Experts warn this could trigger a public health crisis worse than any pandemic we've seen before. Stay informed and protect yourself from this invisible threat.
Przewidywana klasa: Fake
Prawdopodobieństwa: [0.6091264486312866, 0.3908735513687134]
