In [None]:
# Mount Google Drive at /content/drive; every later cell reads and writes
# project files below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Scaffold the credibility module: directories first, then empty placeholder files.

import os

# Root of the credibility module on Drive
base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"

# Directories, relative to base_path
folders = [
    "data",
    "data/processed",
    "model",
    "model/credibility_model",
    "src",
    "notebook"
]

# Placeholder files, relative to base_path
files = [
    "src/preprocessing.py",
    "src/train.py",
    "src/predict.py",
    "src/extract.py",
    "src/config.py",
    "src/utils.py",
    "main.py",
    "evaluate.py",
    "requirements.txt",
    "README.md"
]

# makedirs(exist_ok=True) makes this step safe to re-run
for relative_dir in folders:
    folder_path = os.path.join(base_path, relative_dir)
    os.makedirs(folder_path, exist_ok=True)
    print(f"Folder created: {folder_path}")

# Existing files are reported and left untouched; missing ones are created empty
for relative_file in files:
    file_path = os.path.join(base_path, relative_file)

    if os.path.exists(file_path):
        print(f"File already exists: {file_path}")
        continue

    with open(file_path, "w") as new_file:
        new_file.write("")
    print(f"File created: {file_path}")

print("\nFull project structure created successfully.")

Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/data
Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/data/processed
Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/model
Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/model/credibility_model
Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/src
Folder created: /content/drive/MyDrive/Main_Project_26/credibility_module/notebook
File already exists: /content/drive/MyDrive/Main_Project_26/credibility_module/src/preprocessing.py
File already exists: /content/drive/MyDrive/Main_Project_26/credibility_module/src/train.py
File already exists: /content/drive/MyDrive/Main_Project_26/credibility_module/src/predict.py
File already exists: /content/drive/MyDrive/Main_Project_26/credibility_module/src/extract.py
File already exists: /content/drive/MyDrive/Main_Project_26/credibility_module/src/config.py
File already exists:

In [None]:
# Define your data folder path
data_path = "/content/drive/MyDrive/Main_Project_26/credibility_module/data"

# Download dataset zip file directly into data folder
!wget -P "$data_path" https://www.cs.ucsb.edu/~william/data/liar_dataset.zip

--2026-02-26 08:06:00--  https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving www.cs.ucsb.edu (www.cs.ucsb.edu)... 23.185.0.253, 2620:12a:8000::253, 2620:12a:8001::253
Connecting to www.cs.ucsb.edu (www.cs.ucsb.edu)|23.185.0.253|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip [following]
--2026-02-26 08:06:00--  https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving sites.cs.ucsb.edu (sites.cs.ucsb.edu)... 128.111.27.164
Connecting to sites.cs.ucsb.edu (sites.cs.ucsb.edu)|128.111.27.164|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1013571 (990K) [application/zip]
Saving to: ‘/content/drive/MyDrive/Main_Project_26/credibility_module/data/liar_dataset.zip.1’


2026-02-26 08:06:02 (1021 KB/s) - ‘/content/drive/MyDrive/Main_Project_26/credibility_module/data/liar_dataset.zip.1’ saved [1013571/1013571]



In [None]:
# Extract dataset
!unzip -o "$data_path/liar_dataset.zip" -d "$data_path"

Archive:  /content/drive/MyDrive/Main_Project_26/credibility_module/data/liar_dataset.zip
  inflating: /content/drive/MyDrive/Main_Project_26/credibility_module/data/README  
  inflating: /content/drive/MyDrive/Main_Project_26/credibility_module/data/test.tsv  
  inflating: /content/drive/MyDrive/Main_Project_26/credibility_module/data/train.tsv  
  inflating: /content/drive/MyDrive/Main_Project_26/credibility_module/data/valid.tsv  


In [None]:
import os

data_path = "/content/drive/MyDrive/Main_Project_26/credibility_module/data"

# Sanity check: show what the download and unzip steps left in the data folder
print("Files in data folder:")
for entry_name in os.listdir(data_path):
    print(entry_name)

Files in data folder:
train.tsv
valid.tsv
test.tsv
README
liar_dataset.zip
processed
liar_dataset.zip.1


In [None]:
# Write preprocessing.py into src folder

code = '''
import csv
import os

import pandas as pd
from datasets import Dataset

# Column names of LIAR dataset (the TSV files have no header row)
COLUMNS = [
    "id", "label", "statement", "subject", "speaker",
    "speaker_job", "state", "party", "barely_true_counts",
    "false_counts", "half_true_counts", "mostly_true_counts",
    "pants_on_fire_counts", "context"
]

# Label mapping: textual rating -> class index
LABEL_MAP = {
    "pants-fire": 0,
    "false": 1,
    "barely-true": 2,
    "half-true": 3,
    "mostly-true": 4,
    "true": 5
}


def load_dataset(file_path):
    """Read one LIAR split and return a Dataset with "statement" and "label".

    Args:
        file_path: Path to train.tsv, valid.tsv or test.tsv.

    Returns:
        datasets.Dataset with a string "statement" column and an integer
        "label" column encoded through LABEL_MAP.
    """
    df = pd.read_csv(
        file_path,
        sep="\\t",
        names=COLUMNS,
        # Statements contain unbalanced double quotes. With the default
        # quoting the parser treats them as field delimiters and silently
        # merges several rows into one, so quotes are read literally here.
        quoting=csv.QUOTE_NONE,
    )

    df = df[["statement", "label"]].copy()
    df["label"] = df["label"].map(LABEL_MAP)

    # Rows with a missing statement or a label outside LABEL_MAP cannot be
    # used for training; dropping them also keeps the label column integer.
    df = df.dropna(subset=["statement", "label"])
    df["label"] = df["label"].astype(int)

    # preserve_index=False keeps the pandas index out of the saved dataset
    return Dataset.from_pandas(df, preserve_index=False)


def tokenize_dataset(dataset, tokenizer):
    """Add tokenizer outputs for "statement", padded/truncated to 256 tokens."""
    return dataset.map(
        lambda x: tokenizer(
            x["statement"],
            truncation=True,
            padding="max_length",
            max_length=256
        ),
        batched=True
    )


def preprocess_and_save(data_dir, tokenizer):
    """Tokenize the train/valid/test splits and save them under data_dir/processed.

    Args:
        data_dir: Folder that contains train.tsv, valid.tsv and test.tsv.
        tokenizer: Hugging Face tokenizer used to encode the statements.
    """
    processed_dir = os.path.join(data_dir, "processed")
    os.makedirs(processed_dir, exist_ok=True)

    splits = ["train", "valid", "test"]

    for split in splits:

        print(f"Processing {split} dataset...")

        file_path = os.path.join(data_dir, f"{split}.tsv")

        dataset = load_dataset(file_path)

        dataset = tokenize_dataset(dataset, tokenizer)

        save_path = os.path.join(processed_dir, split)

        dataset.save_to_disk(save_path)

        print(f"Saved to {save_path}")

    print("Preprocessing completed successfully.")
'''

path = "/content/drive/MyDrive/Main_Project_26/credibility_module/src/preprocessing.py"

with open(path, "w") as f:
    f.write(code)

print("preprocessing.py created successfully.")

preprocessing.py created successfully.


In [None]:
import sys

from transformers import AutoTokenizer

# Make the modules written to Drive importable from this kernel
src_dir = "/content/drive/MyDrive/Main_Project_26/credibility_module/src"
sys.path.append(src_dir)

from preprocessing import preprocess_and_save

# Folder holding the raw LIAR .tsv files
data_dir = "/content/drive/MyDrive/Main_Project_26/credibility_module/data"

# Tokenizer matching the checkpoint that is fine-tuned later
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize every split and store it under data/processed
preprocess_and_save(data_dir, tokenizer)

Processing train dataset...


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10240 [00:00<?, ? examples/s]

Saved to /content/drive/MyDrive/Main_Project_26/credibility_module/data/processed/train
Processing valid dataset...


Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1284 [00:00<?, ? examples/s]

Saved to /content/drive/MyDrive/Main_Project_26/credibility_module/data/processed/valid
Processing test dataset...


Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1267 [00:00<?, ? examples/s]

Saved to /content/drive/MyDrive/Main_Project_26/credibility_module/data/processed/test
Preprocessing completed successfully.


In [None]:
import os

processed_path = "/content/drive/MyDrive/Main_Project_26/credibility_module/data/processed"

# Confirm that each split was written by the preprocessing step
for split in ("train", "valid", "test"):

    path = processed_path + "/" + split

    print(f"\n{split} folder contents:")

    print(os.listdir(path) if os.path.exists(path) else "Not found")


train folder contents:
['data-00000-of-00001.arrow', 'state.json', 'dataset_info.json']

valid folder contents:
['data-00000-of-00001.arrow', 'state.json', 'dataset_info.json']

test folder contents:
['data-00000-of-00001.arrow', 'state.json', 'dataset_info.json']


In [None]:
code = '''
import os
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer


def train_model(base_path):
    """Fine-tune DistilBERT on the processed LIAR splits and save the result.

    Reads data/processed/train and data/processed/valid below base_path,
    writes checkpoints to base_path/results and stores the final model and
    tokenizer in base_path/model/credibility_model.

    Args:
        base_path: Root folder of the credibility module.
    """
    processed_path = os.path.join(base_path, "data", "processed")
    model_save_path = os.path.join(base_path, "model", "credibility_model")

    print("Loading processed datasets...")

    train_dataset = load_from_disk(os.path.join(processed_path, "train"))
    valid_dataset = load_from_disk(os.path.join(processed_path, "valid"))

    print("Datasets loaded successfully.")

    model_name = "distilbert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=6
    )

    # logging_dir is intentionally not set: the library reports it as
    # deprecated, and the default location below output_dir is sufficient.
    training_args = TrainingArguments(
        output_dir=os.path.join(base_path, "results"),
        eval_strategy="epoch",
        save_strategy="epoch",
        # Reload the checkpoint with the lowest validation loss once training
        # ends, so the saved model is the best epoch rather than the last one.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        # Keep only a couple of checkpoints (the best one is always retained)
        # instead of one full checkpoint per epoch on Drive.
        save_total_limit=2,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=8,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    print("Starting training...")

    trainer.train()

    print("Training completed.")

    print("Saving model...")

    os.makedirs(model_save_path, exist_ok=True)

    # trainer.model holds the best-checkpoint weights at this point
    trainer.model.save_pretrained(model_save_path)

    tokenizer.save_pretrained(model_save_path)

    print("Model saved at:", model_save_path)
'''

path = "/content/drive/MyDrive/Main_Project_26/credibility_module/src/train.py"

with open(path, "w") as f:
    f.write(code)

print("train.py created successfully.")

train.py created successfully.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"

# Expose src/ so the train.py written above can be imported
sys.path.append(f"{base_path}/src")

from train import train_model

# Fine-tune on the processed splits and store the model under model/credibility_model
train_model(base_path)

Loading processed datasets...
Datasets loaded successfully.


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


Starting training...


Epoch,Training Loss,Validation Loss
1,1.716637,1.695116
2,1.578048,1.680243


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training completed.
Saving model...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved at: /content/drive/MyDrive/Main_Project_26/credibility_module/model/credibility_model


In [None]:
import os

model_path = "/content/drive/MyDrive/Main_Project_26/credibility_module/model/credibility_model"

# Show which model and tokenizer files the training step saved
saved_files = os.listdir(model_path)
print(saved_files)

['config.json', 'model.safetensors', 'tokenizer_config.json', 'tokenizer.json']


In [None]:
import os

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
predict_file = os.path.join(base_path, "src", "predict.py")

code = """
import os
from functools import lru_cache

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Label names, indexed by the class id used during training
LABEL_NAMES = [
    "Pants on Fire",
    "False",
    "Barely True",
    "Half True",
    "Mostly True",
    "True"
]


@lru_cache(maxsize=None)
def load_model(model_path):
    '''Load the tokenizer and the fine-tuned classifier stored at model_path.

    The pair is cached per model_path so that repeated predictions do not
    reload the weights from disk every time. Call load_model.cache_clear()
    after retraining to pick up new weights in a running process.

    Returns:
        (tokenizer, model) with the model switched to eval mode.
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    model.eval()

    return tokenizer, model


def predict_text(text, model_path):
    '''Classify one piece of text.

    Args:
        text: Text to classify; it is truncated to 128 tokens.
        model_path: Folder containing the saved model and tokenizer.

    Returns:
        dict with "prediction" (one of LABEL_NAMES) and "confidence"
        (softmax probability of that class, rounded to 3 decimals).
    '''
    tokenizer, model = load_model(model_path)

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # Probabilities of the single input row; taking the argmax over this row
    # (not over the flattened batch tensor) keeps the index a valid class id.
    probs = F.softmax(outputs.logits, dim=-1)[0]

    pred_class = int(torch.argmax(probs).item())

    confidence = probs[pred_class].item()

    return {
        "prediction": LABEL_NAMES[pred_class],
        "confidence": round(confidence, 3)
    }
"""

with open(predict_file, "w") as f:
    f.write(code)

print("predict.py created successfully.")

predict.py created successfully.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
model_path = f"{base_path}/model/credibility_model"

# Expose src/ so predict.py can be imported
sys.path.append(f"{base_path}/src")

from predict import predict_text

# Smoke test of the saved model on a single sentence
text = "The government increased taxes last year."

result = predict_text(text, model_path)

print(result)

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

{'prediction': 'False', 'confidence': 0.242}


In [None]:
!pip install newspaper3k lxml_html_clean

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.4.0-py3-none-any.whl.metadata (2.4 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downl

In [None]:
import os

# Source of src/extract.py; written to Drive verbatim below
code = """
from newspaper import Article

def extract_article_text(url):

    try:
        article = Article(url)

        article.download()

        article.parse()

        text = article.text

        return text

    except Exception as e:

        print("Error extracting article:", str(e))

        return None
"""

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
extract_file = os.path.join(base_path, "src", "extract.py")

# Mode "w" replaces the placeholder file created by the scaffold cell
with open(extract_file, mode="w") as module_file:
    module_file.write(code)

print("extract.py created successfully.")

extract.py created successfully.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"

# Expose src/ so extract.py can be imported
sys.path.append(base_path + "/src")

from extract import extract_article_text

url = "https://www.bbc.com/news/world-asia-67471138"

text = extract_article_text(url)

# extract_article_text returns None when the download or parse fails,
# so guard before slicing instead of failing with a TypeError.
if text:
    print(text[:500])
else:
    print("No article text could be extracted from:", url)

But many members of their families are now in Chinese custody; some have made remorseful confessions. Thousands of those working in the scam centres have already been handed over to the Chinese police. Governments in the region are trying to get hundreds more, still trapped in Laukkaing, out to safety.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
model_path = f"{base_path}/model/credibility_model"

# Expose src/ so both helper modules can be imported
sys.path.append(f"{base_path}/src")

from extract import extract_article_text
from predict import predict_text

url = "https://www.bbc.com/news/world-asia-67471138"

# End-to-end check: fetch the article, then classify its text
text = extract_article_text(url)

if text:

    result = predict_text(text, model_path)

    for heading, key in (("Prediction:", "prediction"), ("Confidence:", "confidence")):
        print(heading, result[key])

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Prediction: Barely True
Confidence: 0.227


In [None]:
import os

# Source of main.py (command-line entry point); written to Drive verbatim below
code = """
import sys
import os

# Add src folder to path
base_path = os.path.dirname(os.path.abspath(__file__))

sys.path.append(os.path.join(base_path, "src"))

from extract import extract_article_text
from predict import predict_text


def check_news_url(url):

    model_path = os.path.join(base_path, "model", "credibility_model")

    print("\\nExtracting article text...")

    text = extract_article_text(url)

    if not text or len(text.strip()) == 0:

        print("Failed to extract article text.")

        return

    print("Text extracted successfully.")

    print("\\nPredicting credibility...")

    result = predict_text(text, model_path)

    print("\\n===== RESULT =====")

    print("Prediction :", result["prediction"])

    print("Confidence :", result["confidence"])


if __name__ == "__main__":

    url = input("Enter news URL: ")

    check_news_url(url)
"""

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
main_file = os.path.join(base_path, "main.py")

# Mode "w" replaces the placeholder file created by the scaffold cell
with open(main_file, mode="w") as module_file:
    module_file.write(code)

print("main.py created successfully.")

main.py created successfully.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"

# main.py lives in the project root, so the root itself goes on the import path
sys.path.append(base_path)

from main import check_news_url

# Run the full URL -> text -> prediction pipeline once
url = "https://www.bbc.com/news/world-asia-67471138"

check_news_url(url)


Extracting article text...
Text extracted successfully.

Predicting credibility...


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]


===== RESULT =====
Prediction : Barely True
Confidence : 0.227


In [None]:
import os

# One package per line, as expected by `pip install -r requirements.txt`
requirements = """torch
transformers
datasets
pandas
newspaper3k
lxml_html_clean
scikit-learn
numpy
"""

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
req_file = os.path.join(base_path, "requirements.txt")

# Mode "w" replaces the placeholder file created by the scaffold cell
with open(req_file, mode="w") as requirements_file:
    requirements_file.write(requirements)

print("requirements.txt created successfully.")

requirements.txt created successfully.


In [None]:
import os

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
readme_file = os.path.join(base_path, "README.md")

# README content. Commands are fenced and multi-line facts are written as
# lists so they render as intended in Markdown.
readme = """# News Credibility Detection System using DistilBERT

## Project Overview

This project detects the credibility of news articles using a fine-tuned DistilBERT transformer model. The system extracts article text from a given news URL and predicts its credibility level.

## Features

- Extract article text from news URL
- Preprocess LIAR dataset
- Train DistilBERT model for credibility classification
- Predict credibility with confidence score
- Evaluate the trained model on the LIAR test split
- Fully modular file-based structure

## Dataset

Dataset used: LIAR dataset

Classes:
- Pants on Fire
- False
- Barely True
- Half True
- Mostly True
- True

## Project Structure

credibility_module/

- data/ → dataset files
- model/ → trained model
- src/ → source code
- notebook/ → notebooks
- main.py → main execution file
- evaluate.py → evaluation on the test split
- requirements.txt → dependencies
- README.md → documentation

## Installation

Install dependencies:

```bash
pip install -r requirements.txt
```

## Usage

Run the main file:

```bash
python main.py
```

Enter a news URL to get credibility prediction.

Evaluate the trained model on the test split:

```bash
python evaluate.py
```

## Technologies Used

- Python
- PyTorch
- HuggingFace Transformers
- DistilBERT
- Newspaper3k
- Google Colab

## Model

- Base model: distilbert-base-uncased
- Fine-tuned on LIAR dataset

## Output Example

The values below only illustrate the output format; they are not a measured result.

```text
Prediction : Mostly True
Confidence : 0.842
```

## Author

- MCA Final Year Project
- News Credibility Detection System
"""

# The text contains non-ASCII characters (the arrows above), so the encoding
# is stated explicitly instead of relying on the platform default.
with open(readme_file, "w", encoding="utf-8") as f:
    f.write(readme)

print("README.md created successfully.")

README.md created successfully.


In [None]:
import os

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
eval_file = os.path.join(base_path, "evaluate.py")

code = """
import os
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report


# Display names, indexed by the class id used during training
LABEL_NAMES = [
    "Pants on Fire",
    "False",
    "Barely True",
    "Half True",
    "Mostly True",
    "True"
]


def evaluate_model(base_path, batch_size=32):
    '''Evaluate the saved credibility model on the processed test split.

    Prints the accuracy and a per-class classification report.

    Args:
        base_path: Root folder of the credibility module.
        batch_size: Number of statements classified per forward pass.
    '''
    processed_path = os.path.join(base_path, "data", "processed", "test")

    model_path = os.path.join(base_path, "model", "credibility_model")

    print("Loading test dataset...")

    test_dataset = load_from_disk(processed_path)

    print("Loading model...")

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Use the GPU when one is available; fall back to CPU otherwise
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    statements = list(test_dataset["statement"])
    true_labels = list(test_dataset["label"])
    pred_labels = []

    print("Running evaluation...")

    # Classify in batches instead of one forward pass per example
    for start in range(0, len(statements), batch_size):

        batch = statements[start:start + batch_size]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )

        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        pred_labels.extend(logits.argmax(dim=1).tolist())

    accuracy = accuracy_score(true_labels, pred_labels)

    print("\\nAccuracy:", round(accuracy * 100, 2), "%")

    print("\\nClassification Report:")

    # labels= pins the report to all six classes, so target_names always
    # lines up even if a class never occurs in the labels or predictions;
    # zero_division=0 reports 0.0 for such classes without a warning.
    print(
        classification_report(
            true_labels,
            pred_labels,
            labels=list(range(len(LABEL_NAMES))),
            target_names=LABEL_NAMES,
            zero_division=0
        )
    )


if __name__ == "__main__":

    base_path = os.path.dirname(os.path.abspath(__file__))

    evaluate_model(base_path)
"""

with open(eval_file, "w") as f:
    f.write(code)

print("evaluate.py created successfully.")

evaluate.py created successfully.


In [None]:
import sys

base_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"

# evaluate.py lives in the project root, so the root itself goes on the import path
sys.path.append(base_path)

from evaluate import evaluate_model

# Score the saved model on the processed test split
evaluate_model(base_path)

Loading test dataset...
Loading model...


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Running evaluation...

Accuracy: 29.04 %

Classification Report:
               precision    recall  f1-score   support

Pants on Fire       0.33      0.14      0.20        92
        False       0.28      0.40      0.33       249
  Barely True       0.30      0.19      0.23       212
    Half True       0.28      0.28      0.28       265
  Mostly True       0.29      0.34      0.31       241
         True       0.31      0.29      0.30       208

     accuracy                           0.29      1267
    macro avg       0.30      0.27      0.28      1267
 weighted avg       0.29      0.29      0.29      1267



In [None]:
# Package the whole credibility module as a single ZIP for submission

import shutil

project_path = "/content/drive/MyDrive/Main_Project_26/credibility_module"
zip_output_path = "/content/drive/MyDrive/Main_Project_26/credibility_module_submission"

# make_archive appends the ".zip" extension to the base name itself
shutil.make_archive(base_name=zip_output_path, format='zip', root_dir=project_path)

print("ZIP file created successfully!")
print("Location:", zip_output_path + ".zip")

ZIP file created successfully!
Location: /content/drive/MyDrive/Main_Project_26/credibility_module_submission.zip


## Propaganda detection module starts here


In [None]:
# Scaffold the PROPAGANDA MODULE: directories first, then empty placeholder files.

import os

# Root of the propaganda module on Drive
base_path = "/content/drive/MyDrive/Main_Project_26/propaganda_module"

# Directories, relative to base_path
folders = [
    "data",
    "data/processed",
    "model",
    "model/propaganda_model",
    "src",
    "notebook"
]

# Placeholder files, relative to base_path
files = [
    "src/preprocessing.py",
    "src/train.py",
    "src/predict.py",
    "src/extract.py",
    "src/technique_detector.py",
    "src/config.py",
    "src/utils.py",
    "main.py",
    "evaluate.py",
    "requirements.txt",
    "README.md"
]

# makedirs(exist_ok=True) makes this step safe to re-run
for relative_dir in folders:
    folder_path = os.path.join(base_path, relative_dir)
    os.makedirs(folder_path, exist_ok=True)
    print(f"Folder created: {folder_path}")

# Existing files are reported and left untouched; missing ones are created empty
for relative_file in files:
    file_path = os.path.join(base_path, relative_file)

    if os.path.exists(file_path):
        print(f"File already exists: {file_path}")
        continue

    with open(file_path, "w") as new_file:
        new_file.write("")
    print(f"File created: {file_path}")

print("\nPropaganda module structure created successfully.")

Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/data
Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/data/processed
Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/model
Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/model/propaganda_model
Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src
Folder created: /content/drive/MyDrive/Main_Project_26/propaganda_module/notebook
File created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src/preprocessing.py
File created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src/train.py
File created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src/predict.py
File created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src/extract.py
File created: /content/drive/MyDrive/Main_Project_26/propaganda_module/src/technique_detector.py
File created: /content/drive/MyDrive/Main_Project_26/pr

In [None]:
# kagglehub is used by the next cell to download the propaganda dataset from Kaggle.
!pip install kagglehub



In [None]:
import os

import kagglehub

# dataset_download returns the local folder that holds the dataset files;
# `path` is reused by the copy cell that follows.
path = kagglehub.dataset_download("mahdimashayekhi/propaganda-detection")

print("Downloaded to:", path)

print("Files inside dataset:")
dataset_files = os.listdir(path)
print(dataset_files)

Using Colab cache for faster access to the 'propaganda-detection' dataset.
Downloaded to: /kaggle/input/propaganda-detection
Files inside dataset:
['propaganda_dataset.csv']


In [None]:
import os
import shutil

# `path` is the download folder returned by kagglehub in the previous cell.
# Sorting makes the choice deterministic if the folder ever holds several CSVs.
csv_names = sorted(name for name in os.listdir(path) if name.endswith(".csv"))

# Fail with a clear message instead of passing None to shutil.copy
if not csv_names:
    raise FileNotFoundError(f"No .csv file found in downloaded dataset folder: {path}")

source_csv = os.path.join(path, csv_names[0])

destination = "/content/drive/MyDrive/Main_Project_26/propaganda_module/data/dataset.csv"

# Copy under a fixed name so the rest of the project does not depend on the Kaggle file name
shutil.copy(source_csv, destination)

print("Dataset copied to:", destination)

Dataset copied to: /content/drive/MyDrive/Main_Project_26/propaganda_module/data/dataset.csv


In [None]:
import pandas as pd

dataset_csv = "/content/drive/MyDrive/Main_Project_26/propaganda_module/data/dataset.csv"

# Quick look at the copied dataset: size first, then the first rows
df = pd.read_csv(dataset_csv)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (10000, 2)


Unnamed: 0,text,label
0,FYI: Look how they twist facts to confuse the ...,1
1,Alert: The media is run by traitors hiding the...,1
2,FYI: This is our last chance to defend our val...,1
3,The hiking trail reopened after seasonal closu...,0
4,FYI: Only a true patriot would support this pl...,1


In [None]:
# Switch the working directory to the propaganda module: the following cells
# write files with relative paths ("src/...") and run scripts with `!python src/...`.
%cd /content/drive/MyDrive/Main_Project_26/propaganda_module

/content/drive/MyDrive/Main_Project_26/propaganda_module


In [None]:
config_code = '''
import os

# Project root = parent of the src/ folder that contains this file.
# Deriving it from __file__ keeps the project runnable from any location,
# not only from the Colab Drive mount it was created on.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Input dataset and output folder of the fine-tuned model
DATASET_PATH = os.path.join(BASE_DIR, "data", "dataset.csv")

MODEL_PATH = os.path.join(BASE_DIR, "model", "propaganda_model")

# Base checkpoint and number of target classes
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 2

# Maximum number of tokens per input text
MAX_LENGTH = 128

# Training hyperparameters
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8

NUM_EPOCHS = 2

LEARNING_RATE = 2e-5

# Fraction of the dataset held out for evaluation
TEST_SIZE = 0.1

# Class id -> display name
LABEL_MAP = {
    0: "Non-propaganda",
    1: "Propaganda"
}
'''

# Relative path: relies on the working directory being the propaganda module root
with open("src/config.py", "w") as f:
    f.write(config_code)

print("config.py created")

config.py created


In [None]:
preprocessing_code = '''
import pandas as pd
from datasets import Dataset
from src.config import TEST_SIZE


def load_and_prepare_dataset(csv_path, seed=42):
    """Load the CSV, drop incomplete rows and split it into train and test.

    Args:
        csv_path: Path to the dataset CSV file.
        seed: Seed for the shuffle behind the split. A fixed value gives the
            same train/test membership on every run, so results of different
            runs are comparable.

    Returns:
        (train_dataset, test_dataset) where the test part holds TEST_SIZE of
        the rows.
    """
    df = pd.read_csv(csv_path)

    df = df.dropna()

    # preserve_index=False keeps the pandas index (non-contiguous after
    # dropna) from being stored as an extra dataset column.
    dataset = Dataset.from_pandas(df, preserve_index=False)

    split = dataset.train_test_split(test_size=TEST_SIZE, seed=seed)

    return split["train"], split["test"]
'''

# Relative path: relies on the working directory being the propaganda module root
with open("src/preprocessing.py", "w") as f:
    f.write(preprocessing_code)

print("preprocessing.py created")

preprocessing.py created


In [None]:
train_code = '''
import os
import sys

# Make the project root importable so that "src.*" resolves when this file
# is run as a script (python src/train.py).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from src.preprocessing import load_and_prepare_dataset
# Explicit names instead of a wildcard import, so it is clear which settings
# this script depends on.
from src.config import (
    BASE_DIR,
    DATASET_PATH,
    EVAL_BATCH_SIZE,
    LEARNING_RATE,
    MAX_LENGTH,
    MODEL_NAME,
    MODEL_PATH,
    NUM_EPOCHS,
    NUM_LABELS,
    TRAIN_BATCH_SIZE,
)


def tokenize(examples, tokenizer):
    """Tokenize the "text" field of a batch, padded/truncated to MAX_LENGTH."""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )


def train():
    """Fine-tune MODEL_NAME on the dataset and save model and tokenizer to MODEL_PATH."""

    print("Loading dataset...")
    train_dataset, test_dataset = load_and_prepare_dataset(DATASET_PATH)

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Tokenizing dataset...")
    train_dataset = train_dataset.map(
        lambda x: tokenize(x, tokenizer),
        batched=True
    )

    test_dataset = test_dataset.map(
        lambda x: tokenize(x, tokenizer),
        batched=True
    )

    # Remove text column
    train_dataset = train_dataset.remove_columns(["text"])
    test_dataset = test_dataset.remove_columns(["text"])

    # Convert to torch format
    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS
    )

    print("Setting training arguments...")
    # Checkpoints go below the project root regardless of the directory the
    # script is started from. logging_dir is not set because the library
    # reports it as deprecated.
    training_args = TrainingArguments(
        output_dir=os.path.join(BASE_DIR, "results"),
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS
    )

    print("Initializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    print("Training started...")
    trainer.train()

    # No evaluation schedule is configured, so the held-out split would
    # otherwise never be used; score it once after training.
    print("Evaluating on held-out split...")
    metrics = trainer.evaluate()
    print("Evaluation metrics:", metrics)

    print("Saving model...")
    os.makedirs(MODEL_PATH, exist_ok=True)

    model.save_pretrained(MODEL_PATH)
    tokenizer.save_pretrained(MODEL_PATH)

    print("Model saved successfully at:", MODEL_PATH)


if __name__ == "__main__":
    train()
'''

# Relative path: relies on the working directory being the propaganda module root
with open("src/train.py", "w") as f:
    f.write(train_code)

print("Corrected train.py created successfully.")

Corrected train.py created successfully.


In [None]:
# Install the libraries needed by the scripts that the following cells run with `!python`.
!pip install transformers datasets torch pandas scikit-learn newspaper3k lxml_html_clean



In [None]:
# Run the training script; the relative path requires the working directory
# to be the propaganda module root.
!python src/train.py

Loading dataset...
Loading tokenizer...
Tokenizing dataset...
Map: 100% 9000/9000 [00:01<00:00, 6116.65 examples/s]
Map: 100% 1000/1000 [00:00<00:00, 8204.77 examples/s]
Loading model...
Loading weights: 100% 100/100 [00:00<00:00, 644.58it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.bias    | [38;5;208mUNEXPECTED[0m | 
vocab_transform.weight  | [38;5;208mUNEXPECTED[0m | 
vocab_layer_norm.weight | [38;5;208mUNEXPECTED[0m | 
vocab_layer_norm.bias   | [38;5;208mUNEXPECTED[0m | 
vocab_projector.bias    | [38;5;208mUNEXPECTED[0m | 
classifier.bias         | [31mMISSING[0m    | 
pre_classifier.bias     | [31mMISSING[0m    | 
pre_classifier.weight   | [31mMISSING[0m    | 
classifier.weight       | [31mMISSING[0m    | 

[3mNotes:
- [38;5;208mUNEXPECTED[0m[3m	:ca

In [None]:
# Source of src/predict.py; written verbatim below. Note that the module
# loads the trained model at import time.
predict_code = '''
import os
import sys
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fix path for Colab
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.config import MODEL_PATH, LABEL_MAP, MAX_LENGTH


# Load tokenizer and model
print("Loading trained model...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

model.eval()


def predict(text):
    """
    Predict whether text is propaganda or not
    """

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH
    )

    with torch.no_grad():
        outputs = model(**inputs)

    probs = F.softmax(outputs.logits, dim=1)

    pred = torch.argmax(probs).item()

    confidence = probs[0][pred].item()

    return {
        "label": LABEL_MAP[pred],
        "confidence": round(confidence, 3)
    }


# Test example
if __name__ == "__main__":

    sample = "This corrupt government is destroying the country."

    result = predict(sample)

    print(result)
'''

# Relative path: relies on the working directory being the propaganda module root
with open("src/predict.py", mode="w") as module_file:
    module_file.write(predict_code)

print("predict.py created successfully.")

predict.py created successfully.


In [None]:
!python src/predict.py

Loading trained model...
Loading weights: 100% 104/104 [00:00<00:00, 1963.05it/s, Materializing param=pre_classifier.weight]
{'label': 'Propaganda', 'confidence': 1.0}


In [None]:
# Source of src/extract.py: downloads a URL with newspaper's Article and
# returns the parsed body text, or an "Error..." string on failure.
extract_code = '''
import os
import sys
from newspaper import Article

# Fix path for Colab
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def extract_article(url):
    """
    Extract the main article text from a URL.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The extracted text on success. On failure, a string that starts with
        "Error" (callers detect failures with text.startswith("Error")).
    """

    try:
        article = Article(url)

        article.download()

        article.parse()

        text = article.text

        # Whitespace-only output carries no content, so treat it as empty too.
        if not text or not text.strip():
            return "Error: No text extracted"

        return text

    except Exception as e:

        return f"Error extracting article: {str(e)}"


# Test example
if __name__ == "__main__":

    test_url = "https://www.bbc.com/news"

    text = extract_article(test_url)

    print(text[:500])
'''

# Explicit encoding so the written file does not depend on the platform default.
with open("src/extract.py", "w", encoding="utf-8") as f:
    f.write(extract_code)

print("extract.py created successfully.")

extract.py created successfully.


In [None]:
!python src/extract.py

Will drivers still make a difference in F1 in 2026?

Have F1's new rules damaged its status as the ultimate challenge? Andrew Benson assesses what the drivers are having to do differently and whether skill still matters.


In [None]:
main_code = '''
import os
import sys

# Fix import path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from src.extract import extract_article
from src.predict import predict

try:
    from src.technique_detector import detect_technique
    technique_available = True
except:
    technique_available = False


def analyze_url(url):

    print("\\nExtracting article...")

    text = extract_article(url)

    if text.startswith("Error"):
        return text

    print("Analyzing propaganda...")

    result = predict(text)

    if technique_available and result["label"] == "Propaganda":

        technique = detect_technique(text)

    else:

        technique = "None"

    return {
        "label": result["label"],
        "confidence": result["confidence"],
        "technique": technique
    }


if __name__ == "__main__":

    url = input("https://www.bbc.com/news/articles/cx2g3vmde0eo ")

    result = analyze_url(url)

    print("\\nFinal Result:")
    print(result)
'''

with open("main.py", "w") as f:
    f.write(main_code)

print("main.py created successfully.")

main.py created successfully.


In [None]:
!python main.py

Loading trained model...
Loading weights: 100% 104/104 [00:00<00:00, 1649.25it/s, Materializing param=pre_classifier.weight]
Enter news URL: Traceback (most recent call last):
  File "/content/drive/MyDrive/Main_Project_26/propaganda_module/main.py", line 48, in <module>
^C


In [None]:
# Source of evaluate.py: scores the trained classifier on the held-out split
# returned by load_and_prepare_dataset and prints the accuracy.
evaluate_code = '''
import os
import sys

# Fix import path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from src.preprocessing import load_and_prepare_dataset
from src.predict import predict
from src.config import DATASET_PATH


def evaluate():
    """
    Predict every sample of the test split and print the accuracy.

    Prints the number of samples, the number of correct predictions and the
    accuracy rounded to 4 decimals. Prints a notice and returns early when the
    test split is empty.
    """

    print("Loading dataset...")

    train, test = load_and_prepare_dataset(DATASET_PATH)

    correct = 0

    total = len(test)

    # An empty split would otherwise raise ZeroDivisionError below.
    if total == 0:
        print("Test split is empty; nothing to evaluate.")
        return

    print("Evaluating model...")

    for sample in test:

        result = predict(sample["text"])

        true_label = "Propaganda" if sample["label"] == 1 else "Non-propaganda"

        if result["label"] == true_label:

            correct += 1

    accuracy = correct / total

    print("\\nEvaluation Results:")
    print("Total samples:", total)
    print("Correct predictions:", correct)
    print("Accuracy:", round(accuracy, 4))


if __name__ == "__main__":

    evaluate()
'''

# Explicit encoding so the written file does not depend on the platform default.
with open("evaluate.py", "w", encoding="utf-8") as f:
    f.write(evaluate_code)

print("evaluate.py created successfully.")

evaluate.py created successfully.


In [None]:
!python evaluate.py

Loading trained model...
Loading weights: 100% 104/104 [00:00<00:00, 939.55it/s, Materializing param=pre_classifier.weight]
Loading dataset...
Evaluating model...

Evaluation Results:
Total samples: 1000
Correct predictions: 1000
Accuracy: 1.0


In [None]:
# Source of submit.py: non-interactive run of the pipeline on one URL.
submit_fast_code = '''
import sys
import os

# Fix import path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from src.extract import extract_article
from src.predict import predict

# Submission URL. Pass another URL as the first command-line argument
# (python submit.py <url>) to analyze it without editing this file.
URL = sys.argv[1] if len(sys.argv) > 1 else "https://www.bbc.com/news/articles/c0q3747jvnwo"


def run_submission():
    """
    Extract the article behind URL, classify it and print the result.

    Prints the extraction error message and returns when no text could be
    extracted; otherwise prints the predicted label and its confidence.
    """

    print("Analyzing URL:", URL)

    text = extract_article(URL)

    if text.startswith("Error"):
        print(text)
        return

    result = predict(text)

    print("\\nResult:")
    print("Prediction:", result["label"])
    print("Confidence:", result["confidence"])


if __name__ == "__main__":
    run_submission()
'''

# Explicit encoding so the written file does not depend on the platform default.
with open("submit.py", "w", encoding="utf-8") as f:
    f.write(submit_fast_code)

print("submit.py created successfully.")

submit.py created successfully.


In [None]:
!python submit.py

Loading trained model...
Loading weights: 100% 104/104 [00:00<00:00, 984.58it/s, Materializing param=pre_classifier.weight]
Analyzing URL: https://www.bbc.com/news/articles/c0q3747jvnwo

Result:
Prediction: Propaganda
Confidence: 0.965


In [None]:
# Step 9: bundle the project directory into one ZIP archive for submission

import shutil

project_path = "/content/drive/MyDrive/Main_Project_26/propaganda_module"
zip_output_path = "/content/drive/MyDrive/Main_Project_26/propaganda_module_submission"

# zip_output_path is the archive name without extension; make_archive adds
# ".zip" itself and archives everything found under project_path.
shutil.make_archive(base_name=zip_output_path, format="zip", root_dir=project_path)

print("ZIP file created successfully!")
print("Location:", f"{zip_output_path}.zip")

ZIP file created successfully!
Location: /content/drive/MyDrive/Main_Project_26/propaganda_module_submission.zip


In [None]:
import os

# Root of the RoBERTa credibility project on the mounted Drive.
base_path = "/content/drive/MyDrive/credibility_roberta"

# Sub-directories to create, relative to base_path.
folders = [
    "data/raw",
    "models",
    "src/preprocessing",
    "src/training",
    "src/evaluation",
]

# exist_ok=True makes the cell safe to re-run on an existing tree.
for folder in folders:
    target_dir = os.path.join(base_path, folder)
    os.makedirs(target_dir, exist_ok=True)

print("Project structure created")


Project structure created


In [None]:
!pip install transformers datasets torch pandas scikit-learn tqdm




In [None]:
%cd /content/drive/MyDrive/credibility_roberta/data/raw
# -nc: skip the download when liar_dataset.zip already exists, so re-running
# the cell does not pile up liar_dataset.zip.1, .2, ... copies that are never
# the file unzip reads.
!wget -nc https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
# -o: overwrite previously extracted files instead of stopping at the
# interactive "replace?" prompt.
!unzip -o liar_dataset.zip


/content/drive/MyDrive/credibility_roberta/data/raw
--2026-02-26 10:47:19--  https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving www.cs.ucsb.edu (www.cs.ucsb.edu)... 23.185.0.253, 2620:12a:8001::253, 2620:12a:8000::253
Connecting to www.cs.ucsb.edu (www.cs.ucsb.edu)|23.185.0.253|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip [following]
--2026-02-26 10:47:19--  https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip
Resolving sites.cs.ucsb.edu (sites.cs.ucsb.edu)... 128.111.27.164
Connecting to sites.cs.ucsb.edu (sites.cs.ucsb.edu)|128.111.27.164|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1013571 (990K) [application/zip]
Saving to: ‘liar_dataset.zip.3’


2026-02-26 10:47:21 (996 KB/s) - ‘liar_dataset.zip.3’ saved [1013571/1013571]

Archive:  liar_dataset.zip
replace README? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: README          

In [None]:
%%writefile /content/drive/MyDrive/credibility_roberta/src/preprocessing/dataset_loader.py

import pandas as pd
from datasets import Dataset

# Field names assigned, in order, to the header-less TSV columns.
columns = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state",
    "party",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context",
]

# Label string -> class id, numbered 0..5 in the order listed below.
label_map = {
    name: class_id
    for class_id, name in enumerate(
        ["pants-fire", "false", "barely-true", "half-true", "mostly-true", "true"]
    )
}

def load_dataset(path):
    """Read one tab-separated split and return it as a ``datasets.Dataset``.

    The file is read without a header row; its fields are named after the
    module-level ``columns`` list.

    Args:
        path: Location of the TSV file.

    Returns:
        A Dataset with two columns: ``text`` (statement, subject, speaker,
        party and context joined by single spaces, missing values replaced by
        empty strings) and ``label`` (integer class id from ``label_map``).

    Raises:
        ValueError: if the file contains a label that is not a key of
            ``label_map``. Without this check such rows would silently become
            NaN and turn the whole label column into floats.
    """

    df = pd.read_csv(path, sep="\t", names=columns)

    mapped = df["label"].map(label_map)

    unknown = df.loc[mapped.isna(), "label"].unique().tolist()
    if unknown:
        raise ValueError(f"Unknown label value(s) in {path}: {unknown}")

    # Every label is mapped at this point, so the cast is lossless.
    df["label"] = mapped.astype("int64")

    df["text"] = (
        df["statement"].fillna("") + " " +
        df["subject"].fillna("") + " " +
        df["speaker"].fillna("") + " " +
        df["party"].fillna("") + " " +
        df["context"].fillna("")
    )

    return Dataset.from_pandas(df[["text","label"]])


Overwriting /content/drive/MyDrive/credibility_roberta/src/preprocessing/dataset_loader.py


In [None]:
%%writefile /content/drive/MyDrive/credibility_roberta/src/training/train_roberta.py

import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.path.append("/content/drive/MyDrive/credibility_roberta")

from src.preprocessing.dataset_loader import load_dataset

print("Loading dataset...")

# Load the three splits in one pass; the order of the paths matches the order
# of the names on the left-hand side.
train_dataset, valid_dataset, test_dataset = (
    load_dataset(split_path)
    for split_path in (
        "/content/drive/MyDrive/credibility_roberta/data/raw/train.tsv",
        "/content/drive/MyDrive/credibility_roberta/data/raw/valid.tsv",
        "/content/drive/MyDrive/credibility_roberta/data/raw/test.tsv",
    )
)

# Checkpoint used for both the tokenizer and the classifier.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize the ``text`` field, padding/truncating to 256 tokens."""
    encoded = tokenizer(
        example["text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Add the token columns to every split (batched mapping).
train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Sequence-classification head with six output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
)

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class of each row is the index of its largest logit.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predicted)
    weighted_f1 = f1_score(labels, predicted, average="weighted")

    return {"accuracy": accuracy, "f1": weighted_f1}

# Evaluate and checkpoint after every epoch; with load_best_model_at_end the
# checkpoint with the best validation "f1" is restored once training finishes.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/credibility_roberta/models/results",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    weight_decay=0.01,
    warmup_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# This file runs as a script, so the metrics returned by evaluate() have to be
# printed explicitly or they are lost. The "test" prefix keeps them apart from
# the per-epoch validation metrics.
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
print("Test metrics:", test_metrics)

model.save_pretrained("/content/drive/MyDrive/credibility_roberta/models/credibility_model")
tokenizer.save_pretrained("/content/drive/MyDrive/credibility_roberta/models/credibility_model")


Overwriting /content/drive/MyDrive/credibility_roberta/src/training/train_roberta.py


In [None]:
!pip install -U transformers




In [None]:
!python /content/drive/MyDrive/credibility_roberta/src/training/train_roberta.py


Loading dataset...
Map: 100% 10240/10240 [00:02<00:00, 4698.05 examples/s]
Map: 100% 1284/1284 [00:00<00:00, 5406.68 examples/s]
Map: 100% 1267/1267 [00:00<00:00, 5258.49 examples/s]
Loading weights: 100% 197/197 [00:00<00:00, 893.60it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]
[1mRobertaForSequenceClassification LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | [38;5;208mUNEXPECTED[0m | 
lm_head.dense.bias              | [38;5;208mUNEXPECTED[0m | 
lm_head.dense.weight            | [38;5;208mUNEXPECTED[0m | 
lm_head.bias                    | [38;5;208mUNEXPECTED[0m | 
lm_head.layer_norm.weight       | [38;5;208mUNEXPECTED[0m | 
lm_head.layer_norm.bias         | [38;5;208mUNEXPECTED[0m | 
classifier.out_proj.bias        | [31mMISSING[0m    | 
classifier.dense.weight         | [31mMISSING[0m    | 
classifier.dense.bias           | 

In [None]:
!pip install -U transformers


In [None]:
import torch
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn.metrics import classification_report, accuracy_score, f1_score

import sys
sys.path.append("/content/drive/MyDrive/credibility_roberta")

from src.preprocessing.dataset_loader import load_dataset


# =========================
# Load model and tokenizer
# =========================

# Directory holding the fine-tuned model and its tokenizer.
model_path = "/content/drive/MyDrive/credibility_roberta/models/credibility_model"

print("Loading model...")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Put the model in evaluation mode before running predictions.
model.eval()


# =========================
# Load test dataset
# =========================

print("Loading test dataset...")

test_dataset = load_dataset("/content/drive/MyDrive/credibility_roberta/data/raw/test.tsv")


# =========================
# Tokenize dataset
# =========================

def tokenize(example):
    """Tokenize the ``text`` field, padding/truncating to 256 tokens."""
    encoded = tokenizer(
        example["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Add the token columns to the test split (batched mapping).
test_dataset = test_dataset.map(tokenize, batched=True)


# =========================
# Run evaluation
# =========================

print("Running evaluation...")

# Number of examples scored per forward pass.
BATCH_SIZE = 32

# Build the inputs on whatever device the model already lives on.
device = model.device

predictions = []

true_labels = []

# Score the split in batches instead of one forward pass per example.
for start in range(0, len(test_dataset), BATCH_SIZE):

    # Slicing the dataset yields a dict that maps each column name to a list.
    batch = test_dataset[start:start + BATCH_SIZE]

    inputs = {
        "input_ids": torch.tensor(batch["input_ids"], device=device),
        "attention_mask": torch.tensor(batch["attention_mask"], device=device),
    }

    with torch.no_grad():
        logits = model(**inputs).logits

    # The predicted class of each row is the index of its largest logit.
    predictions.extend(torch.argmax(logits, dim=1).tolist())

    true_labels.extend(batch["label"])


# =========================
# Calculate metrics
# =========================

# Overall accuracy and support-weighted F1 across all classes.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average="weighted")

for banner_line in (
    "\n========================",
    "Evaluation Results",
    "========================\n",
):
    print(banner_line)

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"F1 Score: {f1:.4f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:\n")
print(classification_report(true_labels, predictions))


Loading model...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading test dataset...


Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Running evaluation...


In [None]:
!python /content/drive/MyDrive/credibility_roberta/src/evaluation/evaluate_roberta.py
