# In [None]:
import pandas as pd
import numpy as np
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import logging
import os
import time
from torch.utils.data import Dataset
 
# ==== SETUP LOGGING ====
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
 
# ==== CUDA/GPU CONFIGURATION ====
def configure_cuda():
    """Log the CUDA environment and report whether a GPU can be used.

    Returns:
        bool: True when ``torch.cuda.is_available()`` is true (device count,
        name of device 0 and CUDA version are logged at INFO level); False
        otherwise (an error is logged instead).
    """
    gpu_ready = torch.cuda.is_available()
    if gpu_ready:
        logger.info(f"CUDA Available: {torch.cuda.is_available()}")
        logger.info(f"GPU Count: {torch.cuda.device_count()}")
        logger.info(f"GPU Name: {torch.cuda.get_device_name(0)}")
        logger.info(f"CUDA Version: {torch.version.cuda}")
    else:
        logger.error("CUDA not available! Exiting.")
    return gpu_ready
 
# ==== DATA PROCESSING ====
def process_biased_words(x):
    """Normalise one ``biased_words`` cell into a plain space-separated string.

    Args:
        x: Raw cell value. Strings that start with ``[`` are treated as the
            repr of a Python list (e.g. ``"['a', 'b']"``).

    Returns:
        str: ``''`` for non-string input or for a bracketed string that cannot
        be parsed as a literal; the list items joined by single spaces when
        parsing succeeds; otherwise ``x`` unchanged.
    """
    if not isinstance(x, str):
        return ''
    if not x.startswith('['):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval raises on malformed input are treated
        # as "no biased words"; KeyboardInterrupt/SystemExit still propagate.
        return ''
    if isinstance(parsed, list):
        return ' '.join(str(item) for item in parsed)
    return str(parsed)
 
# ==== CUSTOM DATASET ====
class BiasDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both flattened
    to 1-D) and ``labels`` (a ``torch.long`` scalar tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Stored as given; tokenization is deferred to __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        # Pad/truncate every sample to exactly max_len tokens.
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer output is flattened so each sample is a 1-D tensor.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample
 
# ==== PYTORCH BERT PIPELINE ====
def run_pytorch_bert_pipeline(
    csv_path,
    output_path="bias_classifier_model",
    sample_size=None,
    max_len=96,
    batch_size=16,
    epochs=3,
    validation_split=0.1
):
    """Fine-tune ``bert-base-uncased`` as a classifier on a CSV and save it.

    The model input is the concatenation of the ``text``, ``topic``,
    ``article`` and ``biased_words`` columns (missing columns count as empty
    strings); the target is the ``type`` column, encoded with a
    ``LabelEncoder``.

    Args:
        csv_path: Path of the CSV file to train on.
        output_path: Directory that receives the model, the tokenizer and
            ``label_encoder.pkl``.
        sample_size: If truthy, only this many rows are read from the CSV.
        max_len: Token length every sample is padded/truncated to.
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.
        validation_split: Fraction of rows held out for validation
            (stratified by label).

    Returns:
        dict | None: ``None`` when CUDA is unavailable or the ``type`` column
        is missing; otherwise a dict with keys ``model``, ``tokenizer``,
        ``label_encoder`` and ``accuracy`` (validation accuracy).
    """
    start_time = time.time()
    logger.info(f"Starting PyTorch BERT pipeline at {time.strftime('%H:%M:%S')}")

    # Configure CUDA (fp16 training below needs a GPU, so bail out early)
    if not configure_cuda():
        logger.error("CUDA setup failed. Exiting.")
        return None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Load data
    logger.info(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path, nrows=sample_size) if sample_size else pd.read_csv(csv_path)
    logger.info(f"Loaded {len(df)} rows")

    # Process data
    logger.info("Processing data")
    if 'biased_words' in df.columns:
        df['biased_words'] = df['biased_words'].fillna('').apply(process_biased_words)
    else:
        df['biased_words'] = ''
    for col in ['text', 'topic', 'article']:
        if col in df.columns:
            df[col] = df[col].fillna('').astype(str)
        else:
            # df.get(col, '') would return the plain str default here, and a
            # str has no .fillna; create the empty column explicitly instead.
            df[col] = ''
    df['full_input'] = df['text'] + " " + df['topic'] + " " + df['article'] + " " + df['biased_words']

    # Process labels
    if 'type' not in df.columns:
        logger.error("Column 'type' not found")
        return None
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['type'])
    num_classes = len(label_encoder.classes_)
    logger.info(f"Classes: {label_encoder.classes_}")

    # Split data
    logger.info("Splitting data")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['full_input'].tolist(), y,
        test_size=validation_split, stratify=y, random_state=42
    )

    # Initialize tokenizer
    logger.info("Loading BERT tokenizer")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Create datasets
    logger.info("Creating datasets")
    train_dataset = BiasDataset(train_texts, train_labels, tokenizer, max_len)
    val_dataset = BiasDataset(val_texts, val_labels, tokenizer, max_len)

    # Initialize model
    logger.info("Loading BERT model")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_classes
    ).to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./checkpoints",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # Warm up over roughly half of the first epoch's optimizer steps
        warmup_steps=len(train_labels) // batch_size // 2,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=True,  # Enable mixed precision for GPU
        report_to="none"
    )

    # Define compute metrics
    def compute_metrics(eval_pred):
        """Return plain accuracy from (logits, labels)."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = (predictions == labels).mean()
        return {"accuracy": accuracy}

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train
    logger.info("Starting training")
    trainer.train()

    # Evaluate
    logger.info("Evaluating model")
    eval_results = trainer.evaluate()
    logger.info(f"Validation Accuracy: {eval_results['eval_accuracy']}")

    # Classification report
    logger.info("Generating classification report")
    predictions = trainer.predict(val_dataset)
    pred_classes = np.argmax(predictions.predictions, axis=-1)
    logger.info("\nClassification Report:")
    # Explicit labels keep the report aligned with target_names even when a
    # class is absent from the validation split; names are stringified because
    # the encoder's classes are not guaranteed to be strings.
    print(classification_report(
        val_labels,
        pred_classes,
        labels=list(range(num_classes)),
        target_names=[str(cls) for cls in label_encoder.classes_]
    ))

    # Save model
    os.makedirs(output_path, exist_ok=True)
    logger.info(f"Saving model to {output_path}")
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    import pickle
    # The encoder is saved next to the model so class indices can be mapped
    # back to their original names at inference time.
    with open(os.path.join(output_path, "label_encoder.pkl"), "wb") as f:
        pickle.dump(label_encoder, f)

    training_time = time.time() - start_time
    # Whole hours/minutes; float floor-division would print "0.0h 12.0m".
    hours, remainder = divmod(int(training_time), 3600)
    logger.info(f"Completed in {hours}h {remainder // 60}m")
    return {
        'model': model,
        'tokenizer': tokenizer,
        'label_encoder': label_encoder,
        'accuracy': eval_results['eval_accuracy']
    }
 
# ==== ENTRY POINT ====
if __name__ == "__main__":
    # Verify CUDA up front so a missing GPU is reported before any data is read.
    if not configure_cuda():
        logger.error("CUDA not available. Exiting.")
        # SystemExit instead of exit(): exit() is a helper injected by the
        # `site` module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    # Run pipeline
    result = run_pytorch_bert_pipeline(csv_path="balanced_data.csv")
    if result is None:
        # The pipeline returns None on failure; do not report success.
        logger.error("Pipeline did not complete.")
        raise SystemExit(1)
    logger.info("Done!")

# In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
 
 
# Load the fine-tuned classifier and its tokenizer from the local export directory.
model_path = "bias_classifier_model"
# local_files_only on both loads: the tokenizer and the model are resolved the
# same way, from local files only, with no download attempt.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
# top_k=None returns a score for every class; it replaces the deprecated
# return_all_scores=True flag.
test_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None)
 
# Hand-written opinion/news passages used as smoke-test inputs for the
# classifier. Each entry is a one-element list holding a single article string.
samples = [
["""As wildfires rage across California, floods displace thousands in the Midwest, and heatwaves scorch cities from Texas to New York, the evidence is undeniable: the climate crisis is no longer a distant threat—it’s here. And yet, as communities suffer and ecosystems collapse, fossil fuel corporations continue to post record-breaking profits, protected by conservative politicians and a global system rigged in their favor.
In 2024 alone, the five largest oil companies reported over $200 billion in profits. Instead of investing in renewable energy or helping vulnerable communities transition to a green economy, these corporations funneled billions into stock buybacks and executive bonuses. Their message is clear: profits come before people, and the planet can burn so long as the shareholders stay rich.
Even more alarming is the political shielding they receive from right-wing lawmakers, many of whom deny climate science altogether. Republican leaders in Congress have repeatedly blocked climate legislation, gutted the Environmental Protection Agency’s regulatory powers, and prioritized drilling permits over clean air and water.
Meanwhile, climate activists—many of them youth, Indigenous leaders, and marginalized communities—continue to face police repression, surveillance, and criminalization. Peaceful protesters at pipeline sites are arrested, while oil spills and environmental destruction go unpunished.
We need a Green New Deal-level transformation: bold investments in wind, solar, and green infrastructure; the creation of millions of unionized green jobs; and climate reparations for communities hit hardest by pollution and environmental racism.
The time for delay is over. The time to act is now."""
],
["""The United States thrives when government steps back and lets free enterprise lead. In recent years, however, progressive lawmakers have increasingly pushed for regulation, redistribution, and intervention that stifles innovation and discourages hard work.
From overreaching environmental mandates to government-controlled healthcare proposals, the left continues to champion policies that prioritize bureaucracy over results. These moves are not only anti-business—they’re anti-American.
America's economic engine runs best when the private sector is free to create, compete, and grow. Small business owners across the country are already struggling with inflation and labor shortages—problems worsened by excessive government interference and rising taxes.
We must return to policies that reward productivity, protect property rights, and uphold free-market values. Deregulation, tax reform, and energy independence will not only restore our economy—they’ll renew our national spirit.
"""],
["""As artificial intelligence tools become increasingly integrated into everyday life—from health diagnostics to criminal justice systems—Democratic and Republican lawmakers alike are recognizing the need for clear regulatory frameworks.
A bipartisan group in Congress recently introduced the American AI Responsibility Act, aiming to address transparency, data privacy, and algorithmic bias. While the bill doesn’t go as far as some activists demand, it marks an important step toward balancing innovation with accountability.
Tech CEOs have expressed cautious support, stating that some regulation is needed to maintain public trust, but they warn against overregulation that could drive development offshore.
Experts agree: regulation must be careful, measured, and informed by the science—not by political theater. While divisions remain, the shared concern over AI’s risks may offer a rare opportunity for consensus in Washington.
"""],
["""In yet another blow to working-class Americans, Senate Republicans have blocked legislation that would raise the federal minimum wage to $17 per hour by 2027. With wages stagnant and inflation hitting food, rent, and transportation costs, the move is being widely condemned by labor leaders and economists.
The current $7.25 minimum wage has not been raised since 2009, despite historic gains in productivity and corporate profits. Over 60% of Americans support a raise, but Republican lawmakers claim it would “hurt small businesses”—an argument that many economists say is overblown.
In reality, the refusal to raise wages preserves exploitative systems where billion-dollar corporations rely on underpaid workers while CEO salaries skyrocket.
This is not just about economics—it’s about dignity. Every American who works full-time should be able to afford basic necessities. Congress’s failure to act is a moral failure, and it’s up to voters to hold them accountable."""],
 ["""The southern border has long been a flashpoint in American politics, but recent data shows that tougher enforcement and advanced surveillance technology are yielding results. Illegal crossings dropped 30% in the first quarter of 2025 compared to the previous year, according to Homeland Security reports.
Under the new measures, authorities have deployed AI-powered drones, reinforced border fencing, and accelerated asylum screening procedures. Critics on the left say the policies are “inhumane,” but officials argue they are necessary to protect national sovereignty and public safety.
Drug seizures have also increased, particularly fentanyl shipments originating from cartels that exploit weak border points. Law enforcement agencies say the new tools and funding are making a significant impact.
The Biden administration was slow to act early in its term, but this policy shift marks a necessary correction. The right to immigrate must be balanced with the rule of law—and American citizens deserve to feel safe and secure in their own country.
"""]
]
 
# Build the class-index -> name table once, outside the loop.
# LabelEncoder numbers classes in sorted order, so a hand-written table can
# silently mislabel predictions; prefer the encoder pickled with the model and
# only fall back to the fixed table when that file is absent.
import os
import pickle

label_encoder_path = os.path.join(model_path, "label_encoder.pkl")
if os.path.exists(label_encoder_path):
    # NOTE: unpickling executes code; only load an encoder file you produced yourself.
    with open(label_encoder_path, "rb") as f:
        label_map = {i: str(cls) for i, cls in enumerate(pickle.load(f).classes_)}
else:
    label_map = {0: "left", 1: "center", 2: "right"}

for text in samples:
    # `text` is a one-element list, so output[0] holds the per-class scores
    # for that single article.
    output = test_pipeline(text, truncation=True, max_length=512)
    top_label = max(output[0], key=lambda x: x["score"])

    # Extract the numeric part of the label like 'LABEL_1' -> 1
    label_index = int(top_label['label'].split('_')[-1])
    # Fall back to the raw label if the index is not in the table.
    readable_label = label_map.get(label_index, top_label['label'])

    print(f"\nText: {text}")
    print(f"Predicted Bias: {readable_label} \n confidence score:({top_label['score']:.4f})")