In [None]:
# --- Cell 1: Imports and Configuration ---
import tensorflow as tf
import polars as pl
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import os

# --- Configuration ---
# Tunable values for the whole run, grouped by purpose.

# Data locations
INPUT_FILE = 'preprocessed_data.parquet'
OUTPUT_DIR = 'bias_classification_model'  # destination for the saved model and tokenizer

# Label space
EXPECTED_LABELS = {0, 1, 2, 3}  # the only label ids the validation step accepts
NUM_LABELS = len(EXPECTED_LABELS)  # derived so it cannot drift from EXPECTED_LABELS

# Pre-trained checkpoint and tokenization
MODEL_NAME = 'distilbert-base-uncased'
MAX_LENGTH = 512  # token length every text is padded/truncated to

# Training hyper-parameters
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Train/validation split
TEST_SPLIT_SIZE = 0.2
RANDOM_STATE = 42

print("Libraries imported and configuration set.")

Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------------ --------------------- 0.8/1.7 MB 1.7 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.9 MB/s eta 0:00:01
   ------------------------------------ --- 1.6/1.7 MB 1.9 MB/s eta 0:00:01
   ------------------------------------ --- 1.6/1.7 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 1.3 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# --- Cell 2: GPU Setup ---
# Report the GPUs TensorFlow can see and, when there are any, switch each one to
# on-demand memory growth.
physical_devices = tf.config.list_physical_devices('GPU')
print("Available GPUs: ", physical_devices)

if not physical_devices:
    print("No GPU detected. Using CPU.")
else:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # TensorFlow refuses to change memory growth once the GPUs are initialized.
        print(f"RuntimeError configuring GPU: {err}")
        print("Ensure this cell is run before any TensorFlow operations that initialize the GPU.")
    else:
        print(f"Configured memory growth for {len(physical_devices)} GPU(s).")

No GPU detected. Using CPU.


In [None]:
# --- Cell 3: Load Data ---
# Reads the preprocessed parquet file into a Polars DataFrame (`df`) and shows its
# schema and first rows. `pl` and INPUT_FILE come from Cell 1, so nothing is
# re-imported or redefined here.

print(f"Loading data from {INPUT_FILE}...")
try:
    df = pl.read_parquet(INPUT_FILE)
    print(f"Data loaded successfully. Initial shape: {df.shape}")

    # Display schema and first few rows
    print("\nData Schema:")
    print(df.schema)

    print("\nFirst 5 rows:")
    print(df.head())

except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_FILE}")
    raise
except Exception as e:
    print(f"Error loading or processing data: {e}")
    raise  # Re-raise so the notebook stops instead of continuing without data

# Fail fast with a clear message: every later cell reads these two columns.
missing_columns = sorted({"clean_content", "label"} - set(df.columns))
if missing_columns:
    raise ValueError(
        f"Input file {INPUT_FILE} is missing required column(s): {missing_columns}"
    )

shape: (5, 9)
┌────────────┬──────────┬────────────┬───────┬───┬────────────┬────────────┬───────────┬───────────┐
│ content    ┆ outlet   ┆ original_i ┆ label ┆ … ┆ tokens     ┆ tokens_no_ ┆ tokens_le ┆ tokens_fi │
│ ---        ┆ ---      ┆ ndex       ┆ ---   ┆   ┆ ---        ┆ stop       ┆ mmatized  ┆ ltered    │
│ str        ┆ str      ┆ ---        ┆ i64   ┆   ┆ list[str]  ┆ ---        ┆ ---       ┆ ---       │
│            ┆          ┆ i64        ┆       ┆   ┆            ┆ list[str]  ┆ list[str] ┆ list[str] │
╞════════════╪══════════╪════════════╪═══════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ FBI        ┆ BBC News ┆ 800        ┆ 2     ┆ … ┆ ["fbi",    ┆ ["fbi",    ┆ ["fbi",   ┆ ["fbi",   │
│ arrests    ┆          ┆            ┆       ┆   ┆ "arrests", ┆ "arrests", ┆ "arrest", ┆ "arrest", │
│ so-called  ┆          ┆            ┆       ┆   ┆ …          ┆ …          ┆ … "capito ┆ … "capito │
│ sedition…  ┆          ┆            ┆       ┆   ┆ "capitol"… ┆ "capitol"… ┆ 

In [None]:
# Per-row tokenization pass (unnumbered cell).
# NOTE(review): Cells 4 and 5 below rebuild `pandas_df`, `tokenizer`, `input_ids`,
# `attention_masks` and `labels` from the cleaned data, so this cell looks redundant;
# consider deleting it once nothing depends on `tokenize_function` / `tokenized_data`.

# Convert to Pandas for easier integration with TensorFlow, keeping only rows that
# can actually be tokenized and trained on (the tokenizer rejects null text).
pandas_df = df.to_pandas()
has_text = pandas_df['clean_content'].notna() & (pandas_df['clean_content'] != "")
pandas_df = pandas_df[has_text & pandas_df['label'].notna()].reset_index(drop=True)

# Initialize DistilBERT tokenizer from the checkpoint named in the configuration cell
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(text):
    """Tokenize one text, padded/truncated to MAX_LENGTH, as TensorFlow tensors.

    Args:
        text: The string to tokenize.

    Returns:
        The tokenizer output for `text`; with `return_tensors='tf'` each field
        carries a leading batch dimension of size 1.
    """
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='tf',
    )


# Tokenize the clean_content column, one row at a time
tokenized_data = [tokenize_function(text) for text in pandas_df['clean_content']]

# Extract input_ids and attention_masks ([0] drops the per-row batch dimension)
input_ids = tf.stack([data['input_ids'][0] for data in tokenized_data])
attention_masks = tf.stack([data['attention_mask'][0] for data in tokenized_data])
# Cast explicitly: a label column that contained nulls arrives from Pandas as float.
labels = tf.convert_to_tensor(pandas_df['label'].to_numpy(dtype='int64'), dtype=tf.int64)

In [None]:
# --- Cell 4: Data Cleaning and Validation ---
# Drops rows without usable text or without a label, checks that only expected
# label ids remain, and hands a Pandas copy to the following cells.

print("Starting data cleaning and validation...")

content_col = pl.col("clean_content")
label_col = pl.col("label")

# Report how many rows are unusable before anything is removed.
missing_content = df.filter(content_col.is_null() | (content_col == "")).height
missing_label = df.filter(label_col.is_null()).height
print(f"Rows with missing/empty 'clean_content': {missing_content}")
print(f"Rows with missing 'label': {missing_label}")

# Keep only rows that have non-empty text and a label.
initial_rows = df.height
df = df.filter(
    content_col.is_not_null()
    & (content_col != "")
    & label_col.is_not_null()
)
print(f"Removed {initial_rows - df.height} rows with missing values.")
print(f"Shape after cleaning: {df.shape}")

if df.height == 0:
    raise ValueError("Error: No valid data remaining after filtering.")

# Every label left in the data must be one of EXPECTED_LABELS.
unique_labels_in_data = set(df.get_column("label").unique().to_list())
print(f"Unique labels found in data: {unique_labels_in_data}")
unexpected_labels = unique_labels_in_data - EXPECTED_LABELS
if unexpected_labels:
    raise ValueError(f"Labels must only contain values within {EXPECTED_LABELS}. Found: {unique_labels_in_data}")
print("Labels are within the expected set.")

# The tokenization cell reads the text and labels from this Pandas frame.
pandas_df = df.to_pandas()
print("Converted Polars DataFrame to Pandas DataFrame.")

In [None]:
# --- Cell 5: Tokenization ---
# Tokenizes every cleaned text in a single batched call and exposes the
# resulting tensors (`input_ids`, `attention_masks`, `labels`) to later cells.

print("Initializing tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

print(f"Tokenizing 'clean_content' (max_length={MAX_LENGTH})... This may take some time.")

# The tokenizer is given a plain Python list of strings for batch processing.
texts = pandas_df['clean_content'].tolist()
tokenized_encodings = tokenizer(
    texts,
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
    return_tensors='tf',  # TensorFlow tensors rather than Python lists
)

# Pull out the model inputs and build the matching int64 label tensor.
input_ids = tokenized_encodings['input_ids']
attention_masks = tokenized_encodings['attention_mask']
labels = tf.convert_to_tensor(pandas_df['label'].values, dtype=tf.int64)

print("Tokenization complete.")
for title, tensor in (
    ("Input IDs shape:", input_ids),
    ("Attention Masks shape:", attention_masks),
    ("Labels shape:", labels),
):
    print(title, tensor.shape)

# If memory is tight, `del pandas_df` here once it is no longer needed.

In [None]:
# --- Cell 6: Data Splitting ---
# Produces stratified train/validation splits of the token ids, attention masks
# and labels.

print("Splitting data into training and validation sets...")

# train_test_split is given NumPy arrays, so convert the tensors first.
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()
labels_np = labels.numpy()

# Results come back as (train, validation) pairs in the order the arrays are passed.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids_np,
    attention_masks_np,
    labels_np,
    test_size=TEST_SPLIT_SIZE,
    random_state=RANDOM_STATE,
    stratify=labels_np,  # keep the label distribution the same in both splits
)

print("Data split complete.")
for heading, (ids, masks, targets) in (
    ("\n--- Training Data Shapes ---", (train_inputs, train_masks, train_labels)),
    ("\n--- Validation Data Shapes ---", (val_inputs, val_masks, val_labels)),
):
    print(heading)
    print(f"  Input IDs:      {ids.shape}")
    print(f"  Attention Masks:{masks.shape}")
    print(f"  Labels:         {targets.shape}")

# Optional: np.unique(train_labels, return_counts=True) (and the same for
# val_labels) shows the label distribution of each split.
# If memory is tight, `del input_ids_np, attention_masks_np, labels_np` here.

In [None]:
# --- Cell 7: Calculate Class Weights ---
# Builds `class_weight_dict` (label id -> weight) for model.fit from the
# *training* labels only, so nothing about the validation split leaks into training.

print("Calculating class weights based on training data...")
unique_train_labels, counts = np.unique(train_labels, return_counts=True)
print(f"Training label distribution: {dict(zip(unique_train_labels, counts))}")

all_possible_classes = np.array(sorted(EXPECTED_LABELS))

# With class_weight='balanced', compute_class_weight raises a ValueError when
# `classes` contains a label that never occurs in `y`. Ask it only about the labels
# that are present in the training split and fill in the absent ones afterwards.
present_weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_train_labels,
    y=train_labels
)
weight_by_label = {
    int(label): float(weight)
    for label, weight in zip(unique_train_labels, present_weights)
}

absent_classes = [int(c) for c in all_possible_classes if int(c) not in weight_by_label]
if absent_classes:
    print(f"Warning: no training samples for label(s) {absent_classes}; using neutral weight 1.0.")

# Every expected label keeps an entry (plain int -> float). An absent label has no
# samples to weight, so the neutral 1.0 never influences the loss.
class_weight_dict = {
    int(c): weight_by_label.get(int(c), 1.0) for c in all_possible_classes
}
# Same weights as an array aligned with `all_possible_classes`.
class_weights = np.array([class_weight_dict[int(c)] for c in all_possible_classes])

print(f"Calculated class weights: {class_weight_dict}")

In [None]:
# --- Cell 9: Model Training ---
# Builds, compiles and fine-tunes the classifier. No earlier cell in this notebook
# creates `model`, so it is constructed here; that keeps "Restart Kernel -> Run All"
# working and makes re-running this cell start from the pre-trained checkpoint again.

# Use the Keras implementation the Hugging Face TF model is built on.
# NOTE(review): transformers is expected to prefer the `tf_keras` package when it is
# installed (this notebook installs it) and otherwise the Keras bundled with
# TensorFlow; confirm against the installed transformers version.
try:
    import tf_keras as keras
except ImportError:
    keras = tf.keras

# Seed TensorFlow so weight initialization and shuffling are repeatable across runs.
tf.random.set_seed(RANDOM_STATE)

print(f"Loading pre-trained model '{MODEL_NAME}' with {NUM_LABELS} labels...")
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    # The model outputs raw logits, so the loss must apply the softmax itself.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
)

print(f"Starting model training for {EPOCHS} epochs...")
print(f"Batch size: {BATCH_SIZE}")
print(f"Using class weights: {class_weight_dict}")

# Prepare datasets in the format expected by model.fit
# Input is a list holding the input tensors: token ids first, then attention masks
train_data_inputs = [train_inputs, train_masks]
val_data_inputs = [val_inputs, val_masks]

history = model.fit(
    train_data_inputs,       # Model inputs
    train_labels,            # Target labels
    validation_data=(val_data_inputs, val_labels), # Validation data tuple
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight_dict # Apply class weights to handle imbalance
)

print("Training finished.")

# Optional: Plot training history
# import matplotlib.pyplot as plt
# pd.DataFrame(history.history).plot(figsize=(8, 5))
# plt.grid(True)
# plt.gca().set_ylim(0, 1) # set the y-axis range to [0,1]
# plt.title("Model Training History")
# plt.show()

In [None]:
# --- Cell 10: Model Evaluation ---
# Scores the trained model on the held-out validation split and prints every
# metric that comes back.

print("Evaluating model on the validation set...")

# Same input layout as training: token ids first, then attention masks.
val_data_inputs = [val_inputs, val_masks]

eval_results = model.evaluate(
    x=val_data_inputs,
    y=val_labels,
    batch_size=BATCH_SIZE,  # same batch size as training
    return_dict=True,       # metric name -> value instead of a bare list
)

print("\n--- Evaluation Results ---")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name.capitalize()}: {metric_value:.4f}")

# Individual metrics can be read by name, e.g. eval_results['loss'].

In [None]:
# --- Cell 11: Save Model and Tokenizer ---
# Writes the trained model and its tokenizer side by side into OUTPUT_DIR.

print(f"Saving model and tokenizer to directory: {OUTPUT_DIR}...")

# Create the target directory on first use and say so.
output_dir_missing = not os.path.exists(OUTPUT_DIR)
if output_dir_missing:
    os.makedirs(OUTPUT_DIR)
    print(f"Created directory: {OUTPUT_DIR}")

# Model first, then tokenizer; both are saved with save_pretrained into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print("Model and tokenizer saved successfully.")
print(f"Files saved in: {os.path.abspath(OUTPUT_DIR)}")