<a href="https://colab.research.google.com/github/MWestberg3/bias-checker/blob/issue-6/bias_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [None]:
import pandas as pd
import kagglehub as kh
import os

# Fetch the "news-bias" dataset through kagglehub. The returned value is
# treated as a local directory by the os.listdir(path) call further down.
path = kh.dataset_download("subhankarpanda56/news-bias")

print("Path to dataset: ", path)

In [None]:
# Find the CSV file within the downloaded directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# choice of "first CSV" reproducible between runs and machines.
csv_file_path = next(
    (
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if filename.endswith(".csv")
    ),
    None,
)
if csv_file_path is None:
    # Fail with a clear message instead of a NameError on the read below.
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")

# Only the first 300 rows are kept for the rest of the notebook.
bias_raw_df = pd.read_csv(csv_file_path).iloc[:300]
print(bias_raw_df.head())

In [None]:
# Row count of the loaded frame (at most 300 because of the .iloc[:300] slice)
len(bias_raw_df)

In [None]:
# Class balance: count the non-null values per bias label and draw the
# 'ID' column of those counts as a pie chart with whole-percent labels.
rows_per_label = bias_raw_df.groupby('bias_text').count()
rows_per_label.plot(kind='pie', y='ID', autopct='%1.0f%%')

In [None]:
from sklearn.model_selection import train_test_split

# Features are the article texts, targets are the bias labels.
article_texts = bias_raw_df['content_original']
bias_labels = bias_raw_df['bias_text']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    article_texts,
    bias_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the first training article.
# train_test_split keeps the original row labels (shuffled), so X_train[0]
# would look up the row *labelled* 0 and raise KeyError whenever that row
# landed in the test set. .iloc[0] is the positional first element, which
# also matches X_token_train[0] used in the later cells.
X_train.iloc[0]

# NLTK Pre-processing

In [None]:
import nltk
# Tokenizer resources required by word_tokenize
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize

# Tokenization: every article is split into tokens and cut to its first 100
X_token_train = [word_tokenize(article)[:100] for article in X_train]
X_token_test = [word_tokenize(article)[:100] for article in X_test]

In [None]:
# Inspect the tokens of the first training article
print(X_token_train[0])

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Lemmatization: each token is passed through WordNetLemmatizer.lemmatize
# with its default arguments.
lemmatizer = WordNetLemmatizer()
lemma_X_train = [list(map(lemmatizer.lemmatize, tokens)) for tokens in X_token_train]
lemma_X_test = [list(map(lemmatizer.lemmatize, tokens)) for tokens in X_token_test]
print(lemma_X_train[0])

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# Remove stopwords.
# Build the stop-word collection once: calling stopwords.words('english')
# inside the comprehension re-created the list and scanned it linearly for
# every single token. A frozenset gives the same membership answers in O(1).
english_stopwords = frozenset(stopwords.words('english'))
clear_X_train = [[word for word in line if word not in english_stopwords] for line in lemma_X_train]
clear_X_test = [[word for word in line if word not in english_stopwords] for line in lemma_X_test]
print(clear_X_train[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The documents are token lists, so the vectorizer's preprocessor joins each
# list back into a single space-separated string.
join_tokens = ' '.join
vectorizer = TfidfVectorizer(preprocessor=join_tokens)

# Fit on the training documents and transform them in one step
X_train_tfidf = vectorizer.fit_transform(clear_X_train)
# The test documents are only transformed with the already-fitted vectorizer
X_test_tfidf = vectorizer.transform(clear_X_test)

print(X_train_tfidf.shape)

In [None]:
# Inspect the TF-IDF row of the first training document
print(X_train_tfidf[0])

In [None]:
import gensim

# Build a gensim Dictionary from the cleaned training tokens only;
# the test tokens are not part of it.
D = gensim.corpora.Dictionary(clear_X_train)
print(D)

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# Index layout of the one-hot axis:
#   0 .. len(D)-1  -> tokens known to the dictionary
#   len(D)         -> unknown token (only possible in the test set)
#   len(D)+2       -> padding
unknown_index = len(D)
padding_index = len(D) + 2
num_indices = len(D) + 3

train_indices = [[D.token2id[t] for t in line] for line in clear_X_train]
# Look tokens up in token2id directly. `t in D` goes through the Mapping
# interface of gensim's Dictionary, which is keyed by integer id, so a token
# string was never "in D" and every test token became the unknown index.
test_indices = [[D.token2id.get(t, unknown_index) for t in line] for line in clear_X_test]

# dtype is forced to long so a document left empty after stop-word removal
# does not produce a float tensor (one_hot only accepts integer indices).
train_tensor = pad_sequence([torch.tensor(line, dtype=torch.long) for line in train_indices],
                            batch_first=True,
                            padding_value=padding_index)
test_tensor = pad_sequence([torch.tensor(line, dtype=torch.long) for line in test_indices],
                           batch_first=True,
                           padding_value=padding_index)

# Pass num_classes explicitly: when it is inferred, the width depends on the
# largest index present in each tensor, so train and test could end up with
# different feature sizes and the model could not be applied to the test set.
X_train_seq = nn.functional.one_hot(train_tensor, num_classes=num_indices)
X_test_seq = nn.functional.one_hot(test_tensor, num_classes=num_indices)

print(X_train_seq.shape)
print(X_test_seq.shape)

# Baseline Models with sklearn

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Baseline 1: ridge classifier with default settings, fitted on the TF-IDF
# training matrix and the string bias labels.
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train_tfidf, y_train)

In [None]:
# Predict bias labels for the TF-IDF test matrix
ridge_classifier_prediction = ridge_classifier.predict(X_test_tfidf)

In [None]:
# Score the ridge baseline on the held-out labels: plain accuracy and the
# support-weighted F1.
ridge_classifier_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=ridge_classifier_prediction,
)
ridge_classifier_f1 = f1_score(
    y_true=y_test,
    y_pred=ridge_classifier_prediction,
    average='weighted',
)

print(f"Ridge Classifier Accuracy: {ridge_classifier_accuracy}")
print(f"Ridge Classifier F1 Score: {ridge_classifier_f1}")

In [None]:
# Baseline 2: dummy classifier using the 'most_frequent' strategy,
# fitted on the same training data as the ridge classifier.
dummy_classifer = DummyClassifier(strategy='most_frequent')
dummy_classifer.fit(X_train_tfidf, y_train)

In [None]:
# Predict with the dummy baseline and score it exactly like the ridge model
dummy_classifer_prediction = dummy_classifer.predict(X_test_tfidf)

dummy_classifer_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=dummy_classifer_prediction,
)
dummy_classifer_f1 = f1_score(
    y_true=y_test,
    y_pred=dummy_classifer_prediction,
    average='weighted',
)

print(f"Dummy Classifier Accuracy: {dummy_classifer_accuracy}")
print(f"Dummy Classifier F1 Score: {dummy_classifer_f1}")

# Torch RNN Models

In [None]:
# # Simple tensor operations
# import torch

# # Putting data into a tensor
# X_train_tensor = torch.tensor(X_train_tfidf.toarray()).float()
# X_test_tensor = torch.tensor(X_test_tfidf.toarray()).float()

In [None]:
import torch.nn as nn

# Using super to use the net like a layer (inherit nn.Module)
class RNNNet(nn.Module):
  """Sequence classifier: one LSTM layer followed by a linear output layer.

  Args:
    input_size: Number of features per time step fed to the LSTM.
    hidden_size: Size of the LSTM hidden state (128 by default).
    num_of_classes: Number of logits produced by the output layer.
  """

  def __init__(self, input_size, hidden_size=128, num_of_classes=3):
    super().__init__()
    self.hidden_size = hidden_size
    # Recurrent part; batch_first=True means inputs are (batch, seq, feature)
    self.rnn_layer = nn.LSTM(input_size, hidden_size, batch_first=True)
    # Maps the final hidden state to one logit per class
    self.output_layer = nn.Linear(hidden_size, num_of_classes)

  def forward(self, x):
    """Return raw class logits for x (no softmax is applied)."""
    _outputs, final_states = self.rnn_layer(x)
    final_hidden = final_states[0]  # (hidden state, cell state) -> hidden state
    # final_hidden[-1] is the last layer's hidden state after the final step
    return self.output_layer(final_hidden[-1])

In [None]:
lr = 0.01 # Learning rate
epochs = 70 # Training epochs

input_size = X_train_seq.shape[-1] # Extract the last dimension

# Use the first GPU when one is available and fall back to the CPU otherwise,
# instead of crashing on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = RNNNet(input_size).to(device)

# CrossEntropyLoss takes raw logits, which is what RNNNet.forward returns
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


In [None]:
import numpy as np

# Integer class index for every bias label, in the order left/center/right
custom_mapping = {
    label: index for index, label in enumerate(('left', 'center', 'right'))
}

# Translate the string labels of both splits through the mapping
y_train_mapped = [custom_mapping[bias] for bias in y_train]
y_test_mapped = [custom_mapping[bias] for bias in y_test]

# Keep numpy copies of the encoded labels
y_train_encoded = np.asarray(y_train_mapped)
y_test_encoded = np.asarray(y_test_mapped)

# Show the mapping that was used
print(custom_mapping)  # {'left': 0, 'center': 1, 'right': 2}


In [None]:
# Putting labels in tensors
y_train_tensor = torch.LongTensor(y_train_encoded)
y_test_tensor = torch.LongTensor(y_test_encoded)

# Train on whichever device the model's parameters already live on, rather
# than assuming a GPU is present.
device = next(model.parameters()).device

# The inputs and targets never change between epochs, so convert and move
# them once instead of re-creating the large float tensor on every iteration.
train_inputs = X_train_seq.float().to(device)
train_targets = y_train_tensor.to(device)

model.train()

# Repeat the learning process for the number of epochs (full-batch training)
for e in range(epochs):
    # Forward pass
    predictions = model(train_inputs)
    # Calculate the loss. No squeeze: the logits already have one row per
    # sample, and squeezing would drop the batch axis for a batch of one.
    loss = loss_func(predictions, train_targets)

    # Model clean gradient
    optimizer.zero_grad()
    # Model backward pass to get the gradient
    loss.backward()
    # Model updating weights
    optimizer.step()

    # Print the training process
    if e % 20 == 0:
        print("Epoch:", e, "loss:", loss.item())

In [None]:
from sklearn import metrics

# Evaluate on whichever device the model's parameters live on
device = next(model.parameters()).device

# Inference only: switch to eval mode and skip gradient tracking so no
# autograd graph is built for the test batch.
model.eval()
with torch.no_grad():
    # Forward pass on test set
    y_pred = model(X_test_seq.float().to(device))

# Convert model output to predicted class indices
y_pred_classes = torch.argmax(y_pred, dim=1)

# Convert ground truth to numpy
y_test_numpy = y_test_tensor.cpu().numpy()

# Calculate accuracy
test_accuracy = metrics.accuracy_score(y_test_numpy, y_pred_classes.cpu().numpy())
print("Test Accuracy:", test_accuracy)

