In [None]:
!pip install transformers torch pandas scikit-learn



In [None]:
import io

from google.colab import files
uploaded = files.upload()

import pandas as pd

# files.upload() returns a mapping of file name -> file content (bytes).
# Fail early with a clear message if the upload dialog was cancelled.
if not uploaded:
    raise ValueError('No file was uploaded; please select the dataset CSV.')

# Load the dataset straight from the uploaded bytes instead of a hard-coded
# file name: when a file with the same name already exists, Colab saves the new
# upload under a different name (e.g. "... (2).csv"), so reading the fixed name
# from disk could silently pick up a stale copy.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
data = pd.read_csv(io.BytesIO(uploaded_bytes))
texts = data['text'].tolist()  # Adjust the column name as necessary
bias_scores = data['bias_score'].tolist()  # Adjust the column name as necessary

print(data.head())  # Display the first few rows of the dataset

Saving dataset-2 - dataset-2.csv to dataset-2 - dataset-2 (2).csv
                                                text  bias_score
0  NYPD Commissioner Dermot Shea on Monday expres...         0.0
1  School systems across the country are adopting...         9.5
2  And then along came President Barry Obama, who...         9.5
3  The curfews, which have never before occurred ...         6.0
4  Rather than help be a part of the solution, Tr...         8.5


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for evaluation; the fixed random_state keeps the
# partition identical from run to run.
split_parts = train_test_split(
    texts,
    bias_scores,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f'Training samples: {len(X_train)}, Testing samples: {len(X_test)}')

Training samples: 3139, Testing samples: 785


In [None]:
# `torch` itself must be imported here: BiasDataset.__getitem__ in this cell
# calls torch.tensor(...), and `from torch.utils.data import ...` does not bind
# the name `torch`. Without this import the cell only works if a later cell has
# already been executed.
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class BiasDataset(Dataset):
    """Map-style dataset pairing raw texts with float regression targets.

    Each item is tokenized on access and returned as a dict with the keys
    'input_ids', 'attention_mask' and 'labels'.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of numeric targets, same length as `texts`.
        tokenizer: Optional callable used to encode a text. When omitted, the
            module-level `tokenizer` is looked up at item-access time, which
            is the original behaviour.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # `tokenizer` on the right-hand side is the module-level name; it is
        # only resolved when no tokenizer was passed to the constructor.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=self.max_length,
        )
        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten() drops it so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Shape (1,) so a batch of labels lines up with the model's
            # single-output logits.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float).unsqueeze(0)  # For regression
        }

# Wrap each split in a BiasDataset
train_dataset, test_dataset = (
    BiasDataset(split_texts, split_labels)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

# Batch both splits; only the training data is shuffled, the test data keeps
# its original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)



In [None]:
import torch
from transformers import BertForSequenceClassification

# Seed PyTorch so the newly initialised regression head starts from the same
# weights on every run (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(42)

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # 1 for regression

# Check if a GPU is available and move the model to the GPU if possible.
# This is done before the optimizer is built so the optimizer is created
# against parameters that already live on the training device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces `transformers.AdamW`, which is deprecated and no
# longer importable from recent transformers releases.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are intended to mirror the
# defaults of the old transformers implementation - confirm if exact parity
# with earlier runs matters.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Keep the model as the cell's final expression so its architecture is shown.
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
model.train()
for epoch in range(10):  # You can increase the number of epochs
    # Accumulate a sample-weighted loss so the value printed per epoch reflects
    # the whole epoch, not just whichever (possibly tiny) batch came last.
    running_loss = 0.0
    samples_seen = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing `labels` makes the model return the loss too
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # The batch loss is treated as a per-batch mean, so weight it by the
        # batch size before averaging over the epoch.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    if samples_seen == 0:
        raise ValueError('train_loader produced no batches; nothing to train on.')

    print(f'Epoch {epoch + 1}, Loss: {running_loss / samples_seen}')

Epoch 1, Loss: 1.5780552625656128
Epoch 2, Loss: 6.654885292053223
Epoch 3, Loss: 0.24797450006008148
Epoch 4, Loss: 0.15849140286445618
Epoch 5, Loss: 2.2460474967956543
Epoch 6, Loss: 0.958620548248291
Epoch 7, Loss: 0.09033676236867905
Epoch 8, Loss: 0.3190920352935791
Epoch 9, Loss: 0.3995860517024994
Epoch 10, Loss: 0.04128565266728401


In [None]:
from sklearn.metrics import mean_squared_error

# Switch to inference mode and collect one prediction per test example, in
# loader order.
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Logits come back with one column per example; flattening always
        # yields a 1-D tensor, so tolist() gives a list even for a single-item
        # batch.
        predictions += outputs.logits.reshape(-1).tolist()

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 7.2360425647508855


In [None]:
# Write the fine-tuned model into the 'bias_score_model' directory.
model.save_pretrained('bias_score_model')
# Save the tokenizer into the same directory. This call is the cell's final
# expression, so its return value (the written file paths) is displayed.
tokenizer.save_pretrained('bias_score_model')

('bias_score_model/tokenizer_config.json',
 'bias_score_model/special_tokens_map.json',
 'bias_score_model/vocab.txt',
 'bias_score_model/added_tokens.json')

In [None]:
# Function to predict bias score for a custom input
def predict_bias_score(input_text):
    """Return the model's predicted bias score for one piece of text.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        input_text: The text to score.

    Returns:
        The single regression output of the model as a Python float.
    """
    # Encode the text, padded/truncated to 128 tokens as in training
    encoded = tokenizer(
        input_text,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )

    # Put the tensors on the same device as the model
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    # Inference only: eval mode and no gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Collapse the one-element logits tensor to a plain number
    return logits.squeeze().item()

# Example usage: score one sentence and report it with two decimals
custom_input = (
    "In an apparent attempt to blame sexism for the blowback the Rashida Tlaib received for her Impeach the Motherf*cker remark, Pelosi wondered aloud what the response would have been if a man made the same"
)  # Replace with your input
predicted_score = predict_bias_score(custom_input)
print(f'Predicted Bias Score: {predicted_score:.2f} out of 10')

Predicted Bias Score: 9.54 out of 10
