# Setup

In [38]:
# DATA LINKS

henry_imdb = "/content/drive/MyDrive/Courses/Sophomore Spring Courses/CS226R/IMDB Dataset Processed.csv"
henry_imdb_debiased = "/content/drive/MyDrive/Courses/Sophomore Spring Courses/CS226R/IMDB Dataset Processed (Debiased).csv"

joseph_imdb = "/content/drive/MyDrive/CS226R Final Project/IMDB Dataset Processed (1000).csv"
joseph_imdb_debiased = "/content/drive/MyDrive/CS226R Final Project/IMDB Dataset Processed (Debiased) (1000).csv"

# joseph_imdb_custom_debiased = '/content/drive/MyDrive/CS226R Final Project/IMDB Dataset Processed (Debiased, Custom, 1000).csv'
joseph_imdb_custom_debiased = '/content/drive/MyDrive/CS226R Final Project/IMDB Dataset Processed (Debiased, Custom, 1000, New).csv'

In [None]:
# Fairness Libraries
# https://fairlearn.org/v0.10/api_reference/generated/fairlearn.reductions.EqualizedOdds.html
# https://aif360.readthedocs.io/en/stable/modules/explainers.html

In [39]:
!pip install aif360
!pip install fairlearn
!pip install datasets



In [40]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from fairlearn.preprocessing import CorrelationRemover
from fairlearn.reductions import DemographicParity, EqualizedOdds
from aif360.datasets import BinaryLabelDataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install fairlearn

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [41]:
# Load datasets

# imdb = pd.read_csv(henry_imdb)
# imdb_debiased = pd.read_csv(henry_imdb_debiased)
imdb = pd.read_csv(joseph_imdb)
imdb_debiased = pd.read_csv(joseph_imdb_debiased)

imdb_custom_debiased = pd.read_csv(joseph_imdb_custom_debiased)

# Fairness Metrics

In [42]:
# https://fairware.cs.umass.edu/papers/Verma.pdf

from sklearn.metrics import confusion_matrix

def evaluate_metrics_for_specific_value(y_true, y_pred, protected_column = None, specific_value = None):
    if specific_value:
        # Use boolean array to directly index y_true and y_pred
        mask = protected_column == specific_value
        cm = confusion_matrix(y_true[mask], y_pred[mask])
    else:
        cm = confusion_matrix(y_true, y_pred)
    # Ensure the confusion matrix has dimensions [2, 2]
    if cm.shape != (2, 2):
        cm = np.pad(cm, ((2-cm.shape[0], 0), (2-cm.shape[1], 0)), mode='constant')

    # Extract TP, FP, FN, TN for the specific value
    TP = cm[1, 1] if len(cm) > 1 else 0
    FP = cm[0, 1] if len(cm) > 1 else 0
    FN = cm[1, 0] if len(cm) > 1 else 0
    TN = cm[0, 0] if len(cm) > 1 else 0

    # Compute metrics
    PPV = TP / (TP + FP) if TP + FP > 0 else 0  # Positive Predictive Value (Precision)
    FDR = FP / (TP + FP) if TP + FP > 0 else 0  # False Discovery Rate
    FOR = FN / (TN + FN) if TN + FN > 0 else 0  # False Omission Rate
    NPV = TN / (TN + FN) if TN + FN > 0 else 0  # Negative Predictive Value
    TPR = TP / (TP + FN) if TP + FN > 0 else 0  # True Positive Rate (Recall/Sensitivity)
    FPR = FP / (FP + TN) if FP + TN > 0 else 0  # False Positive Rate
    FNR = FN / (TP + FN) if TP + FN > 0 else 0  # False Negative Rate
    TNR = TN / (FP + TN) if FP + TN > 0 else 0  # True Negative Rate

    if specific_value:
        metrics_dict = {
            f'True Positive Rate ({specific_value})': TPR,
            f'False Positive Rate ({specific_value})': FPR,
            f'False Negative Rate ({specific_value})': FNR,
            f'True Negative Rate ({specific_value})': TNR,
            f'Positive Predictive Value (Precision) ({specific_value})': PPV,
            f'False Discovery Rate ({specific_value})': FDR,
            f'False Omission Rate ({specific_value})': FOR,
            f'Negative Predictive Value ({specific_value})': NPV
        }
    else:
        metrics_dict = {
            'True Positive Rate': TPR,
            'False Positive Rate': FPR,
            'False Negative Rate': FNR,
            'True Negative Rate': TNR,
            'Positive Predictive Value (Precision)': PPV,
            'False Discovery Rate': FDR,
            'False Omission Rate': FOR,
            'Negative Predictive Value': NPV
        }

    return metrics_dict

# Establish classifier baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.mixture import GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
import pickle

df = imdb
# Check the unique values of the sentiment column
print("Unique values in 'sentiment':", df['sentiment'].unique())
label_mapping = {'negative': 0, 'positive': 1}
df['sentiment'] = df['sentiment'].map(label_mapping)
print("Unique values in 'sentiment':", df['sentiment'].unique())
class ClassifierComparison:
    def __init__(self, df, feature_columns, target_column, protected_column, test_size=0.2, random_state=42):
        self.df = df
        self.feature_columns = feature_columns
        self.target_column = target_column
        self.protected_column = protected_column
        self.test_size = test_size
        self.random_state = random_state

        self.classifiers = {
          "Logistic Regression": LogisticRegression(),
          "Decision Tree": DecisionTreeClassifier(),
          "Support Vector Machine": SVC(),
          "AdaBoost": AdaBoostClassifier(),
          "Linear SVC": LinearSVC(),
          "Gaussian Mixture": GaussianMixture(),
          "Linear Discriminant Analysis": LinearDiscriminantAnalysis()
      }


        self.X = df[feature_columns]
        self.y = df[target_column]

        indices = np.arange(self.X.shape[0])

        self.X_train, self.X_test, self.y_train, self.y_test, self.idx_train, self.idx_test = train_test_split(
    self.X, self.y, indices, test_size=self.test_size, random_state=self.random_state)

        self.results = {}

    def train_and_evaluate(self):
        for name, clf in self.classifiers.items():
            clf.fit(self.X_train, self.y_train)
            y_pred = clf.predict(self.X_test)

            accuracy = accuracy_score(self.y_test, y_pred)

            # Get protected column values only for the test indices
            protected_test_values = self.df[self.protected_column].iloc[self.idx_test]

            # Fairness metrics
            dp_diff = demographic_parity_difference(self.y_test, y_pred, sensitive_features=protected_test_values)
            eo_diff = equalized_odds_difference(self.y_test, y_pred, sensitive_features=protected_test_values)

            male_metrics = evaluate_metrics_for_specific_value(self.y_test, y_pred, protected_test_values, 'male')
            female_metrics = evaluate_metrics_for_specific_value(self.y_test, y_pred, protected_test_values, 'female')
            general_metrics = evaluate_metrics_for_specific_value(self.y_test, y_pred, protected_test_values, None)

            self.results[name] = {
                'Accuracy': accuracy,
                'Demographic Parity Difference': dp_diff,
                'Equalized Odds Difference': eo_diff,
                **general_metrics,
                **female_metrics,
                **male_metrics,
            }

    def display_results(self):
        for name, metrics_dict in self.results.items():
            print(f"{name}: Accuracy - {metrics_dict['Accuracy']:.4f}, Demographic Parity Difference - {metrics_dict['Demographic Parity Difference']:.4f}, Equalized Odds Difference - {metrics_dict['Equalized Odds Difference']:.4f}")
            for metric_name, metric_value in metrics_dict.items():
                if metric_name not in ['Accuracy', 'Demographic Parity Difference', 'Equalized Odds Difference']:
                    print(f"    {metric_name}: {metric_value:.4f}")
            print()

    def save_results(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump(self.results, file)

Unique values in 'sentiment': ['positive' 'negative']
Unique values in 'sentiment': [1 0]


# Regular

In [None]:
from sklearn.preprocessing import LabelEncoder
df = imdb
feature_columns = [col for col in df.columns if col.startswith('sentence_embedding_')]
target_column = 'sentiment'
protected_column = 'gender'
test_size = 0.2
random_state = 42

# Instantiate the ClassifierComparison object
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Encode the sentiment column to numeric values
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
print("Unique values in 'sentiment' after encoding:", df['sentiment'].unique())
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Train and evaluate classifiers
cc.train_and_evaluate()

# Display results
cc.display_results()

Unique values in 'sentiment' after encoding: [1 0]
Logistic Regression: Accuracy - 0.7650, Demographic Parity Difference - 0.2880, Equalized Odds Difference - 0.3652
    True Positive Rate: 0.9174
    False Positive Rate: 0.4684
    False Negative Rate: 0.0826
    True Negative Rate: 0.5316
    Positive Predictive Value (Precision): 0.7500
    False Discovery Rate: 0.2500
    False Omission Rate: 0.1923
    Negative Predictive Value: 0.8077
    True Positive Rate (female): 0.9310
    False Positive Rate (female): 0.6579
    False Negative Rate (female): 0.0690
    True Negative Rate (female): 0.3421
    Positive Predictive Value (Precision) (female): 0.7642
    False Discovery Rate (female): 0.2358
    False Omission Rate (female): 0.3158
    Negative Predictive Value (female): 0.6842
    True Positive Rate (male): 0.8824
    False Positive Rate (male): 0.2927
    False Negative Rate (male): 0.1176
    True Negative Rate (male): 0.7073
    Positive Predictive Value (Precision) (male): 



# Debiased

In [None]:
from sklearn.preprocessing import LabelEncoder
df = imdb_debiased
feature_columns = [col for col in df.columns if col.startswith('sentence_embedding_')]
target_column = 'sentiment'
protected_column = 'gender'
test_size = 0.2
random_state = 42

# Instantiate the ClassifierComparison object
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Encode the sentiment column to numeric values
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
print("Unique values in 'sentiment' after encoding:", df['sentiment'].unique())
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Train and evaluate classifiers
cc.train_and_evaluate()

# Display results
cc.display_results()

Unique values in 'sentiment' after encoding: [1 0]
Logistic Regression: Accuracy - 0.6050, Demographic Parity Difference - 0.0267, Equalized Odds Difference - 0.0294
    True Positive Rate: 0.9917
    False Positive Rate: 0.9873
    False Negative Rate: 0.0083
    True Negative Rate: 0.0127
    Positive Predictive Value (Precision): 0.6061
    False Discovery Rate: 0.3939
    False Omission Rate: 0.5000
    Negative Predictive Value: 0.5000
    True Positive Rate (female): 1.0000
    False Positive Rate (female): 1.0000
    False Negative Rate (female): 0.0000
    True Negative Rate (female): 0.0000
    Positive Predictive Value (Precision) (female): 0.6960
    False Discovery Rate (female): 0.3040
    False Omission Rate (female): 0.0000
    Negative Predictive Value (female): 0.0000
    True Positive Rate (male): 0.9706
    False Positive Rate (male): 0.9756
    False Negative Rate (male): 0.0294
    True Negative Rate (male): 0.0244
    Positive Predictive Value (Precision) (male): 



# Results with Custom Algorithm

In [None]:
from sklearn.preprocessing import LabelEncoder
df = imdb_custom_debiased
feature_columns = [col for col in df.columns if col.startswith('sentence_embedding_')]
target_column = 'sentiment'
protected_column = 'gender'
test_size = 0.2
random_state = 42

# Instantiate the ClassifierComparison object
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Encode the sentiment column to numeric values
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
print("Unique values in 'sentiment' after encoding:", df['sentiment'].unique())
cc = ClassifierComparison(df, feature_columns, target_column, protected_column, test_size, random_state)

# Train and evaluate classifiers
cc.train_and_evaluate()

# Display results
cc.display_results()

Unique values in 'sentiment' after encoding: [1 0]
Logistic Regression: Accuracy - 0.6050, Demographic Parity Difference - 0.0267, Equalized Odds Difference - 0.0294
    True Positive Rate: 0.9917
    False Positive Rate: 0.9873
    False Negative Rate: 0.0083
    True Negative Rate: 0.0127
    Positive Predictive Value (Precision): 0.6061
    False Discovery Rate: 0.3939
    False Omission Rate: 0.5000
    Negative Predictive Value: 0.5000
    True Positive Rate (female): 1.0000
    False Positive Rate (female): 1.0000
    False Negative Rate (female): 0.0000
    True Negative Rate (female): 0.0000
    Positive Predictive Value (Precision) (female): 0.6960
    False Discovery Rate (female): 0.3040
    False Omission Rate (female): 0.0000
    Negative Predictive Value (female): 0.0000
    True Positive Rate (male): 0.9706
    False Positive Rate (male): 0.9756
    False Negative Rate (male): 0.0294
    True Negative Rate (male): 0.0244
    Positive Predictive Value (Precision) (male): 



# Additional Code for Future Investigation

In [43]:
 # Using Pre-trained Word Embeddings with a Feedforward Neural Network
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from keras.initializers import Constant

# Load GloVe embeddings
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

# Prepare the tokenizer and the embedding matrix
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

num_words = len(word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define model
model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=max_length,
                    trainable=False))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'

In [None]:
import pandas as pd
from collections import Counter

# Sample DataFrame
data = {
    'text': ["The quick brown fox.", "The lazy dog jumps.", "The quick brown fox.", "The lazy dog jumps."],
    'label': [0, 1, 0, 1]
}
df = pd.DataFrame(data)

def adjust_class_proportion(df):
    # Count the occurrences of each class label
    class_counts = Counter(df['label'])

    # Find the minimum class count
    min_count = min(class_counts.values())

    # Create an empty DataFrame to store adjusted samples
    adjusted_df = pd.DataFrame(columns=df.columns)

    # Iterate over each class label
    for label, count in class_counts.items():
        # Sample the dataframe to have the same number of samples as the minimum class count
        class_df = df[df['label'] == label].sample(min_count, replace=False)

        # Append the sampled dataframe to the adjusted_df
        adjusted_df = pd.concat([adjusted_df, class_df], ignore_index=True)

    return adjusted_df

# Adjust the class proportion
adjusted_df = adjust_class_proportion(df)

print("Original DataFrame:")
print(df)
print("\nAdjusted DataFrame:")
print(adjusted_df)


Original DataFrame:
                   text  label
0  The quick brown fox.      0
1   The lazy dog jumps.      1
2  The quick brown fox.      0
3   The lazy dog jumps.      1

Adjusted DataFrame:
                   text label
0  The quick brown fox.     0
1  The quick brown fox.     0
2   The lazy dog jumps.     1
3   The lazy dog jumps.     1


# Downstream Debiasing Implementation

In [None]:
# Version 1
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from fairlearn.metrics import MetricFrame, selection_rate, equalized_odds_difference

# Function to measure biases in sentiment predictions
def bias_metrics(y_true, y_pred, sensitive_features):
    mf = MetricFrame(metrics={
        'accuracy': accuracy_score,
        'selection_rate': selection_rate,
        'equalized_odds_difference': lambda y_true, y_pred: equalized_odds_difference(y_true, y_pred, sensitive_feature=sensitive_features)
    },
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sensitive_features)
    return mf.by_group

# Load the IMDB dataset
dataset = load_dataset("imdb")

# Assume sensitive attribute 'gender' is available in the dataset
sensitive_features = dataset['train']['gender']

# Tokenization using BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the data into train and test
train_dataset, test_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

# Load BERT for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()


# Evaluate the model using bias metrics
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
bias_results = bias_metrics(predictions.label_ids, preds, sensitive_features)


print(bias_results)

# Calculate accuracy and F1 score
accuracy = accuracy_score(predictions.label_ids, preds)
f1 = f1_score(predictions.label_ids, preds)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Version 9001
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from fairlearn.metrics import MetricFrame, selection_rate, equalized_odds_difference

# Function to measure biases in sentiment predictions
def bias_metrics(y_true, y_pred, sensitive_features):
    mf = MetricFrame(metrics={
        'accuracy': accuracy_score,
        'selection_rate': selection_rate,
        'equalized_odds_difference': lambda y_true, y_pred: equalized_odds_difference(y_true, y_pred, sensitive_feature=sensitive_features)
    },
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sensitive_features)
    return mf.by_group

# Load the IMDB dataset
dataset = load_dataset("imdb")

# Tokenization using BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Assuming a function to debias embeddings (pseudocode) - MUST IMPLEMENT
def debias_embeddings(model):
    # Pseudocode: Modify model embeddings based on debiasing algorithm
    pass

# Define model loading function to allow switching between original and debiased
def load_model(debias=False):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    if debias:
        debias_embeddings(model)
    return model

# Load models
original_model = load_model(debias=False)
debiased_model = load_model(debias=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainers for both models
trainer_original = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

trainer_debiased = Trainer(
    model=debiased_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Train both models
trainer_original.train()
trainer_debiased.train()

# Evaluate both models using bias metrics
predictions_original = trainer_original.predict(tokenized_datasets["test"])
predictions_debiased = trainer_debiased.predict(tokenized_datasets["test"])

preds_original = np.argmax(predictions_original.predictions, axis=-1)
preds_debiased = np.argmax(predictions_debiased.predictions, axis=-1)

# Assume sensitive attribute 'gender' is available in the dataset
sensitive_features = dataset['test']['gender']

bias_results_original = bias_metrics(predictions_original.label_ids, preds_original, sensitive_features)
bias_results_debiased = bias_metrics(predictions_debiased.label_ids, preds_debiased, sensitive_features)

print("Original Model Bias and Accuracy:")
print(bias_results_original)

print("Debiased Model Bias and Accuracy:")
print(bias_results_debiased)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
